#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
+#include "blk.h"
#include "cfq.h"
/*
#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
-#define RQ_CIC(rq) \
- ((struct cfq_io_context *) (rq)->elevator_private[0])
-#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1])
-#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2])
+#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
+#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
static struct kmem_cache *cfq_pool;
-static struct kmem_cache *cfq_ioc_pool;
-
-static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
-static struct completion *ioc_gone;
-static DEFINE_SPINLOCK(ioc_gone_lock);
-
-static DEFINE_SPINLOCK(cic_index_lock);
-static DEFINE_IDA(cic_index_ida);
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
+struct cfq_ttime {
+ unsigned long last_end_request;
+
+ unsigned long ttime_total;
+ unsigned long ttime_samples;
+ unsigned long ttime_mean;
+};
+
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
unsigned count;
unsigned total_weight;
u64 min_vdisktime;
+ struct cfq_ttime ttime;
};
-#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
- .count = 0, .min_vdisktime = 0, }
+#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \
+ .ttime = {.last_end_request = jiffies,},}
/*
* Per process-grouping structure
unsigned long slice_end;
long slice_resid;
+ /* pending priority requests */
+ int prio_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
- unsigned short ioprio_class, org_ioprio_class;
+ unsigned short ioprio_class;
pid_t pid;
#endif
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
+ struct cfq_ttime ttime;
+};
+
+struct cfq_io_cq {
+ struct io_cq icq; /* must be the first member */
+ struct cfq_queue *cfqq[2];
+ struct cfq_ttime ttime;
};
/*
struct work_struct unplug_work;
struct cfq_queue *active_queue;
- struct cfq_io_context *active_cic;
+ struct cfq_io_cq *active_cic;
/*
* async queue for each priority case
unsigned int cfq_slice_idle;
unsigned int cfq_group_idle;
unsigned int cfq_latency;
-
- unsigned int cic_index;
- struct list_head cic_list;
+ unsigned int cfq_target_latency;
/*
* Fallback dummy cfqq for extreme OOM conditions
j++, st = i < IDLE_WORKLOAD ? \
&cfqg->service_trees[i][j]: NULL) \
+static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
+ struct cfq_ttime *ttime, bool group_idle)
+{
+ unsigned long slice;
+ if (!sample_valid(ttime->ttime_samples))
+ return false;
+ if (group_idle)
+ slice = cfqd->cfq_group_idle;
+ else
+ slice = cfqd->cfq_slice_idle;
+ return ttime->ttime_mean > slice;
+}
static inline bool iops_mode(struct cfq_data *cfqd)
{
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
-static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
- struct io_context *);
-static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
- bool is_sync)
+static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
- return cic->cfqq[is_sync];
+ /* cic->icq is the first member, %NULL will convert to %NULL */
+ return container_of(icq, struct cfq_io_cq, icq);
}
-static inline void cic_set_cfqq(struct cfq_io_context *cic,
- struct cfq_queue *cfqq, bool is_sync)
+static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
+ struct io_context *ioc)
{
- cic->cfqq[is_sync] = cfqq;
+ if (ioc)
+ return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
+ return NULL;
}
-#define CIC_DEAD_KEY 1ul
-#define CIC_DEAD_INDEX_SHIFT 1
-
-static inline void *cfqd_dead_key(struct cfq_data *cfqd)
+static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
- return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
+ return cic->cfqq[is_sync];
}
-static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
+static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
+ bool is_sync)
{
- struct cfq_data *cfqd = cic->key;
-
- if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
- return NULL;
+ cic->cfqq[is_sync] = cfqq;
+}
- return cfqd;
+static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
+{
+ return cic->icq.q->elevator->elevator_data;
}
/*
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
- return cfq_target_latency * cfqg->weight / st->total_weight;
+ return cfqd->cfq_target_latency * cfqg->weight / st->total_weight;
}
static inline unsigned
if (rq_is_sync(rq1) != rq_is_sync(rq2))
return rq_is_sync(rq1) ? rq1 : rq2;
+ if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
+ return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
+
s1 = blk_rq_pos(rq1);
s2 = blk_rq_pos(rq2);
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
+ cfqg->ttime.last_end_request = jiffies;
+
/*
* Take the initial reference that will be released on destroy
* This can be thought of a joint reference by cgroup and
hlist_del_init(&cfqg->cfqd_node);
+ BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
+ cfqd->nr_blkcg_linked_grps--;
+
/*
* Put the reference taken at the time of creation so that when all
* queues are gone, group can be destroyed.
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
struct task_struct *tsk = current;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
cic = cfq_cic_lookup(cfqd, tsk->io_context);
cfqq->cfqd->rq_queued--;
cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
rq_data_dir(rq), rq_is_sync(rq));
+ if (rq->cmd_flags & REQ_PRIO) {
+ WARN_ON(!cfqq->prio_pending);
+ cfqq->prio_pending--;
+ }
}
static int cfq_merge(struct request_queue *q, struct request **req,
struct request *next)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
+ struct cfq_data *cfqd = q->elevator->elevator_data;
+
/*
* reposition in fifo if next is older than rq
*/
cfq_remove_request(next);
cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg,
rq_data_dir(next), rq_is_sync(next));
+
+ cfqq = RQ_CFQQ(next);
+ /*
+ * all requests of this queue are merged to other queues, delete it
+ * from the service tree. If it's the active_queue,
+ * cfq_dispatch_requests() will choose to expire it or do idle
+ */
+ if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
+ cfqq != cfqd->active_queue)
+ cfq_del_cfqq_rr(cfqd, cfqq);
}
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
/*
return false;
/*
- * Lookup the cfqq that this bio will be queued with. Allow
+ * Lookup the cfqq that this bio will be queued with and allow
* merge only if rq is queued there.
*/
cic = cfq_cic_lookup(cfqd, current->io_context);
cfqd->active_queue = NULL;
if (cfqd->active_cic) {
- put_io_context(cfqd->active_cic->ioc);
+ put_io_context(cfqd->active_cic->icq.ioc);
cfqd->active_cic = NULL;
}
}
* Otherwise, we do only if they are the last ones
* in their service tree.
*/
- if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
+ if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&
+ !cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))
return true;
cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
service_tree->count);
static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
struct cfq_queue *cfqq = cfqd->active_queue;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
unsigned long sl, group_idle = 0;
/*
* task has exited, don't wait
*/
cic = cfqd->active_cic;
- if (!cic || !atomic_read(&cic->ioc->nr_tasks))
+ if (!cic || !atomic_read(&cic->icq.ioc->nr_tasks))
return;
/*
* slice, then don't idle. This avoids overrunning the allotted
* time slice.
*/
- if (sample_valid(cic->ttime_samples) &&
- (cfqq->slice_end - jiffies < cic->ttime_mean)) {
+ if (sample_valid(cic->ttime.ttime_samples) &&
+ (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
- cic->ttime_mean);
+ cic->ttime.ttime_mean);
return;
}
* to have higher weight. A more accurate thing would be to
* calculate system wide asnc/sync ratio.
*/
- tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
+ tmp = cfqd->cfq_target_latency *
+ cfqg_busy_async_queues(cfqd, cfqg);
tmp = tmp/cfqd->busy_queues;
slice = min_t(unsigned, slice, tmp);
* this group, wait for requests to complete.
*/
check_group_idle:
- if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1
- && cfqq->cfqg->dispatched) {
+ if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
+ cfqq->cfqg->dispatched &&
+ !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
cfqq = NULL;
goto keep_queue;
}
cfq_dispatch_insert(cfqd->queue, rq);
if (!cfqd->active_cic) {
- struct cfq_io_context *cic = RQ_CIC(rq);
+ struct cfq_io_cq *cic = RQ_CIC(rq);
- atomic_long_inc(&cic->ioc->refcount);
+ atomic_long_inc(&cic->icq.ioc->refcount);
cfqd->active_cic = cic;
}
cfq_put_cfqg(cfqg);
}
-/*
- * Call func for each cic attached to this ioc.
- */
-static void
-call_for_each_cic(struct io_context *ioc,
- void (*func)(struct io_context *, struct cfq_io_context *))
-{
- struct cfq_io_context *cic;
- struct hlist_node *n;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
- func(ioc, cic);
-
- rcu_read_unlock();
-}
-
-static void cfq_cic_free_rcu(struct rcu_head *head)
-{
- struct cfq_io_context *cic;
-
- cic = container_of(head, struct cfq_io_context, rcu_head);
-
- kmem_cache_free(cfq_ioc_pool, cic);
- elv_ioc_count_dec(cfq_ioc_count);
-
- if (ioc_gone) {
- /*
- * CFQ scheduler is exiting, grab exit lock and check
- * the pending io context count. If it hits zero,
- * complete ioc_gone and set it back to NULL
- */
- spin_lock(&ioc_gone_lock);
- if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
- complete(ioc_gone);
- ioc_gone = NULL;
- }
- spin_unlock(&ioc_gone_lock);
- }
-}
-
-static void cfq_cic_free(struct cfq_io_context *cic)
-{
- call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
-}
-
-static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
-{
- unsigned long flags;
- unsigned long dead_key = (unsigned long) cic->key;
-
- BUG_ON(!(dead_key & CIC_DEAD_KEY));
-
- spin_lock_irqsave(&ioc->lock, flags);
- radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
- hlist_del_rcu(&cic->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- cfq_cic_free(cic);
-}
-
-/*
- * Must be called with rcu_read_lock() held or preemption otherwise disabled.
- * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
- * and ->trim() which is called with the task lock held
- */
-static void cfq_free_io_context(struct io_context *ioc)
-{
- /*
- * ioc->refcount is zero here, or we are called from elv_unregister(),
- * so no more cic's are allowed to be linked into this ioc. So it
- * should be ok to iterate over the known list, we will see all cic's
- * since no new ones are added.
- */
- call_for_each_cic(ioc, cic_free_func);
-}
-
static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
struct cfq_queue *__cfqq, *next;
cfq_put_queue(cfqq);
}
-static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
- struct cfq_io_context *cic)
+static void cfq_init_icq(struct io_cq *icq)
{
- struct io_context *ioc = cic->ioc;
+ struct cfq_io_cq *cic = icq_to_cic(icq);
- list_del_init(&cic->queue_list);
-
- /*
- * Make sure dead mark is seen for dead queues
- */
- smp_wmb();
- cic->key = cfqd_dead_key(cfqd);
+ cic->ttime.last_end_request = jiffies;
+}
- rcu_read_lock();
- if (rcu_dereference(ioc->ioc_data) == cic) {
- rcu_read_unlock();
- spin_lock(&ioc->lock);
- rcu_assign_pointer(ioc->ioc_data, NULL);
- spin_unlock(&ioc->lock);
- } else
- rcu_read_unlock();
+static void cfq_exit_icq(struct io_cq *icq)
+{
+ struct cfq_io_cq *cic = icq_to_cic(icq);
+ struct cfq_data *cfqd = cic_to_cfqd(cic);
if (cic->cfqq[BLK_RW_ASYNC]) {
cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
}
}
-static void cfq_exit_single_io_context(struct io_context *ioc,
- struct cfq_io_context *cic)
-{
- struct cfq_data *cfqd = cic_to_cfqd(cic);
-
- if (cfqd) {
- struct request_queue *q = cfqd->queue;
- unsigned long flags;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- /*
- * Ensure we get a fresh copy of the ->key to prevent
- * race between exiting task and queue
- */
- smp_read_barrier_depends();
- if (cic->key == cfqd)
- __cfq_exit_single_io_context(cfqd, cic);
-
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
-}
-
-/*
- * The process that ioc belongs to has exited, we need to clean up
- * and put the internal structures we have that belongs to that process.
- */
-static void cfq_exit_io_context(struct io_context *ioc)
-{
- call_for_each_cic(ioc, cfq_exit_single_io_context);
-}
-
-static struct cfq_io_context *
-cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
- struct cfq_io_context *cic;
-
- cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
- cfqd->queue->node);
- if (cic) {
- cic->last_end_request = jiffies;
- INIT_LIST_HEAD(&cic->queue_list);
- INIT_HLIST_NODE(&cic->cic_list);
- cic->dtor = cfq_free_io_context;
- cic->exit = cfq_exit_io_context;
- elv_ioc_count_inc(cfq_ioc_count);
- }
-
- return cic;
-}
-
static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
struct task_struct *tsk = current;
* elevate the priority of this queue
*/
cfqq->org_ioprio = cfqq->ioprio;
- cfqq->org_ioprio_class = cfqq->ioprio_class;
cfq_clear_cfqq_prio_changed(cfqq);
}
-static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_ioprio(struct cfq_io_cq *cic)
{
struct cfq_data *cfqd = cic_to_cfqd(cic);
struct cfq_queue *cfqq;
- unsigned long flags;
if (unlikely(!cfqd))
return;
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
cfqq = cic->cfqq[BLK_RW_ASYNC];
if (cfqq) {
struct cfq_queue *new_cfqq;
- new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
+ new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->icq.ioc,
GFP_ATOMIC);
if (new_cfqq) {
cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
cfqq = cic->cfqq[BLK_RW_SYNC];
if (cfqq)
cfq_mark_cfqq_prio_changed(cfqq);
-
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-}
-
-static void cfq_ioc_set_ioprio(struct io_context *ioc)
-{
- call_for_each_cic(ioc, changed_ioprio);
- ioc->ioprio_changed = 0;
}
static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
+static void changed_cgroup(struct cfq_io_cq *cic)
{
struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
struct cfq_data *cfqd = cic_to_cfqd(cic);
- unsigned long flags;
struct request_queue *q;
if (unlikely(!cfqd))
q = cfqd->queue;
- spin_lock_irqsave(q->queue_lock, flags);
-
if (sync_cfqq) {
/*
* Drop reference to sync queue. A new sync queue will be
cic_set_cfqq(cic, NULL, 1);
cfq_put_queue(sync_cfqq);
}
-
- spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
-static void cfq_ioc_set_cgroup(struct io_context *ioc)
-{
- call_for_each_cic(ioc, changed_cgroup);
- ioc->cgroup_changed = 0;
}
#endif /* CONFIG_CFQ_GROUP_IOSCHED */
struct io_context *ioc, gfp_t gfp_mask)
{
struct cfq_queue *cfqq, *new_cfqq = NULL;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_group *cfqg;
retry:
return cfqq;
}
-/*
- * We drop cfq io contexts lazily, so we may find a dead one.
- */
static void
-cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
- struct cfq_io_context *cic)
-{
- unsigned long flags;
-
- WARN_ON(!list_empty(&cic->queue_list));
- BUG_ON(cic->key != cfqd_dead_key(cfqd));
-
- spin_lock_irqsave(&ioc->lock, flags);
-
- BUG_ON(rcu_dereference_check(ioc->ioc_data,
- lockdep_is_held(&ioc->lock)) == cic);
-
- radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
- hlist_del_rcu(&cic->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- cfq_cic_free(cic);
-}
-
-static struct cfq_io_context *
-cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
+__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
{
- struct cfq_io_context *cic;
- unsigned long flags;
-
- if (unlikely(!ioc))
- return NULL;
-
- rcu_read_lock();
-
- /*
- * we maintain a last-hit cache, to avoid browsing over the tree
- */
- cic = rcu_dereference(ioc->ioc_data);
- if (cic && cic->key == cfqd) {
- rcu_read_unlock();
- return cic;
- }
-
- do {
- cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
- rcu_read_unlock();
- if (!cic)
- break;
- if (unlikely(cic->key != cfqd)) {
- cfq_drop_dead_cic(cfqd, ioc, cic);
- rcu_read_lock();
- continue;
- }
-
- spin_lock_irqsave(&ioc->lock, flags);
- rcu_assign_pointer(ioc->ioc_data, cic);
- spin_unlock_irqrestore(&ioc->lock, flags);
- break;
- } while (1);
+ unsigned long elapsed = jiffies - ttime->last_end_request;
+ elapsed = min(elapsed, 2UL * slice_idle);
- return cic;
+ ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
+ ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;
+ ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
}
-/*
- * Add cic into ioc, using cfqd as the search key. This enables us to lookup
- * the process specific cfq io context when entered from the block layer.
- * Also adds the cic to a per-cfqd list, used when this queue is removed.
- */
-static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
- struct cfq_io_context *cic, gfp_t gfp_mask)
+static void
+cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+ struct cfq_io_cq *cic)
{
- unsigned long flags;
- int ret;
-
- ret = radix_tree_preload(gfp_mask);
- if (!ret) {
- cic->ioc = ioc;
- cic->key = cfqd;
-
- spin_lock_irqsave(&ioc->lock, flags);
- ret = radix_tree_insert(&ioc->radix_root,
- cfqd->cic_index, cic);
- if (!ret)
- hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
- spin_unlock_irqrestore(&ioc->lock, flags);
-
- radix_tree_preload_end();
-
- if (!ret) {
- spin_lock_irqsave(cfqd->queue->queue_lock, flags);
- list_add(&cic->queue_list, &cfqd->cic_list);
- spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
- }
+ if (cfq_cfqq_sync(cfqq)) {
+ __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
+ __cfq_update_io_thinktime(&cfqq->service_tree->ttime,
+ cfqd->cfq_slice_idle);
}
-
- if (ret)
- printk(KERN_ERR "cfq: cic link failed!\n");
-
- return ret;
-}
-
-/*
- * Setup general io context and cfq io context. There can be several cfq
- * io contexts per general io context, if this process is doing io to more
- * than one device managed by cfq.
- */
-static struct cfq_io_context *
-cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
-{
- struct io_context *ioc = NULL;
- struct cfq_io_context *cic;
-
- might_sleep_if(gfp_mask & __GFP_WAIT);
-
- ioc = get_io_context(gfp_mask, cfqd->queue->node);
- if (!ioc)
- return NULL;
-
- cic = cfq_cic_lookup(cfqd, ioc);
- if (cic)
- goto out;
-
- cic = cfq_alloc_io_context(cfqd, gfp_mask);
- if (cic == NULL)
- goto err;
-
- if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
- goto err_free;
-
-out:
- smp_read_barrier_depends();
- if (unlikely(ioc->ioprio_changed))
- cfq_ioc_set_ioprio(ioc);
-
#ifdef CONFIG_CFQ_GROUP_IOSCHED
- if (unlikely(ioc->cgroup_changed))
- cfq_ioc_set_cgroup(ioc);
+ __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
#endif
- return cic;
-err_free:
- cfq_cic_free(cic);
-err:
- put_io_context(ioc);
- return NULL;
-}
-
-static void
-cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
-{
- unsigned long elapsed = jiffies - cic->last_end_request;
- unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
-
- cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
- cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
- cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
}
static void
*/
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
- struct cfq_io_context *cic)
+ struct cfq_io_cq *cic)
{
int old_idle, enable_idle;
if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
enable_idle = 0;
- else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
- (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
+ else if (!atomic_read(&cic->icq.ioc->nr_tasks) ||
+ !cfqd->cfq_slice_idle ||
+ (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
enable_idle = 0;
- else if (sample_valid(cic->ttime_samples)) {
- if (cic->ttime_mean > cfqd->cfq_slice_idle)
+ else if (sample_valid(cic->ttime.ttime_samples)) {
+ if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
enable_idle = 0;
else
enable_idle = 1;
return true;
/*
+ * So both queues are sync. Let the new request get disk time if
+ * it's a metadata request and the current queue is doing regular IO.
+ */
+ if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
+ return true;
+
+ /*
* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
*/
if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
*/
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- struct cfq_queue *old_cfqq = cfqd->active_queue;
+ enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
cfq_log_cfqq(cfqd, cfqq, "preempt");
cfq_slice_expired(cfqd, 1);
* workload type is changed, don't save slice, otherwise preempt
* doesn't happen
*/
- if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+ if (old_type != cfqq_type(cfqq))
cfqq->cfqg->saved_workload_slice = 0;
/*
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *rq)
{
- struct cfq_io_context *cic = RQ_CIC(rq);
+ struct cfq_io_cq *cic = RQ_CIC(rq);
cfqd->rq_queued++;
+ if (rq->cmd_flags & REQ_PRIO)
+ cfqq->prio_pending++;
- cfq_update_io_thinktime(cfqd, cic);
+ cfq_update_io_thinktime(cfqd, cfqq, cic);
cfq_update_io_seektime(cfqd, cfqq, rq);
cfq_update_idle_window(cfqd, cfqq, cic);
struct cfq_queue *cfqq = RQ_CFQQ(rq);
cfq_log_cfqq(cfqd, cfqq, "insert_request");
- cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
+ cfq_init_prio_data(cfqq, RQ_CIC(rq)->icq.ioc);
rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
list_add_tail(&rq->queuelist, &cfqq->fifo);
static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
- struct cfq_io_context *cic = cfqd->active_cic;
+ struct cfq_io_cq *cic = cfqd->active_cic;
/* If the queue already has requests, don't wait */
if (!RB_EMPTY_ROOT(&cfqq->sort_list))
if (cfqq->cfqg->nr_cfqq > 1)
return false;
+ /* the only queue in the group, but think time is big */
+ if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
+ return false;
+
if (cfq_slice_used(cfqq))
return true;
/* if slice left is less than think time, wait busy */
- if (cic && sample_valid(cic->ttime_samples)
- && (cfqq->slice_end - jiffies < cic->ttime_mean))
+ if (cic && sample_valid(cic->ttime.ttime_samples)
+ && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
return true;
/*
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
if (sync) {
- RQ_CIC(rq)->last_end_request = now;
+ struct cfq_rb_root *service_tree;
+
+ RQ_CIC(rq)->ttime.last_end_request = now;
+
+ if (cfq_cfqq_on_rr(cfqq))
+ service_tree = cfqq->service_tree;
+ else
+ service_tree = service_tree_for(cfqq->cfqg,
+ cfqq_prio(cfqq), cfqq_type(cfqq));
+ service_tree->ttime.last_end_request = now;
if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
cfqd->last_delayed_sync = now;
}
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ cfqq->cfqg->ttime.last_end_request = now;
+#endif
+
/*
* If this is the active queue, check if it needs to be expired,
* or if we want to idle in case it has no pending requests.
cfq_schedule_dispatch(cfqd);
}
-/*
- * we temporarily boost lower priority queues if they are holding fs exclusive
- * resources. they are boosted to normal prio (CLASS_BE/4)
- */
-static void cfq_prio_boost(struct cfq_queue *cfqq)
-{
- if (has_fs_excl()) {
- /*
- * boost idle prio on transactions that would lock out other
- * users of the filesystem
- */
- if (cfq_class_idle(cfqq))
- cfqq->ioprio_class = IOPRIO_CLASS_BE;
- if (cfqq->ioprio > IOPRIO_NORM)
- cfqq->ioprio = IOPRIO_NORM;
- } else {
- /*
- * unboost the queue (if needed)
- */
- cfqq->ioprio_class = cfqq->org_ioprio_class;
- cfqq->ioprio = cfqq->org_ioprio;
- }
-}
-
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct task_struct *tsk = current;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
/*
cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
if (cfqq) {
- cfq_init_prio_data(cfqq, cic->ioc);
- cfq_prio_boost(cfqq);
+ cfq_init_prio_data(cfqq, cic->icq.ioc);
return __cfq_may_queue(cfqq);
}
BUG_ON(!cfqq->allocated[rw]);
cfqq->allocated[rw]--;
- put_io_context(RQ_CIC(rq)->ioc);
-
- rq->elevator_private[0] = NULL;
- rq->elevator_private[1] = NULL;
-
/* Put down rq reference on cfqg */
cfq_put_cfqg(RQ_CFQG(rq));
- rq->elevator_private[2] = NULL;
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
cfq_put_queue(cfqq);
}
}
static struct cfq_queue *
-cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
+cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
struct cfq_queue *cfqq)
{
cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
* was the last process referring to said cfqq.
*/
static struct cfq_queue *
-split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
+split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
{
if (cfqq_process_refs(cfqq) == 1) {
cfqq->pid = current->pid;
cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- struct cfq_io_context *cic;
+ struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
const int rw = rq_data_dir(rq);
const bool is_sync = rq_is_sync(rq);
struct cfq_queue *cfqq;
- unsigned long flags;
+ unsigned int changed;
might_sleep_if(gfp_mask & __GFP_WAIT);
- cic = cfq_get_io_context(cfqd, gfp_mask);
-
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irq(q->queue_lock);
- if (!cic)
- goto queue_fail;
+ /* handle changed notifications */
+ changed = icq_get_changed(&cic->icq);
+ if (unlikely(changed & ICQ_IOPRIO_CHANGED))
+ changed_ioprio(cic);
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
+ if (unlikely(changed & ICQ_CGROUP_CHANGED))
+ changed_cgroup(cic);
+#endif
new_queue:
cfqq = cic_to_cfqq(cic, is_sync);
if (!cfqq || cfqq == &cfqd->oom_cfqq) {
- cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
+ cfqq = cfq_get_queue(cfqd, is_sync, cic->icq.ioc, gfp_mask);
cic_set_cfqq(cic, cfqq, is_sync);
} else {
/*
cfqq->allocated[rw]++;
cfqq->ref++;
- rq->elevator_private[0] = cic;
- rq->elevator_private[1] = cfqq;
- rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ rq->elv.priv[0] = cfqq;
+ rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
+ spin_unlock_irq(q->queue_lock);
return 0;
-
-queue_fail:
- cfq_schedule_dispatch(cfqd);
- spin_unlock_irqrestore(q->queue_lock, flags);
- cfq_log(cfqd, "set_request fail");
- return 1;
}
static void cfq_kick_queue(struct work_struct *work)
if (cfqd->active_queue)
__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
- while (!list_empty(&cfqd->cic_list)) {
- struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
- struct cfq_io_context,
- queue_list);
-
- __cfq_exit_single_io_context(cfqd, cic);
- }
-
cfq_put_async_queues(cfqd);
cfq_release_cfq_groups(cfqd);
cfq_shutdown_timer_wq(cfqd);
- spin_lock(&cic_index_lock);
- ida_remove(&cic_index_ida, cfqd->cic_index);
- spin_unlock(&cic_index_lock);
-
/*
* Wait for cfqg->blkg->key accessors to exit their grace periods.
* Do this wait only if there are other unlinked groups out
kfree(cfqd);
}
-static int cfq_alloc_cic_index(void)
-{
- int index, error;
-
- do {
- if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
- return -ENOMEM;
-
- spin_lock(&cic_index_lock);
- error = ida_get_new(&cic_index_ida, &index);
- spin_unlock(&cic_index_lock);
- if (error && error != -EAGAIN)
- return error;
- } while (error);
-
- return index;
-}
-
static void *cfq_init_queue(struct request_queue *q)
{
struct cfq_data *cfqd;
struct cfq_group *cfqg;
struct cfq_rb_root *st;
- i = cfq_alloc_cic_index();
- if (i < 0)
- return NULL;
-
cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!cfqd) {
- spin_lock(&cic_index_lock);
- ida_remove(&cic_index_ida, i);
- spin_unlock(&cic_index_lock);
+ if (!cfqd)
return NULL;
- }
-
- /*
- * Don't need take queue_lock in the routine, since we are
- * initializing the ioscheduler, and nobody is using cfqd
- */
- cfqd->cic_index = i;
/* Init root service tree */
cfqd->grp_service_tree = CFQ_RB_ROOT;
cfqd->oom_cfqq.ref++;
cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
- INIT_LIST_HEAD(&cfqd->cic_list);
-
cfqd->queue = q;
init_timer(&cfqd->idle_slice_timer);
cfqd->cfq_back_penalty = cfq_back_penalty;
cfqd->cfq_slice[0] = cfq_slice_async;
cfqd->cfq_slice[1] = cfq_slice_sync;
+ cfqd->cfq_target_latency = cfq_target_latency;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
cfqd->cfq_group_idle = cfq_group_idle;
return cfqd;
}
-static void cfq_slab_kill(void)
-{
- /*
- * Caller already ensured that pending RCU callbacks are completed,
- * so we should have no busy allocations at this point.
- */
- if (cfq_pool)
- kmem_cache_destroy(cfq_pool);
- if (cfq_ioc_pool)
- kmem_cache_destroy(cfq_ioc_pool);
-}
-
-static int __init cfq_slab_setup(void)
-{
- cfq_pool = KMEM_CACHE(cfq_queue, 0);
- if (!cfq_pool)
- goto fail;
-
- cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
- if (!cfq_ioc_pool)
- goto fail;
-
- return 0;
-fail:
- cfq_slab_kill();
- return -ENOMEM;
-}
-
/*
* sysfs parts below -->
*/
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
+SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
#undef SHOW_FUNCTION
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
+STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
#undef STORE_FUNCTION
#define CFQ_ATTR(name) \
CFQ_ATTR(slice_idle),
CFQ_ATTR(group_idle),
CFQ_ATTR(low_latency),
+ CFQ_ATTR(target_latency),
__ATTR_NULL
};
.elevator_completed_req_fn = cfq_completed_request,
.elevator_former_req_fn = elv_rb_former_request,
.elevator_latter_req_fn = elv_rb_latter_request,
+ .elevator_init_icq_fn = cfq_init_icq,
+ .elevator_exit_icq_fn = cfq_exit_icq,
.elevator_set_req_fn = cfq_set_request,
.elevator_put_req_fn = cfq_put_request,
.elevator_may_queue_fn = cfq_may_queue,
.elevator_init_fn = cfq_init_queue,
.elevator_exit_fn = cfq_exit_queue,
- .trim = cfq_free_io_context,
},
+ .icq_size = sizeof(struct cfq_io_cq),
+ .icq_align = __alignof__(struct cfq_io_cq),
.elevator_attrs = cfq_attrs,
- .elevator_name = "cfq",
+ .elevator_name = "cfq",
.elevator_owner = THIS_MODULE,
};
static int __init cfq_init(void)
{
+ int ret;
+
/*
* could be 0 on HZ < 1000 setups
*/
#else
cfq_group_idle = 0;
#endif
- if (cfq_slab_setup())
+ cfq_pool = KMEM_CACHE(cfq_queue, 0);
+ if (!cfq_pool)
return -ENOMEM;
- elv_register(&iosched_cfq);
+ ret = elv_register(&iosched_cfq);
+ if (ret) {
+ kmem_cache_destroy(cfq_pool);
+ return ret;
+ }
+
blkio_policy_register(&blkio_policy_cfq);
return 0;
static void __exit cfq_exit(void)
{
- DECLARE_COMPLETION_ONSTACK(all_gone);
blkio_policy_unregister(&blkio_policy_cfq);
elv_unregister(&iosched_cfq);
- ioc_gone = &all_gone;
- /* ioc_gone's update must be visible before reading ioc_count */
- smp_wmb();
-
- /*
- * this also protects us from entering cfq_slab_kill() with
- * pending RCU callbacks
- */
- if (elv_ioc_count_read(cfq_ioc_count))
- wait_for_completion(&all_gone);
- ida_destroy(&cic_index_ida);
- cfq_slab_kill();
+ kmem_cache_destroy(cfq_pool);
}
module_init(cfq_init);