author     Hannes Reinecke <hare@suse.de>
           Fri, 24 Oct 2008 12:49:18 +0000 (12:49 +0000)
committer  Hannes Reinecke <hare@suse.de>
           Fri, 24 Oct 2008 12:49:18 +0000 (12:49 +0000)

- Include patches from upstream:
  - patches.fixes/block-use-bio_has_data: Implement bio_has_data().
  - patches.fixes/block-git-fixes: Block layer fixes for 2.6.28.
  - patches.fixes/block-rq-affinity: Implement rq affinity.
  - patches.fixes/dm-mpath-remove-is_active: dm mpath: remove
    is_active from struct dm_path.
  - patches.fixes/block-discard-requests: Implement block discard.
  - patches.drivers/dm-abort-queue-on-failed-paths: dm: Call
    blk_abort_queue on failed paths (bnc#417544).
  - patches.drivers/block-call-sync-on-cleanup: block:
    blk_cleanup_queue() should call blk_sync_queue().
- Refactor and update request-based multipath patches:
  - patches.drivers/blk-request-based-multipath-update:
    Update request-based multipathing patches to upstream version
    (bnc#434105).
  - patches.suse/rq-based-multipath: Update to latest version
    of request-based multipathing patchset (bnc#434105)
  - patches.drivers/block-abort-request-rq-complete-marking:
    use rq complete marking in blk_abort_request (bnc#434105).
  - patches.fixes/scsi-atomic-blk-timer-deletes: Delete.
  - patches.fixes/dm-mpath-abort-queue: Delete.
  - patches.suse/rq-based-block-layer: Delete.
  - patches.suse/rq-based-dm-interface: Delete.
  - patches.suse/rq-based-multipath-functions: Delete.
  - patches.suse/rq-based-init-crash: Delete.
- Update patches to upstream version:
  - patches.drivers/bdev-resize-check-for-device-resize
  - patches.drivers/bdev-resize-added-flush_disk
  - patches.drivers/bdev-resize-call-flush_disk
  - patches.drivers/bdev-resize-sd-driver-calls
  - patches.drivers/block-timeout-handling
  - patches.drivers/bdev-resize-adjust-block-device-size
  - patches.drivers/bdev-resize-wrapper-for-revalidate_disk
  - patches.drivers/block-abort-queue
  - patches.fixes/scsi-enhance-error-codes
- Rediff patches:
  - patches.fixes/scsi-misc-git-update
  - patches.suse/dm-barrier-single-device
  - patches.suse/kdb-common
  - patches.drivers/lpfc-8.2.8-update
  - patches.drivers/lpfc-8.2.8.3-update
  - patches.drivers/mpt-fusion-4.00.43.00-update

suse-commit: f085bef2d8f278fa561752dfc20cc4eeae9c54e8

48 files changed:
Documentation/DocBook/kernel-api.tmpl
Documentation/block/deadline-iosched.txt
block/Makefile
block/as-iosched.c
block/blk-barrier.c
block/blk-core.c
block/blk-exec.c
block/blk-integrity.c
block/blk-map.c
block/blk-merge.c
block/blk-settings.c
block/blk-softirq.c [new file with mode: 0644]
block/blk-sysfs.c
block/blk-tag.c
block/blk.h
block/blktrace.c
block/cfq-iosched.c
block/compat_ioctl.c
block/deadline-iosched.c
block/elevator.c
block/genhd.c
block/ioctl.c
drivers/block/ps3disk.c
drivers/block/virtio_blk.c
drivers/md/dm-ioctl.c
drivers/md/dm-mpath.c
drivers/md/dm-mpath.h
drivers/md/dm-table.c
drivers/md/dm.c
drivers/md/dm.h
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/mtd/ftl.c
drivers/mtd/mtd_blkdevs.c
drivers/scsi/scsi_lib.c
drivers/scsi/scsi_priv.h
fs/bio.c
fs/fat/fatent.c
include/linux/bio.h
include/linux/blkdev.h
include/linux/blktrace_api.h
include/linux/device-mapper.h
include/linux/elevator.h
include/linux/fs.h
include/linux/mtd/blktrans.h
kdb/modules/kdbm_pg.c
mm/bounce.c

index b7b1482..f5696ba 100644
@@ -364,6 +364,10 @@ X!Edrivers/pnp/system.c
 !Eblock/blk-barrier.c
 !Eblock/blk-tag.c
 !Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
   </chapter>
 
   <chapter id="chrdev">
index c23cab1..7257676 100644
@@ -30,12 +30,18 @@ write_expire        (in ms)
 Similar to read_expire mentioned above, but for writes.
 
 
-fifo_batch
+fifo_batch     (number of requests)
 ----------
 
-When a read request expires its deadline, we must move some requests from
-the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move.
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order.  To limit extra seeking,
+deadline expiries are only checked between batches.  fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput.  When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour).  Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
 
 
 writes_starved (number of dispatches)
index a27ebe6..bfe7304 100644
@@ -4,8 +4,8 @@
 
 obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
-                       blk-exec.o blk-merge.o blk-timeout.o ioctl.o genhd.o \
-                       scsi_ioctl.o cmd-filter.o
+                       blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+                       ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
index cf4eb0e..80af925 100644
@@ -462,7 +462,7 @@ static void as_antic_stop(struct as_data *ad)
                        del_timer(&ad->antic_timer);
                ad->antic_status = ANTIC_FINISHED;
                /* see as_work_handler */
-               kblockd_schedule_work(&ad->antic_work);
+               kblockd_schedule_work(ad->q, &ad->antic_work);
        }
 }
 
@@ -483,7 +483,7 @@ static void as_antic_timeout(unsigned long data)
                aic = ad->io_context->aic;
 
                ad->antic_status = ANTIC_FINISHED;
-               kblockd_schedule_work(&ad->antic_work);
+               kblockd_schedule_work(q, &ad->antic_work);
 
                if (aic->ttime_samples == 0) {
                        /* process anticipated on has exited or timed out*/
@@ -844,7 +844,7 @@ static void as_completed_request(struct request_queue *q, struct request *rq)
        if (ad->changed_batch && ad->nr_dispatched == 1) {
                ad->current_batch_expires = jiffies +
                                        ad->batch_expire[ad->batch_data_dir];
-               kblockd_schedule_work(&ad->antic_work);
+               kblockd_schedule_work(q, &ad->antic_work);
                ad->changed_batch = 0;
 
                if (ad->batch_data_dir == REQ_SYNC)
index a09ead1..1eb5743 100644
@@ -315,3 +315,72 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
        return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_flush);
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       }
+
+       bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev:      blockdev to issue discard for
+ * @sector:    start sector
+ * @nr_sects:  number of sectors to discard
+ *
+ * Description:
+ *    Issue a discard request for the sectors in question. Does not wait.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+                        unsigned nr_sects)
+{
+       struct request_queue *q;
+       struct bio *bio;
+       int ret = 0;
+
+       if (bdev->bd_disk == NULL)
+               return -ENXIO;
+
+       q = bdev_get_queue(bdev);
+       if (!q)
+               return -ENXIO;
+
+       if (!q->prepare_discard_fn)
+               return -EOPNOTSUPP;
+
+       while (nr_sects && !ret) {
+               bio = bio_alloc(GFP_KERNEL, 0);
+               if (!bio)
+                       return -ENOMEM;
+
+               bio->bi_end_io = blkdev_discard_end_io;
+               bio->bi_bdev = bdev;
+
+               bio->bi_sector = sector;
+
+               if (nr_sects > q->max_hw_sectors) {
+                       bio->bi_size = q->max_hw_sectors << 9;
+                       nr_sects -= q->max_hw_sectors;
+                       sector += q->max_hw_sectors;
+               } else {
+                       bio->bi_size = nr_sects << 9;
+                       nr_sects = 0;
+               }
+               bio_get(bio);
+               submit_bio(DISCARD_BARRIER, bio);
+
+               /* Check if it failed immediately */
+               if (bio_flagged(bio, BIO_EOPNOTSUPP))
+                       ret = -EOPNOTSUPP;
+               else if (!bio_flagged(bio, BIO_UPTODATE))
+                       ret = -EIO;
+               bio_put(bio);
+       }
+       return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
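
For context, a minimal caller sketch of the blkdev_issue_discard() helper added above (the function name and error handling here are illustrative assumptions, not part of the patch):

    #include <linux/blkdev.h>

    /* Hypothetical filesystem-side helper: hint the device that the
     * nr_sects sectors starting at 'start' are no longer in use.
     * -EOPNOTSUPP just means no prepare_discard_fn is registered, so
     * the hint is dropped rather than treated as an I/O error. */
    static int example_discard_extent(struct block_device *bdev,
                                      sector_t start, unsigned nr_sects)
    {
            int ret = blkdev_issue_discard(bdev, start, nr_sects);

            if (ret == -EOPNOTSUPP)
                    ret = 0;
            return ret;
    }
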
index f563303..6d0e8a2 100644
@@ -26,8 +26,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -50,8 +48,6 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
 static void drive_stat_acct(struct request *rq, int new_io)
 {
        struct hd_struct *part;
@@ -113,8 +109,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        memset(rq, 0, sizeof(*rq));
 
        INIT_LIST_HEAD(&rq->queuelist);
-       INIT_LIST_HEAD(&rq->donelist);
        INIT_LIST_HEAD(&rq->timeout_list);
+       rq->cpu = -1;
        rq->q = q;
        rq->sector = rq->hard_sector = (sector_t) -1;
        INIT_HLIST_NODE(&rq->hash);
@@ -309,7 +305,7 @@ void blk_unplug_timeout(unsigned long data)
        blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
                                q->rq.count[READ] + q->rq.count[WRITE]);
 
-       kblockd_schedule_work(&q->unplug_work);
+       kblockd_schedule_work(q, &q->unplug_work);
 }
 
 void blk_unplug(struct request_queue *q)
@@ -326,6 +322,21 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+       /*
+        * one level of recursion is ok and is much faster than kicking
+        * the unplug handling
+        */
+       if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+               q->request_fn(q);
+               queue_flag_clear(QUEUE_FLAG_REENTER, q);
+       } else {
+               queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+               kblockd_schedule_work(q, &q->unplug_work);
+       }
+}
+
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -340,18 +351,7 @@ void blk_start_queue(struct request_queue *q)
        WARN_ON(!irqs_disabled());
 
        queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-
-       /*
-        * one level of recursion is ok and is much faster than kicking
-        * the unplug handling
-        */
-       if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-               q->request_fn(q);
-               queue_flag_clear(QUEUE_FLAG_REENTER, q);
-       } else {
-               blk_plug_device(q);
-               kblockd_schedule_work(&q->unplug_work);
-       }
+       blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -409,15 +409,8 @@ void __blk_run_queue(struct request_queue *q)
         * Only recurse once to avoid overrunning the stack, let the unplug
         * handling reinvoke the handler shortly if we already got there.
         */
-       if (!elv_queue_empty(q)) {
-               if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-                       q->request_fn(q);
-                       queue_flag_clear(QUEUE_FLAG_REENTER, q);
-               } else {
-                       blk_plug_device(q);
-                       kblockd_schedule_work(&q->unplug_work);
-               }
-       }
+       if (!elv_queue_empty(q))
+               blk_invoke_request_fn(q);
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
@@ -442,6 +435,14 @@ void blk_put_queue(struct request_queue *q)
 
 void blk_cleanup_queue(struct request_queue *q)
 {
+       /*
+        * We know we have process context here, so we can be a little
+        * cautious and ensure that pending block actions on this device
+        * are done before moving on. Going into this function, we should
+        * not have processes doing IO to this device.
+        */
+       blk_sync_queue(q);
+
        mutex_lock(&q->sysfs_lock);
        queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
        mutex_unlock(&q->sysfs_lock);
@@ -534,7 +535,7 @@ EXPORT_SYMBOL(blk_alloc_queue_node);
  *    request queue; this lock will be taken also from interrupt context, so irq
  *    disabling is needed for it.
  *
- *    Function returns a pointer to the initialized request queue, or NULL if
+ *    Function returns a pointer to the initialized request queue, or %NULL if
  *    it didn't succeed.
  *
  * Note:
@@ -572,7 +573,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
        q->request_fn           = rfn;
        q->prep_rq_fn           = NULL;
        q->unplug_fn            = generic_unplug_device;
-       q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER);
+       q->queue_flags          = (1 << QUEUE_FLAG_CLUSTER |
+                                  1 << QUEUE_FLAG_STACKABLE);
        q->queue_lock           = lock;
 
        blk_queue_segment_boundary(q, 0xffffffff);
@@ -627,10 +629,6 @@ blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
 
        blk_rq_init(q, rq);
 
-       /*
-        * first three bits are identical in rq->cmd_flags and bio->bi_rw,
-        * see bio.h and blkdev.h
-        */
        rq->cmd_flags = rw | REQ_ALLOCED;
 
        if (priv) {
@@ -922,7 +920,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL(blk_requeue_request);
 
 /**
- * blk_insert_request - insert a special request in to a request queue
+ * blk_insert_request - insert a special request into a request queue
  * @q:         request queue where request should be inserted
  * @rq:                request to be inserted
  * @at_head:   insert request at head or tail of queue
@@ -932,8 +930,8 @@ EXPORT_SYMBOL(blk_requeue_request);
  *    Many block devices need to execute commands asynchronously, so they don't
  *    block the whole kernel from preemption during request execution.  This is
  *    accomplished normally by inserting aritficial requests tagged as
- *    REQ_SPECIAL in to the corresponding request queue, and letting them be
- *    scheduled for actual execution by the request queue.
+ *    REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
+ *    be scheduled for actual execution by the request queue.
  *
  *    We have the option of inserting the head or the tail of the queue.
  *    Typically we use the tail for new ioctls and so forth.  We use the head
@@ -1075,6 +1073,7 @@ EXPORT_SYMBOL(blk_put_request);
 
 void init_request_from_bio(struct request *req, struct bio *bio)
 {
+       req->cpu = bio->bi_comp_cpu;
        req->cmd_type = REQ_TYPE_FS;
 
        /*
@@ -1093,7 +1092,12 @@ void init_request_from_bio(struct request *req, struct bio *bio)
        /*
         * REQ_BARRIER implies no merging, but lets make it explicit
         */
-       if (unlikely(bio_barrier(bio)))
+       if (unlikely(bio_discard(bio))) {
+               req->cmd_flags |= REQ_DISCARD;
+               if (bio_barrier(bio))
+                       req->cmd_flags |= REQ_SOFTBARRIER;
+               req->q->prepare_discard_fn(req->q, req);
+       } else if (unlikely(bio_barrier(bio)))
                req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
 
        if (bio_sync(bio))
@@ -1111,7 +1115,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
        struct request *req;
-       int el_ret, nr_sectors, barrier, err;
+       int el_ret, nr_sectors, barrier, discard, err;
        const unsigned short prio = bio_prio(bio);
        const int sync = bio_sync(bio);
        int rw_flags;
@@ -1126,7 +1130,14 @@ static int __make_request(struct request_queue *q, struct bio *bio)
        blk_queue_bounce(q, &bio);
 
        barrier = bio_barrier(bio);
-       if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+       if (unlikely(barrier) && bio_has_data(bio) &&
+           (q->next_ordered == QUEUE_ORDERED_NONE)) {
+               err = -EOPNOTSUPP;
+               goto end_io;
+       }
+
+       discard = bio_discard(bio);
+       if (unlikely(discard) && !q->prepare_discard_fn) {
                err = -EOPNOTSUPP;
                goto end_io;
        }
@@ -1150,6 +1161,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                req->biotail = bio;
                req->nr_sectors = req->hard_nr_sectors += nr_sectors;
                req->ioprio = ioprio_best(req->ioprio, prio);
+               if (!blk_rq_cpu_valid(req))
+                       req->cpu = bio->bi_comp_cpu;
                drive_stat_acct(req, 0);
                if (!attempt_back_merge(q, req))
                        elv_merged_request(q, req, el_ret);
@@ -1177,6 +1190,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                req->sector = req->hard_sector = bio->bi_sector;
                req->nr_sectors = req->hard_nr_sectors += nr_sectors;
                req->ioprio = ioprio_best(req->ioprio, prio);
+               if (!blk_rq_cpu_valid(req))
+                       req->cpu = bio->bi_comp_cpu;
                drive_stat_acct(req, 0);
                if (!attempt_front_merge(q, req))
                        elv_merged_request(q, req, el_ret);
@@ -1212,13 +1227,15 @@ get_rq:
        init_request_from_bio(req, bio);
 
        spin_lock_irq(q->queue_lock);
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
+           bio_flagged(bio, BIO_CPU_AFFINE))
+               req->cpu = blk_cpu_to_group(smp_processor_id());
        if (elv_queue_empty(q))
                blk_plug_device(q);
        add_request(q, req);
 out:
        if (sync)
                __generic_unplug_device(q);
-
        spin_unlock_irq(q->queue_lock);
        return 0;
 
@@ -1326,7 +1343,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
 }
 
 /**
- * generic_make_request: hand a buffer to its device driver for I/O
+ * generic_make_request - hand a buffer to its device driver for I/O
  * @bio:  The bio describing the location in memory and on the device.
  *
  * generic_make_request() is used to make I/O requests of block
@@ -1421,7 +1438,8 @@ end_io:
 
                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;
-               if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+               if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
+                   (bio_discard(bio) && !q->prepare_discard_fn)) {
                        err = -EOPNOTSUPP;
                        goto end_io;
                }
@@ -1483,13 +1501,13 @@ void generic_make_request(struct bio *bio)
 EXPORT_SYMBOL(generic_make_request);
 
 /**
- * submit_bio: submit a bio to the block device layer for I/O
+ * submit_bio - submit a bio to the block device layer for I/O
  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
  * @bio: The &struct bio which describes the I/O
  *
  * submit_bio() is very similar in purpose to generic_make_request(), and
  * uses that function to do most of the work. Both are fairly rough
- * interfaces, @bio must be presetup and ready for I/O.
+ * interfaces; @bio must be presetup and ready for I/O.
  *
  */
 void submit_bio(int rw, struct bio *bio)
@@ -1502,11 +1520,7 @@ void submit_bio(int rw, struct bio *bio)
         * If it's a regular read/write or a barrier with data attached,
         * go through the normal accounting stuff before submission.
         */
-       if (!bio_empty_barrier(bio)) {
-
-               BIO_BUG_ON(!bio->bi_size);
-               BIO_BUG_ON(!bio->bi_io_vec);
-
+       if (bio_has_data(bio)) {
                if (rw & WRITE) {
                        count_vm_events(PGPGOUT, count);
                } else {
@@ -1528,15 +1542,33 @@ void submit_bio(int rw, struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
-/*
- * Check a request for queue limits
+/**
+ * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * @q:  the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ *    @rq may have been made based on weaker limitations of upper-level queues
+ *    in request stacking drivers, and it may violate the limitation of @q.
+ *    Since the block layer and the underlying device driver trust @rq
+ *    after it is inserted to @q, it should be checked against @q before
+ *    the insertion using this generic function.
+ *
+ *    This function should also be useful for request stacking drivers
+ *    in some cases below, so export this function.
+ *    Request stacking drivers like request-based dm may change the queue
+ *    limits while requests are in the queue (e.g. dm's table swapping).
+ *    Such request stacking drivers should check those requests against
+ *    the new queue limits again when they dispatch those requests,
+ *    although such checks are also done against the old queue limits
+ *    when submitting requests.
  */
-static int check_queue_limit(struct request_queue *q, struct request *rq)
+int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
        if (rq->nr_sectors > q->max_sectors ||
-           rq->data_len >> 9 > q->max_hw_sectors) {
+           rq->data_len > q->max_hw_sectors << 9) {
                printk(KERN_ERR "%s: over max size limit.\n", __func__);
-               return 1;
+               return -EIO;
        }
 
        /*
@@ -1547,30 +1579,31 @@ static int check_queue_limit(struct request_queue *q, struct request *rq)
         */
        blk_recalc_rq_segments(rq);
        if (rq->nr_phys_segments > q->max_phys_segments ||
-           rq->nr_hw_segments > q->max_hw_segments) {
+           rq->nr_phys_segments > q->max_hw_segments) {
                printk(KERN_ERR "%s: over max segments limit.\n", __func__);
-               return 1;
+               return -EIO;
        }
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 
 /**
- * blk_submit_request - Helper for stacking drivers to submit the request
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
  * @q:  the queue to submit the request
  * @rq: the request being queued
- **/
-void blk_submit_request(struct request_queue *q, struct request *rq)
+ */
+int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
        unsigned long flags;
 
-       if (check_queue_limit(q, rq))
-               goto end_io;
+       if (blk_rq_check_limits(q, rq))
+               return -EIO;
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
-       if (rq->rq_disk && rq->rq_disk->flags & GENHD_FL_FAIL &&
+       if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
            should_fail(&fail_make_request, blk_rq_bytes(rq)))
-               goto end_io;
+               return -EIO;
 #endif
 
        spin_lock_irqsave(q->queue_lock, flags);
@@ -1586,17 +1619,14 @@ void blk_submit_request(struct request_queue *q, struct request *rq)
 
        spin_unlock_irqrestore(q->queue_lock, flags);
 
-       return;
-
-end_io:
-       blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+       return 0;
 }
-EXPORT_SYMBOL_GPL(blk_submit_request);
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -1604,8 +1634,8 @@ EXPORT_SYMBOL_GPL(blk_submit_request);
  *     for the next range of segments (if any) in the cluster.
  *
  * Return:
- *     0 - we are done with this request, call end_that_request_last()
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request, call end_that_request_last()
+ *     %1 - still buffers pending for this request
  **/
 static int __end_that_request_first(struct request *req, int error,
                                    int nr_bytes)
@@ -1616,7 +1646,7 @@ static int __end_that_request_first(struct request *req, int error,
        blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
 
        /*
-        * for a REQ_BLOCK_PC request, we want to carry any eventual
+        * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
         * sense key with us all the way through
         */
        if (!blk_pc_request(req))
@@ -1718,100 +1748,6 @@ static int __end_that_request_first(struct request *req, int error,
 }
 
 /*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
-       struct list_head *cpu_list, local_list;
-
-       local_irq_disable();
-       cpu_list = &__get_cpu_var(blk_cpu_done);
-       list_replace_init(cpu_list, &local_list);
-       local_irq_enable();
-
-       while (!list_empty(&local_list)) {
-               struct request *rq;
-
-               rq = list_entry(local_list.next, struct request, donelist);
-               list_del_init(&rq->donelist);
-               rq->q->softirq_done_fn(rq);
-       }
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
-                                   unsigned long action, void *hcpu)
-{
-       /*
-        * If a CPU goes away, splice its entries to the current CPU
-        * and trigger a run of the softirq
-        */
-       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
-               int cpu = (unsigned long) hcpu;
-
-               local_irq_disable();
-               list_splice_init(&per_cpu(blk_cpu_done, cpu),
-                                &__get_cpu_var(blk_cpu_done));
-               raise_softirq_irqoff(BLOCK_SOFTIRQ);
-               local_irq_enable();
-       }
-
-       return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
-       .notifier_call  = blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-
-void __blk_complete_request(struct request *req)
-{
-       struct list_head *cpu_list;
-       unsigned long flags;
-
-       BUG_ON(!req->q->softirq_done_fn);
-
-       local_irq_save(flags);
-
-       cpu_list = &__get_cpu_var(blk_cpu_done);
-       list_add_tail(&req->donelist, cpu_list);
-       raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
-       local_irq_restore(flags);
-}
-
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
-       if (!blk_mark_rq_complete(req))
-               __blk_complete_request(req);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
-/*
  * queue lock must be held
  */
 static void end_that_request_last(struct request *req, int error)
@@ -1902,11 +1838,11 @@ EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 /**
  * end_queued_request - end all I/O on a queued request
  * @rq:                the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
+ * @uptodate:  error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends all I/O on a request, and removes it from the block layer queues.
- *     Not suitable for normal IO completion, unless the driver still has
+ *     Not suitable for normal I/O completion, unless the driver still has
  *     the request attached to the block layer.
  *
  **/
@@ -1919,7 +1855,7 @@ EXPORT_SYMBOL(end_queued_request);
 /**
  * end_dequeued_request - end all I/O on a dequeued request
  * @rq:                the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
+ * @uptodate:  error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends all I/O on a request. The request must already have been
@@ -1937,14 +1873,14 @@ EXPORT_SYMBOL(end_dequeued_request);
 /**
  * end_request - end I/O on the current segment of the request
  * @req:       the request being processed
- * @uptodate:  error value or 0/1 uptodate flag
+ * @uptodate:  error value or %0/%1 uptodate flag
  *
  * Description:
  *     Ends I/O on the current segment of a request. If that is the only
  *     remaining segment, the request is also completed and freed.
  *
- *     This is a remnant of how older block drivers handled IO completions.
- *     Modern drivers typically end IO on the full request in one go, unless
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
  *     they have a residual value to account for. For that case this function
  *     isn't really useful, unless the residual just happens to be the
  *     full current segment. In other words, don't use this function in new
@@ -1962,7 +1898,7 @@ EXPORT_SYMBOL(end_request);
 static int end_that_request_data(struct request *rq, int error,
                                 unsigned int nr_bytes, unsigned int bidi_bytes)
 {
-       if (blk_fs_request(rq) || blk_pc_request(rq)) {
+       if (rq->bio) {
                if (__end_that_request_first(rq, error, nr_bytes))
                        return 1;
 
@@ -1978,12 +1914,12 @@ static int end_that_request_data(struct request *rq, int error,
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
+ * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete @rq
  * @bidi_bytes:   number of bytes to complete @rq->next_rq
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
@@ -1991,8 +1927,8 @@ static int end_that_request_data(struct request *rq, int error,
  *     If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet, it still has pending buffers.
+ *     %0 - we are done with this request
+ *     %1 - this request is not freed yet, it still has pending buffers.
  **/
 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
                      unsigned int bidi_bytes,
@@ -2020,7 +1956,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 /**
  * blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
@@ -2028,8 +1964,8 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
  *     If @rq has leftover, sets it up for the next range of segments.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
@@ -2040,22 +1976,20 @@ EXPORT_SYMBOL_GPL(blk_end_request);
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
- * @error:    0 for success, < 0 for error
+ * @error:    %0 for success, < %0 for error
  * @nr_bytes: number of bytes to complete
  *
  * Description:
  *     Must be called with queue lock held unlike blk_end_request().
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-       if (blk_fs_request(rq) || blk_pc_request(rq)) {
-               if (__end_that_request_first(rq, error, nr_bytes))
-                       return 1;
-       }
+       if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
+               return 1;
 
        add_disk_randomness(rq->rq_disk);
 
@@ -2068,7 +2002,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 /**
  * blk_end_bidi_request - Helper function for drivers to complete bidi request.
  * @rq:         the bidi request being processed
- * @error:      0 for success, < 0 for error
+ * @error:      %0 for success, < %0 for error
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
@@ -2076,8 +2010,8 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - still buffers pending for this request
+ *     %0 - we are done with this request
+ *     %1 - still buffers pending for this request
  **/
 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
                         unsigned int bidi_bytes)
@@ -2089,7 +2023,7 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 /**
  * blk_update_request - Special helper function for request stacking drivers
  * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
+ * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete @rq
  *
  * Description:
@@ -2100,20 +2034,18 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
  *     This special helper function is only for request stacking drivers
  *     (e.g. request-based dm) so that they can handle partial completion.
  *     Actual device drivers should use blk_end_request instead.
- **/
+ */
 void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
 {
        if (!end_that_request_data(rq, error, nr_bytes, 0)) {
                /*
-                * All bios in the request have been completed.
-                * Then, members of the request are not updated.
-                * Update those members to avoid double charge of diskstat
-                * when the stacking driver calls blk_end_request()
-                * to complete the request actually.
+                * These members are not updated in end_that_request_data()
+                * when all bios are completed.
+                * Update them so that the request stacking driver can find
+                * how many bytes remain in the request later.
                 */
                rq->nr_sectors = rq->hard_nr_sectors = 0;
                rq->current_nr_sectors = rq->hard_cur_sectors = 0;
-               rq->nr_phys_segments = rq->nr_hw_segments = 0;
        }
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
@@ -2121,11 +2053,11 @@ EXPORT_SYMBOL_GPL(blk_update_request);
 /**
  * blk_end_request_callback - Special helper function for tricky drivers
  * @rq:           the request being processed
- * @error:        0 for success, < 0 for error
+ * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete
  * @drv_callback: function called between completion of bios in the request
  *                and completion of the request.
- *                If the callback returns non 0, this helper returns without
+ *                If the callback returns non %0, this helper returns without
  *                completion of the request.
  *
  * Description:
@@ -2138,10 +2070,10 @@ EXPORT_SYMBOL_GPL(blk_update_request);
  *     Don't use this interface in other places anymore.
  *
  * Return:
- *     0 - we are done with this request
- *     1 - this request is not freed yet.
- *         this request still has pending buffers or
- *         the driver doesn't want to finish this request yet.
+ *     %0 - we are done with this request
+ *     %1 - this request is not freed yet.
+ *          this request still has pending buffers or
+ *          the driver doesn't want to finish this request yet.
  **/
 int blk_end_request_callback(struct request *rq, int error,
                             unsigned int nr_bytes,
@@ -2154,15 +2086,17 @@ EXPORT_SYMBOL_GPL(blk_end_request_callback);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                     struct bio *bio)
 {
-       /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+       /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
+          we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
        rq->cmd_flags |= (bio->bi_rw & 3);
 
-       rq->nr_phys_segments = bio_phys_segments(q, bio);
-       rq->nr_hw_segments = bio_hw_segments(q, bio);
+       if (bio_has_data(bio)) {
+               rq->nr_phys_segments = bio_phys_segments(q, bio);
+               rq->buffer = bio_data(bio);
+       }
        rq->current_nr_sectors = bio_cur_sectors(bio);
        rq->hard_cur_sectors = rq->current_nr_sectors;
        rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-       rq->buffer = bio_data(bio);
        rq->data_len = bio->bi_size;
 
        rq->bio = rq->biotail = bio;
@@ -2171,7 +2105,35 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
                rq->rq_disk = bio->bi_bdev->bd_disk;
 }
 
-int kblockd_schedule_work(struct work_struct *work)
+/**
+ * blk_lld_busy - Check if underlying low-level drivers of a device are busy
+ * @q : the queue of the device being checked
+ *
+ * Description:
+ *    Check if underlying low-level drivers of a device are busy.
+ *    If the drivers want to export their busy state, they must set their
+ *    own exporting function using blk_queue_lld_busy() first.
+ *
+ *    Basically, this function is used only by request stacking drivers
+ *    to stop dispatching requests to underlying devices when underlying
+ *    devices are busy.  This behavior helps more I/O merging on the queue
+ *    of the request stacking driver and prevents I/O throughput regression
+ *    on burst I/O load.
+ *
+ * Return:
+ *    0 - Not busy (The request stacking driver should dispatch request)
+ *    1 - Busy (The request stacking driver should stop dispatching request)
+ */
+int blk_lld_busy(struct request_queue *q)
+{
+       if (q->lld_busy_fn)
+               return q->lld_busy_fn(q);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blk_lld_busy);
+
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
        return queue_work(kblockd_workqueue, work);
 }
@@ -2185,8 +2147,6 @@ EXPORT_SYMBOL(kblockd_flush_work);
 
 int __init blk_dev_init(void)
 {
-       int i;
-
        kblockd_workqueue = create_workqueue("kblockd");
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");
@@ -2197,12 +2157,6 @@ int __init blk_dev_init(void)
        blk_requestq_cachep = kmem_cache_create("blkdev_queue",
                        sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
-       for_each_possible_cpu(i)
-               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
-       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
-       register_hotcpu_notifier(&blk_cpu_notifier);
-
        return 0;
 }
 
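
To show how the stacking helpers exported above fit together, here is a hedged dispatch sketch for a request-stacking driver; the names are hypothetical and this is not the request-based dm-mpath code itself:

    #include <linux/errno.h>
    #include <linux/blkdev.h>

    /* Hypothetical dispatch path: 'clone' is a request already prepared
     * for the underlying device's queue. */
    static int example_dispatch_clone(struct request_queue *lower_q,
                                      struct request *clone)
    {
            /* Ask the low-level driver whether it wants more work now. */
            if (blk_lld_busy(lower_q))
                    return -EBUSY;  /* caller requeues and retries later */

            /* blk_insert_cloned_request() re-checks the clone against
             * lower_q's limits (blk_rq_check_limits()) before queueing it. */
            return blk_insert_cloned_request(lower_q, clone);
    }
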
index 9bceff7..6af716d 100644
@@ -16,7 +16,7 @@
 /**
  * blk_end_sync_rq - executes a completion event on a request
  * @rq: request to complete
- * @error: end io status of the request
+ * @error: end I/O status of the request
  */
 static void blk_end_sync_rq(struct request *rq, int error)
 {
@@ -41,7 +41,7 @@ static void blk_end_sync_rq(struct request *rq, int error)
  * @done:      I/O completion handler
  *
  * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
+ *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution.  Don't wait for completion.
  */
 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
@@ -72,7 +72,7 @@ EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
  * @at_head:    insert request at head or tail of queue
  *
  * Description:
- *    Insert a fully prepared request at the back of the io scheduler queue
+ *    Insert a fully prepared request at the back of the I/O scheduler queue
  *    for execution and wait for completion.
  */
 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
index 3f1a847..d87606e 100644
@@ -109,8 +109,8 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg);
 
 /**
  * blk_integrity_compare - Compare integrity profile of two block devices
- * @b1:                Device to compare
- * @b2:                Device to compare
+ * @bd1:       Device to compare
+ * @bd2:       Device to compare
  *
  * Description: Meta-devices like DM and MD need to verify that all
  * sub-devices use the same integrity format before advertising to
index af37e4a..ea1bf53 100644
@@ -85,17 +85,17 @@ static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
 }
 
 /**
- * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:         request queue where request should be inserted
  * @rq:                request structure to fill
  * @ubuf:      the user buffer
  * @len:       length of user data
  *
  * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
  *    a kernel bounce buffer is used.
  *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
  *    still in process context.
  *
  *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -154,7 +154,7 @@ unmap_rq:
 EXPORT_SYMBOL(blk_rq_map_user);
 
 /**
- * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:         request queue where request should be inserted
  * @rq:                request to map data to
  * @iov:       pointer to the iovec
@@ -162,10 +162,10 @@ EXPORT_SYMBOL(blk_rq_map_user);
  * @len:       I/O byte count
  *
  * Description:
- *    Data will be mapped directly for zero copy io, if possible. Otherwise
+ *    Data will be mapped directly for zero copy I/O, if possible. Otherwise
  *    a kernel bounce buffer is used.
  *
- *    A matching blk_rq_unmap_user() must be issued at the end of io, while
+ *    A matching blk_rq_unmap_user() must be issued at the end of I/O, while
  *    still in process context.
  *
  *    Note: The mapped bio may need to be bounced through blk_queue_bounce()
@@ -224,7 +224,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
  * Description:
  *    Unmap a rq previously mapped by blk_rq_map_user(). The caller must
  *    supply the original rq->bio from the blk_rq_map_user() return, since
- *    the io completion may have changed rq->bio.
+ *    the I/O completion may have changed rq->bio.
  */
 int blk_rq_unmap_user(struct bio *bio)
 {
@@ -250,7 +250,7 @@ int blk_rq_unmap_user(struct bio *bio)
 EXPORT_SYMBOL(blk_rq_unmap_user);
 
 /**
- * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
  * @q:         request queue where request should be inserted
  * @rq:                request to fill
  * @kbuf:      the kernel buffer
index 5efc9e7..ad271c0 100644
@@ -11,7 +11,7 @@
 
 void blk_recalc_rq_sectors(struct request *rq, int nsect)
 {
-       if (blk_fs_request(rq)) {
+       if (blk_fs_request(rq) || blk_discard_rq(rq)) {
                rq->hard_sector += nsect;
                rq->hard_nr_sectors -= nsect;
 
@@ -41,12 +41,9 @@ void blk_recalc_rq_sectors(struct request *rq, int nsect)
 void blk_recalc_rq_segments(struct request *rq)
 {
        int nr_phys_segs;
-       int nr_hw_segs;
        unsigned int phys_size;
-       unsigned int hw_size;
        struct bio_vec *bv, *bvprv = NULL;
        int seg_size;
-       int hw_seg_size;
        int cluster;
        struct req_iterator iter;
        int high, highprv = 1;
@@ -56,8 +53,8 @@ void blk_recalc_rq_segments(struct request *rq)
                return;
 
        cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
-       hw_seg_size = seg_size = 0;
-       phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+       seg_size = 0;
+       phys_size = nr_phys_segs = 0;
        rq_for_each_segment(bv, rq, iter) {
                /*
                 * the trick here is making sure that a high page is never
@@ -66,7 +63,7 @@ void blk_recalc_rq_segments(struct request *rq)
                 */
                high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
                if (high || highprv)
-                       goto new_hw_segment;
+                       goto new_segment;
                if (cluster) {
                        if (seg_size + bv->bv_len > q->max_segment_size)
                                goto new_segment;
@@ -74,40 +71,19 @@ void blk_recalc_rq_segments(struct request *rq)
                                goto new_segment;
                        if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
                                goto new_segment;
-                       if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-                               goto new_hw_segment;
 
                        seg_size += bv->bv_len;
-                       hw_seg_size += bv->bv_len;
                        bvprv = bv;
                        continue;
                }
 new_segment:
-               if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
-                   !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
-                       hw_seg_size += bv->bv_len;
-               else {
-new_hw_segment:
-                       if (nr_hw_segs == 1 &&
-                           hw_seg_size > rq->bio->bi_hw_front_size)
-                               rq->bio->bi_hw_front_size = hw_seg_size;
-                       hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
-                       nr_hw_segs++;
-               }
-
                nr_phys_segs++;
                bvprv = bv;
                seg_size = bv->bv_len;
                highprv = high;
        }
 
-       if (nr_hw_segs == 1 &&
-           hw_seg_size > rq->bio->bi_hw_front_size)
-               rq->bio->bi_hw_front_size = hw_seg_size;
-       if (hw_seg_size > rq->biotail->bi_hw_back_size)
-               rq->biotail->bi_hw_back_size = hw_seg_size;
        rq->nr_phys_segments = nr_phys_segs;
-       rq->nr_hw_segments = nr_hw_segs;
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
@@ -120,7 +96,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
        blk_recalc_rq_segments(&rq);
        bio->bi_next = nxt;
        bio->bi_phys_segments = rq.nr_phys_segments;
-       bio->bi_hw_segments = rq.nr_hw_segments;
        bio->bi_flags |= (1 << BIO_SEG_VALID);
 }
 EXPORT_SYMBOL(blk_recount_segments);
@@ -131,13 +106,17 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
        if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
                return 0;
 
-       if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
-               return 0;
        if (bio->bi_size + nxt->bi_size > q->max_segment_size)
                return 0;
 
+       if (!bio_has_data(bio))
+               return 1;
+
+       if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+               return 0;
+
        /*
-        * bio and nxt are contigous in memory, check if the queue allows
+        * bio and nxt are contiguous in memory; check if the queue allows
         * these two to be merged into one
         */
        if (BIO_SEG_BOUNDARY(q, bio, nxt))
@@ -146,22 +125,6 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
        return 0;
 }
 
-static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
-                                struct bio *nxt)
-{
-       if (!bio_flagged(bio, BIO_SEG_VALID))
-               blk_recount_segments(q, bio);
-       if (!bio_flagged(nxt, BIO_SEG_VALID))
-               blk_recount_segments(q, nxt);
-       if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
-           BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
-               return 0;
-       if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
-               return 0;
-
-       return 1;
-}
-
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -275,10 +238,9 @@ static inline int ll_new_hw_segment(struct request_queue *q,
                                    struct request *req,
                                    struct bio *bio)
 {
-       int nr_hw_segs = bio_hw_segments(q, bio);
        int nr_phys_segs = bio_phys_segments(q, bio);
 
-       if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
+       if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
            || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
                req->cmd_flags |= REQ_NOMERGE;
                if (req == q->last_merge)
@@ -290,7 +252,6 @@ static inline int ll_new_hw_segment(struct request_queue *q,
         * This will form the start of a new hw segment.  Bump both
         * counters.
         */
-       req->nr_hw_segments += nr_hw_segs;
        req->nr_phys_segments += nr_phys_segs;
        return 1;
 }
@@ -299,7 +260,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
                     struct bio *bio)
 {
        unsigned short max_sectors;
-       int len;
 
        if (unlikely(blk_pc_request(req)))
                max_sectors = q->max_hw_sectors;
@@ -316,19 +276,6 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
                blk_recount_segments(q, req->biotail);
        if (!bio_flagged(bio, BIO_SEG_VALID))
                blk_recount_segments(q, bio);
-       len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
-       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
-           && !BIOVEC_VIRT_OVERSIZE(len)) {
-               int mergeable =  ll_new_mergeable(q, req, bio);
-
-               if (mergeable) {
-                       if (req->nr_hw_segments == 1)
-                               req->bio->bi_hw_front_size = len;
-                       if (bio->bi_hw_segments == 1)
-                               bio->bi_hw_back_size = len;
-               }
-               return mergeable;
-       }
 
        return ll_new_hw_segment(q, req, bio);
 }
@@ -337,7 +284,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
                      struct bio *bio)
 {
        unsigned short max_sectors;
-       int len;
 
        if (unlikely(blk_pc_request(req)))
                max_sectors = q->max_hw_sectors;
@@ -351,23 +297,10 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
                        q->last_merge = NULL;
                return 0;
        }
-       len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
        if (!bio_flagged(bio, BIO_SEG_VALID))
                blk_recount_segments(q, bio);
        if (!bio_flagged(req->bio, BIO_SEG_VALID))
                blk_recount_segments(q, req->bio);
-       if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
-           !BIOVEC_VIRT_OVERSIZE(len)) {
-               int mergeable =  ll_new_mergeable(q, req, bio);
-
-               if (mergeable) {
-                       if (bio->bi_hw_segments == 1)
-                               bio->bi_hw_front_size = len;
-                       if (req->nr_hw_segments == 1)
-                               req->biotail->bi_hw_back_size = len;
-               }
-               return mergeable;
-       }
 
        return ll_new_hw_segment(q, req, bio);
 }
@@ -376,7 +309,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
                                struct request *next)
 {
        int total_phys_segments;
-       int total_hw_segments;
 
        /*
         * First check if the either of the requests are re-queued
@@ -398,26 +330,11 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
        if (total_phys_segments > q->max_phys_segments)
                return 0;
 
-       total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
-       if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
-               int len = req->biotail->bi_hw_back_size +
-                               next->bio->bi_hw_front_size;
-               /*
-                * propagate the combined length to the end of the requests
-                */
-               if (req->nr_hw_segments == 1)
-                       req->bio->bi_hw_front_size = len;
-               if (next->nr_hw_segments == 1)
-                       next->biotail->bi_hw_back_size = len;
-               total_hw_segments--;
-       }
-
-       if (total_hw_segments > q->max_hw_segments)
+       if (total_phys_segments > q->max_hw_segments)
                return 0;
 
        /* Merge is OK... */
        req->nr_phys_segments = total_phys_segments;
-       req->nr_hw_segments = total_hw_segments;
        return 1;
 }
 
@@ -481,6 +398,8 @@ static int attempt_merge(struct request_queue *q, struct request *req,
        }
 
        req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+       if (blk_rq_cpu_valid(next))
+               req->cpu = next->cpu;
 
        __blk_put_request(q, next);
        return 1;
index 6bf2cd2..5edd818 100644
@@ -33,6 +33,23 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
 EXPORT_SYMBOL(blk_queue_prep_rq);
 
 /**
+ * blk_queue_set_discard - set a discard_sectors function for queue
+ * @q:         queue
+ * @dfn:       prepare_discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to transform a discard request into the appropriate type for the
+ * hardware. If none is registered, then discard requests are failed
+ * with %EOPNOTSUPP.
+ *
+ */
+void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
+{
+       q->prepare_discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_set_discard);
+
+/**
  * blk_queue_merge_bvec - set a merge_bvec function for queue
  * @q:         queue
  * @mbfn:      merge_bvec_fn
@@ -72,6 +89,12 @@ void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
 }
 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
 
+void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
+{
+       q->lld_busy_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+
 /**
  * blk_queue_make_request - define an alternate make_request function for a device
  * @q:  the request queue for the device to be affected
@@ -139,7 +162,7 @@ EXPORT_SYMBOL(blk_queue_make_request);
  *    Different hardware can have different requirements as to what pages
  *    it can do I/O directly to. A low level driver can call
  *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
- *    buffers for doing I/O to pages residing above @page.
+ *    buffers for doing I/O to pages residing above @dma_addr.
  **/
 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
 {
@@ -224,7 +247,7 @@ EXPORT_SYMBOL(blk_queue_max_phys_segments);
  * Description:
  *    Enables a low level driver to set an upper limit on the number of
  *    hw data segments in a request.  This would be the largest number of
- *    address/length pairs the host adapter can actually give as once
+ *    address/length pairs the host adapter can actually give at once
  *    to the device.
  **/
 void blk_queue_max_hw_segments(struct request_queue *q,
@@ -405,7 +428,7 @@ EXPORT_SYMBOL(blk_queue_segment_boundary);
  * @mask:  alignment mask
  *
  * description:
- *    set required memory and length aligment for direct dma transactions.
+ *    set required memory and length alignment for direct dma transactions.
  *    this is used when buiding direct io requests for the queue.
  *
  **/
@@ -421,7 +444,7 @@ EXPORT_SYMBOL(blk_queue_dma_alignment);
  * @mask:  alignment mask
  *
  * description:
- *    update required memory and length aligment for direct dma transactions.
+ *    update required memory and length alignment for direct dma transactions.
  *    If the requested alignment is larger than the current alignment, then
  *    the current queue alignment is updated to the new value, otherwise it
  *    is left alone.  The design of this is to allow multiple objects
@@ -438,7 +461,7 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
 }
 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
 
-static int __init blk_settings_init(void)
+int __init blk_settings_init(void)
 {
        blk_max_low_pfn = max_low_pfn - 1;
        blk_max_pfn = max_pfn - 1;
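
As a usage sketch for the new hook above: a driver opts into discard support by registering a prepare_discard function at queue-setup time. The names below ("mydrv") are hypothetical; the REQ_LB_OP_DISCARD translation simply mirrors what the mtd_blkdevs change elsewhere in this series does, and real hardware would fill in its own command instead.

static int mydrv_prepare_discard(struct request_queue *q, struct request *req)
{
	/* translate the discard into something the driver's request handler recognises */
	req->cmd_type = REQ_TYPE_LINUX_BLOCK;
	req->cmd[0] = REQ_LB_OP_DISCARD;
	return 0;	/* 0 == prepared OK, following the mtd_blkdevs usage in this series */
}

	/* in the driver's queue setup path */
	blk_queue_set_discard(q, mydrv_prepare_discard);

Without such a registration, BLKDISCARD and discard bios on that queue fail with EOPNOTSUPP, as the comment above states.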
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
new file mode 100644 (file)
index 0000000..7ab344a
--- /dev/null
@@ -0,0 +1,173 @@
+/*
+ * Functions related to softirq rq completions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include "blk.h"
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+       struct list_head *cpu_list, local_list;
+
+       local_irq_disable();
+       cpu_list = &__get_cpu_var(blk_cpu_done);
+       list_replace_init(cpu_list, &local_list);
+       local_irq_enable();
+
+       while (!list_empty(&local_list)) {
+               struct request *rq;
+
+               rq = list_entry(local_list.next, struct request, csd.list);
+               list_del_init(&rq->csd.list);
+               rq->q->softirq_done_fn(rq);
+       }
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+static void trigger_softirq(void *data)
+{
+       struct request *rq = data;
+       unsigned long flags;
+       struct list_head *list;
+
+       local_irq_save(flags);
+       list = &__get_cpu_var(blk_cpu_done);
+       list_add_tail(&rq->csd.list, list);
+
+       if (list->next == &rq->csd.list)
+               raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Setup and invoke a run of 'trigger_softirq' on the given cpu.
+ */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+       if (cpu_online(cpu)) {
+               struct call_single_data *data = &rq->csd;
+
+               data->func = trigger_softirq;
+               data->info = rq;
+               data->flags = 0;
+
+               __smp_call_function_single(cpu, data);
+               return 0;
+       }
+
+       return 1;
+}
+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+       return 1;
+}
+#endif
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self,
+                                   unsigned long action, void *hcpu)
+{
+       /*
+        * If a CPU goes away, splice its entries to the current CPU
+        * and trigger a run of the softirq
+        */
+       if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+               int cpu = (unsigned long) hcpu;
+
+               local_irq_disable();
+               list_splice_init(&per_cpu(blk_cpu_done, cpu),
+                                &__get_cpu_var(blk_cpu_done));
+               raise_softirq_irqoff(BLOCK_SOFTIRQ);
+               local_irq_enable();
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
+       .notifier_call  = blk_cpu_notify,
+};
+
+void __blk_complete_request(struct request *req)
+{
+       struct request_queue *q = req->q;
+       unsigned long flags;
+       int ccpu, cpu, group_cpu;
+
+       BUG_ON(!q->softirq_done_fn);
+
+       local_irq_save(flags);
+       cpu = smp_processor_id();
+       group_cpu = blk_cpu_to_group(cpu);
+
+       /*
+        * Select completion CPU
+        */
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+               ccpu = req->cpu;
+       else
+               ccpu = cpu;
+
+       if (ccpu == cpu || ccpu == group_cpu) {
+               struct list_head *list;
+do_local:
+               list = &__get_cpu_var(blk_cpu_done);
+               list_add_tail(&req->csd.list, list);
+
+               /*
+                * if the list only contains our just added request,
+                * signal a raise of the softirq. If there are already
+                * entries there, someone already raised the irq but it
+                * hasn't run yet.
+                */
+               if (list->next == &req->csd.list)
+                       raise_softirq_irqoff(BLOCK_SOFTIRQ);
+       } else if (raise_blk_irq(ccpu, req))
+               goto do_local;
+
+       local_irq_restore(flags);
+}
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req:      the request being processed
+ *
+ * Description:
+ *     Ends all I/O on a request. It does not handle partial completions,
+ *     unless the driver actually implements this in its completion callback
+ *     through requeueing. The actual completion happens out-of-order,
+ *     through a softirq handler. The user must have registered a completion
+ *     callback through blk_queue_softirq_done().
+ **/
+void blk_complete_request(struct request *req)
+{
+       if (!blk_mark_rq_complete(req))
+               __blk_complete_request(req);
+}
+EXPORT_SYMBOL(blk_complete_request);
+
+__init int blk_softirq_init(void)
+{
+       int i;
+
+       for_each_possible_cpu(i)
+               INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+       open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+       register_hotcpu_notifier(&blk_cpu_notifier);
+       return 0;
+}
+subsys_initcall(blk_softirq_init);
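
A minimal usage sketch of this completion path (hypothetical "mydrv" names, simplified error mapping): the driver registers a softirq-done handler once at queue setup, and its hard-IRQ path only marks the request complete; the real completion then runs in BLOCK_SOFTIRQ context on the CPU chosen by __blk_complete_request() above.

static void mydrv_softirq_done(struct request *rq)
{
	/* softirq context: finish the whole request */
	__blk_end_request(rq, rq->errors ? -EIO : 0, blk_rq_bytes(rq));
}

	/* queue setup */
	blk_queue_softirq_done(q, mydrv_softirq_done);

	/* hard-IRQ handler: just defer the completion work */
	blk_complete_request(rq);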
index a00ef7d..196f079 100644 (file)
@@ -156,6 +156,30 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
        return ret;
 }
 
+static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+{
+       unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+
+       return queue_var_show(set != 0, page);
+}
+
+static ssize_t
+queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+{
+       ssize_t ret = -EINVAL;
+#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+       unsigned long val;
+
+       ret = queue_var_store(&val, page, count);
+       spin_lock_irq(q->queue_lock);
+       if (val)
+               queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+       else
+               queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+       spin_unlock_irq(q->queue_lock);
+#endif
+       return ret;
+}
 
 static struct queue_sysfs_entry queue_requests_entry = {
        .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
@@ -197,6 +221,12 @@ static struct queue_sysfs_entry queue_nomerges_entry = {
        .store = queue_nomerges_store,
 };
 
+static struct queue_sysfs_entry queue_rq_affinity_entry = {
+       .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+       .show = queue_rq_affinity_show,
+       .store = queue_rq_affinity_store,
+};
+
 static struct attribute *default_attrs[] = {
        &queue_requests_entry.attr,
        &queue_ra_entry.attr,
@@ -205,6 +235,7 @@ static struct attribute *default_attrs[] = {
        &queue_iosched_entry.attr,
        &queue_hw_sector_size_entry.attr,
        &queue_nomerges_entry.attr,
+       &queue_rq_affinity_entry.attr,
        NULL,
 };
 
@@ -326,7 +357,6 @@ int blk_register_queue(struct gendisk *disk)
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(blk_register_queue);
 
 void blk_unregister_queue(struct gendisk *disk)
 {
index ed5166f..8a99688 100644 (file)
@@ -29,7 +29,7 @@ EXPORT_SYMBOL(blk_queue_find_tag);
  * __blk_free_tags - release a given set of tag maintenance info
  * @bqt:       the tag map to free
  *
- * Tries to free the specified @bqt@.  Returns true if it was
+ * Tries to free the specified @bqt.  Returns true if it was
  * actually freed and false if there are still references using it
  */
 static int __blk_free_tags(struct blk_queue_tag *bqt)
@@ -78,7 +78,7 @@ void __blk_queue_free_tags(struct request_queue *q)
  * blk_free_tags - release a given set of tag maintenance info
  * @bqt:       the tag map to free
  *
- * For externally managed @bqt@ frees the map.  Callers of this
+ * For externally managed @bqt frees the map.  Callers of this
  * function must guarantee to have released all the queues that
  * might have been using this tag map.
  */
@@ -94,7 +94,7 @@ EXPORT_SYMBOL(blk_free_tags);
  * @q:  the request queue for the device
  *
  *  Notes:
- *     This is used to disabled tagged queuing to a device, yet leave
+ *     This is used to disable tagged queuing to a device, yet leave
  *     queue in function.
  **/
 void blk_queue_free_tags(struct request_queue *q)
@@ -271,7 +271,7 @@ EXPORT_SYMBOL(blk_queue_resize_tags);
  * @rq: the request that has completed
  *
  *  Description:
- *    Typically called when end_that_request_first() returns 0, meaning
+ *    Typically called when end_that_request_first() returns %0, meaning
  *    all transfers have been done for a request. It's important to call
  *    this function before end_that_request_last(), as that will put the
  *    request back on the free list thus corrupting the internal tag list.
index ea2df0a..a4f4a50 100644 (file)
@@ -83,4 +83,16 @@ static inline int queue_congestion_off_threshold(struct request_queue *q)
 
 #endif /* BLK_DEV_INTEGRITY */
 
+static inline int blk_cpu_to_group(int cpu)
+{
+#ifdef CONFIG_SCHED_MC
+       cpumask_t mask = cpu_coregroup_map(cpu);
+       return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
+       return first_cpu(per_cpu(cpu_sibling_map, cpu));
+#else
+       return cpu;
+#endif
+}
+
 #endif
index eb9651c..9e0212c 100644 (file)
@@ -111,23 +111,9 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
  */
 static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
 
-/*
- * Bio action bits of interest
- */
-static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
-
-/*
- * More could be added as needed, taking care to increment the decrementer
- * to get correct indexing
- */
-#define trace_barrier_bit(rw)  \
-       (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
-#define trace_sync_bit(rw)     \
-       (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
-#define trace_ahead_bit(rw)    \
-       (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
-#define trace_meta_bit(rw)     \
-       (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
+         (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
 
 /*
  * The worker for the various blk_add_trace*() types. Fills out a
@@ -147,10 +133,11 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
                return;
 
        what |= ddir_act[rw & WRITE];
-       what |= bio_act[trace_barrier_bit(rw)];
-       what |= bio_act[trace_sync_bit(rw)];
-       what |= bio_act[trace_ahead_bit(rw)];
-       what |= bio_act[trace_meta_bit(rw)];
+       what |= MASK_TC_BIT(rw, BARRIER);
+       what |= MASK_TC_BIT(rw, SYNC);
+       what |= MASK_TC_BIT(rw, AHEAD);
+       what |= MASK_TC_BIT(rw, META);
+       what |= MASK_TC_BIT(rw, DISCARD);
 
        pid = tsk->pid;
        if (unlikely(act_log_check(bt, what, sector, pid)))
index 1e2aff8..494b6fd 100644 (file)
@@ -39,6 +39,7 @@ static int cfq_slice_idle = HZ / 125;
 #define CFQ_MIN_TT             (2)
 
 #define CFQ_SLICE_SCALE                (5)
+#define CFQ_HW_QUEUE_MIN       (5)
 
 #define RQ_CIC(rq)             \
        ((struct cfq_io_context *) (rq)->elevator_private)
@@ -86,7 +87,14 @@ struct cfq_data {
 
        int rq_in_driver;
        int sync_flight;
+
+       /*
+        * queue-depth detection
+        */
+       int rq_queued;
        int hw_tag;
+       int hw_tag_samples;
+       int rq_in_driver_peak;
 
        /*
         * idle window management
@@ -244,7 +252,7 @@ static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
        if (cfqd->busy_queues) {
                cfq_log(cfqd, "schedule dispatch");
-               kblockd_schedule_work(&cfqd->unplug_work);
+               kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
        }
 }
 
@@ -654,15 +662,6 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
        cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
                                                cfqd->rq_in_driver);
 
-       /*
-        * If the depth is larger 1, it really could be queueing. But lets
-        * make the mark a little higher - idling could still be good for
-        * low queueing, and a low queueing number could also just indicate
-        * a SCSI mid layer like behaviour where limit+1 is often seen.
-        */
-       if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
-               cfqd->hw_tag = 1;
-
        cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
 }
 
@@ -686,6 +685,7 @@ static void cfq_remove_request(struct request *rq)
        list_del_init(&rq->queuelist);
        cfq_del_rq_rb(rq);
 
+       cfqq->cfqd->rq_queued--;
        if (rq_is_meta(rq)) {
                WARN_ON(!cfqq->meta_pending);
                cfqq->meta_pending--;
@@ -1833,6 +1833,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 {
        struct cfq_io_context *cic = RQ_CIC(rq);
 
+       cfqd->rq_queued++;
        if (rq_is_meta(rq))
                cfqq->meta_pending++;
 
@@ -1880,6 +1881,31 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
        cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void cfq_update_hw_tag(struct cfq_data *cfqd)
+{
+       if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
+               cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+
+       if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+           cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+               return;
+
+       if (cfqd->hw_tag_samples++ < 50)
+               return;
+
+       if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+               cfqd->hw_tag = 1;
+       else
+               cfqd->hw_tag = 0;
+
+       cfqd->hw_tag_samples = 0;
+       cfqd->rq_in_driver_peak = 0;
+}
+
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
        struct cfq_queue *cfqq = RQ_CFQQ(rq);
@@ -1890,6 +1916,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq)
        now = jiffies;
        cfq_log_cfqq(cfqd, cfqq, "complete");
 
+       cfq_update_hw_tag(cfqd);
+
        WARN_ON(!cfqd->rq_in_driver);
        WARN_ON(!cfqq->dispatched);
        cfqd->rq_in_driver--;
@@ -2200,6 +2228,7 @@ static void *cfq_init_queue(struct request_queue *q)
        cfqd->cfq_slice[1] = cfq_slice_sync;
        cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
        cfqd->cfq_slice_idle = cfq_slice_idle;
+       cfqd->hw_tag = 1;
 
        return cfqd;
 }
index c23177e..1e559fb 100644 (file)
@@ -788,6 +788,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
        case BLKFLSBUF:
        case BLKROSET:
+       case BLKDISCARD:
        /*
         * the ones below are implemented in blkdev_locked_ioctl,
         * but we call blkdev_ioctl, which gets the lock for us
index 342448c..fd31117 100644 (file)
@@ -33,7 +33,7 @@ struct deadline_data {
         */
        struct rb_root sort_list[2];    
        struct list_head fifo_list[2];
-       
+
        /*
         * next in sort order. read, write or both are NULL
         */
@@ -53,7 +53,11 @@ struct deadline_data {
 
 static void deadline_move_request(struct deadline_data *, struct request *);
 
-#define RQ_RB_ROOT(dd, rq)     (&(dd)->sort_list[rq_data_dir((rq))])
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+       return &dd->sort_list[rq_data_dir(rq)];
+}
 
 /*
  * get the request after `rq' in sector-sorted order
@@ -72,15 +76,11 @@ deadline_latter_request(struct request *rq)
 static void
 deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
 {
-       struct rb_root *root = RQ_RB_ROOT(dd, rq);
+       struct rb_root *root = deadline_rb_root(dd, rq);
        struct request *__alias;
 
-retry:
-       __alias = elv_rb_add(root, rq);
-       if (unlikely(__alias)) {
+       while (unlikely(__alias = elv_rb_add(root, rq)))
                deadline_move_request(dd, __alias);
-               goto retry;
-       }
 }
 
 static inline void
@@ -91,7 +91,7 @@ deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
        if (dd->next_rq[data_dir] == rq)
                dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-       elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
+       elv_rb_del(deadline_rb_root(dd, rq), rq);
 }
 
 /*
@@ -106,7 +106,7 @@ deadline_add_request(struct request_queue *q, struct request *rq)
        deadline_add_rq_rb(dd, rq);
 
        /*
-        * set expire time (only used for reads) and add to fifo list
+        * set expire time and add to fifo list
         */
        rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
        list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
@@ -162,7 +162,7 @@ static void deadline_merged_request(struct request_queue *q,
         * if the merge was a front merge, we need to reposition request
         */
        if (type == ELEVATOR_FRONT_MERGE) {
-               elv_rb_del(RQ_RB_ROOT(dd, req), req);
+               elv_rb_del(deadline_rb_root(dd, req), req);
                deadline_add_rq_rb(dd, req);
        }
 }
@@ -212,7 +212,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
        dd->next_rq[WRITE] = NULL;
        dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-       dd->last_sector = rq->sector + rq->nr_sectors;
+       dd->last_sector = rq_end_sector(rq);
 
        /*
         * take it off the sort and fifo list, move
@@ -222,7 +222,7 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
 }
 
 /*
- * deadline_check_fifo returns 0 if there are no expired reads on the fifo,
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
  * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
  */
 static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
@@ -258,17 +258,9 @@ static int deadline_dispatch_requests(struct request_queue *q, int force)
        else
                rq = dd->next_rq[READ];
 
-       if (rq) {
-               /* we have a "next request" */
-               
-               if (dd->last_sector != rq->sector)
-                       /* end the batch on a non sequential request */
-                       dd->batching += dd->fifo_batch;
-               
-               if (dd->batching < dd->fifo_batch)
-                       /* we are still entitled to batch */
-                       goto dispatch_request;
-       }
+       if (rq && dd->batching < dd->fifo_batch)
+               /* we have a next request and are still entitled to batch */
+               goto dispatch_request;
 
        /*
         * at this point we are not running a batch. select the appropriate
index b3c1d06..8a74eed 100644 (file)
@@ -34,8 +34,7 @@
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "blk.h"
 
@@ -77,6 +76,12 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
                return 0;
 
        /*
+        * Don't merge file system requests and discard requests
+        */
+       if (bio_discard(bio) != bio_discard(rq->bio))
+               return 0;
+
+       /*
         * different data direction or already started, don't merge
         */
        if (bio_data_dir(bio) != rq_data_dir(rq))
@@ -304,9 +309,6 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
 
        if (e->ops->elevator_activate_req_fn)
                e->ops->elevator_activate_req_fn(q, rq);
-
-       if (q->rq_timed_out_fn)
-               blk_add_timer(rq);
 }
 
 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
@@ -315,9 +317,6 @@ static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 
        if (e->ops->elevator_deactivate_req_fn)
                e->ops->elevator_deactivate_req_fn(q, rq);
-
-       if (q->rq_timed_out_fn)
-               blk_delete_timer(rq);
 }
 
 static inline void __elv_rqhash_del(struct request *rq)
@@ -446,6 +445,8 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
        list_for_each_prev(entry, &q->queue_head) {
                struct request *pos = list_entry_rq(entry);
 
+               if (blk_discard_rq(rq) != blk_discard_rq(pos))
+                       break;
                if (rq_data_dir(rq) != rq_data_dir(pos))
                        break;
                if (pos->cmd_flags & stop_flags)
@@ -615,7 +616,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
                break;
 
        case ELEVATOR_INSERT_SORT:
-               BUG_ON(!blk_fs_request(rq));
+               BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
                rq->cmd_flags |= REQ_SORTED;
                q->nr_sorted++;
                if (rq_mergeable(rq)) {
@@ -700,7 +701,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where,
                 * this request is scheduling boundary, update
                 * end_sector
                 */
-               if (blk_fs_request(rq)) {
+               if (blk_fs_request(rq) || blk_discard_rq(rq)) {
                        q->end_sector = rq_end_sector(rq);
                        q->boundary_rq = rq;
                }
@@ -772,6 +773,12 @@ struct request *elv_next_request(struct request_queue *q)
                         */
                        rq->cmd_flags |= REQ_STARTED;
                        blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+
+                       /*
+                        * We are now handing the request to the hardware,
+                        * add the timeout handler
+                        */
+                       blk_add_timer(rq);
                }
 
                if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -790,7 +797,6 @@ struct request *elv_next_request(struct request_queue *q)
                         * device can handle
                         */
                        rq->nr_phys_segments++;
-                       rq->nr_hw_segments++;
                }
 
                if (!q->prep_rq_fn)
@@ -813,7 +819,6 @@ struct request *elv_next_request(struct request_queue *q)
                                 * so that we don't add it again
                                 */
                                --rq->nr_phys_segments;
-                               --rq->nr_hw_segments;
                        }
 
                        rq = NULL;
index 546dfc0..61deb06 100644 (file)
@@ -225,10 +225,11 @@ void unlink_gendisk(struct gendisk *disk)
 
 /**
  * get_gendisk - get partitioning information for a given device
- * @dev: device to get partitioning information for
+ * @devt: device to get partitioning information for
+ * @part: returned partition index
  *
  * This function gets the structure containing partitioning
- * information for the given device @dev.
+ * information for the given device @devt.
  */
 struct gendisk *get_gendisk(dev_t devt, int *part)
 {
index 77185e5..375c579 100644 (file)
@@ -111,6 +111,69 @@ static int blkdev_reread_part(struct block_device *bdev)
        return res;
 }
 
+static void blk_ioc_discard_endio(struct bio *bio, int err)
+{
+       if (err) {
+               if (err == -EOPNOTSUPP)
+                       set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+       }
+       complete(bio->bi_private);
+}
+
+static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
+                            uint64_t len)
+{
+       struct request_queue *q = bdev_get_queue(bdev);
+       int ret = 0;
+
+       if (start & 511)
+               return -EINVAL;
+       if (len & 511)
+               return -EINVAL;
+       start >>= 9;
+       len >>= 9;
+
+       if (start + len > (bdev->bd_inode->i_size >> 9))
+               return -EINVAL;
+
+       if (!q->prepare_discard_fn)
+               return -EOPNOTSUPP;
+
+       while (len && !ret) {
+               DECLARE_COMPLETION_ONSTACK(wait);
+               struct bio *bio;
+
+               bio = bio_alloc(GFP_KERNEL, 0);
+               if (!bio)
+                       return -ENOMEM;
+
+               bio->bi_end_io = blk_ioc_discard_endio;
+               bio->bi_bdev = bdev;
+               bio->bi_private = &wait;
+               bio->bi_sector = start;
+
+               if (len > q->max_hw_sectors) {
+                       bio->bi_size = q->max_hw_sectors << 9;
+                       len -= q->max_hw_sectors;
+                       start += q->max_hw_sectors;
+               } else {
+                       bio->bi_size = len << 9;
+                       len = 0;
+               }
+               submit_bio(DISCARD_NOBARRIER, bio);
+
+               wait_for_completion(&wait);
+
+               if (bio_flagged(bio, BIO_EOPNOTSUPP))
+                       ret = -EOPNOTSUPP;
+               else if (!bio_flagged(bio, BIO_UPTODATE))
+                       ret = -EIO;
+               bio_put(bio);
+       }
+       return ret;
+}
+
 static int put_ushort(unsigned long arg, unsigned short val)
 {
        return put_user(val, (unsigned short __user *)arg);
@@ -258,6 +321,19 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
                set_device_ro(bdev, n);
                unlock_kernel();
                return 0;
+
+       case BLKDISCARD: {
+               uint64_t range[2];
+
+               if (!(file->f_mode & FMODE_WRITE))
+                       return -EBADF;
+
+               if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+                       return -EFAULT;
+
+               return blk_ioctl_discard(bdev, range[0], range[1]);
+       }
+
        case HDIO_GETGEO: {
                struct hd_geometry geo;
 
index d797e20..4b0d6c7 100644 (file)
@@ -199,7 +199,8 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
                if (blk_fs_request(req)) {
                        if (ps3disk_submit_request_sg(dev, req))
                                break;
-               } else if (req->cmd_type == REQ_TYPE_FLUSH) {
+               } else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+                          req->cmd[0] == REQ_LB_OP_FLUSH) {
                        if (ps3disk_submit_flush_request(dev, req))
                                break;
                } else {
@@ -257,7 +258,8 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
                return IRQ_HANDLED;
        }
 
-       if (req->cmd_type == REQ_TYPE_FLUSH) {
+       if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+           req->cmd[0] == REQ_LB_OP_FLUSH) {
                read = 0;
                num_sectors = req->hard_cur_sectors;
                op = "flush";
@@ -405,7 +407,8 @@ static void ps3disk_prepare_flush(struct request_queue *q, struct request *req)
 
        dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
-       req->cmd_type = REQ_TYPE_FLUSH;
+       req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+       req->cmd[0] = REQ_LB_OP_FLUSH;
 }
 
 static unsigned long ps3disk_mask;
index 4225109..879506a 100644 (file)
@@ -84,11 +84,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
        if (blk_fs_request(vbr->req)) {
                vbr->out_hdr.type = 0;
                vbr->out_hdr.sector = vbr->req->sector;
-               vbr->out_hdr.ioprio = vbr->req->ioprio;
+               vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
        } else if (blk_pc_request(vbr->req)) {
                vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
                vbr->out_hdr.sector = 0;
-               vbr->out_hdr.ioprio = vbr->req->ioprio;
+               vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
        } else {
                /* We don't put anything else in the queue. */
                BUG();
index b262c00..649da8a 100644 (file)
@@ -1045,6 +1045,12 @@ static int populate_table(struct dm_table *table,
                next = spec->next;
        }
 
+       r = dm_table_set_type(table);
+       if (r) {
+               DMWARN("unable to set table type");
+               return r;
+       }
+
        return dm_table_complete(table);
 }
 
@@ -1069,6 +1075,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
                goto out;
        }
 
+       r = dm_init_md_mempool(md, dm_table_get_type(t));
+       if (r) {
+               DMWARN("unable to initialize the md mempools for this table");
+               dm_table_put(t);
+               goto out;
+       }
+
        down_write(&_hash_lock);
        hc = dm_get_mdptr(md);
        if (!hc || hc->md != md) {
index 490ddcf..5cf4f79 100644 (file)
@@ -7,8 +7,6 @@
 
 #include "dm.h"
 #include "dm-path-selector.h"
-#include "dm-bio-list.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"
 
 #include <linux/ctype.h>
@@ -30,6 +28,7 @@ struct pgpath {
        struct list_head list;
 
        struct priority_group *pg;      /* Owning PG */
+       unsigned is_active;             /* Path status */
        unsigned fail_count;            /* Cumulative failure count */
 
        struct dm_path path;
@@ -83,8 +82,7 @@ struct multipath {
        unsigned pg_init_count;         /* Number of times pg_init called */
 
        struct work_struct process_queued_ios;
-       struct bio_list queued_bios;
-       struct list_head queued_reqs;
+       struct list_head queued_ios;
        unsigned queue_size;
 
        struct work_struct trigger_event;
@@ -101,7 +99,6 @@ struct multipath {
  */
 struct dm_mpath_io {
        struct pgpath *pgpath;
-       struct dm_bio_details details;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -126,7 +123,7 @@ static struct pgpath *alloc_pgpath(void)
        struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
 
        if (pgpath) {
-               pgpath->path.is_active = 1;
+               pgpath->is_active = 1;
                INIT_WORK(&pgpath->deactivate_path, deactivate_path);
        }
 
@@ -198,7 +195,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (m) {
                INIT_LIST_HEAD(&m->priority_groups);
-               INIT_LIST_HEAD(&m->queued_reqs);
+               INIT_LIST_HEAD(&m->queued_ios);
                spin_lock_init(&m->lock);
                m->queue_io = 1;
                INIT_WORK(&m->process_queued_ios, process_queued_ios);
@@ -323,55 +320,12 @@ static int __must_push_back(struct multipath *m)
                dm_noflush_suspending(m->ti));
 }
 
-static int map_bio(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
                  struct dm_mpath_io *mpio, unsigned was_queued)
 {
        int r = DM_MAPIO_REMAPPED;
        unsigned long flags;
        struct pgpath *pgpath;
-
-       spin_lock_irqsave(&m->lock, flags);
-
-       /* Do we need to select a new pgpath? */
-       if (!m->current_pgpath ||
-           (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
-               __choose_pgpath(m);
-
-       pgpath = m->current_pgpath;
-
-       if (was_queued)
-               m->queue_size--;
-
-       if ((pgpath && m->queue_io) ||
-           (!pgpath && m->queue_if_no_path)) {
-               /* Queue for the daemon to resubmit */
-               bio_list_add(&m->queued_bios, bio);
-               m->queue_size++;
-               if ((m->pg_init_required && !m->pg_init_in_progress) ||
-                   !m->queue_io)
-                       queue_work(kmultipathd, &m->process_queued_ios);
-               pgpath = NULL;
-               r = DM_MAPIO_SUBMITTED;
-       } else if (pgpath)
-               bio->bi_bdev = pgpath->path.dev->bdev;
-       else if (__must_push_back(m))
-               r = DM_MAPIO_REQUEUE;
-       else
-               r = -EIO;       /* Failed */
-
-       mpio->pgpath = pgpath;
-
-       spin_unlock_irqrestore(&m->lock, flags);
-
-       return r;
-}
-
-static int map_req(struct multipath *m, struct request *clone,
-                  struct dm_mpath_io *mpio, unsigned was_queued)
-{
-       int r = DM_MAPIO_REMAPPED;
-       unsigned long flags;
-       struct pgpath *pgpath;
        struct block_device *bdev;
 
        spin_lock_irqsave(&m->lock, flags);
@@ -389,28 +343,21 @@ static int map_req(struct multipath *m, struct request *clone,
        if ((pgpath && m->queue_io) ||
            (!pgpath && m->queue_if_no_path)) {
                /* Queue for the daemon to resubmit */
-               list_add_tail(&clone->queuelist, &m->queued_reqs);
+               list_add_tail(&clone->queuelist, &m->queued_ios);
                m->queue_size++;
                if ((m->pg_init_required && !m->pg_init_in_progress) ||
                    !m->queue_io)
                        queue_work(kmultipathd, &m->process_queued_ios);
                pgpath = NULL;
-               clone->q = NULL;
-               clone->rq_disk = NULL;
                r = DM_MAPIO_SUBMITTED;
        } else if (pgpath) {
                bdev = pgpath->path.dev->bdev;
                clone->q = bdev_get_queue(bdev);
                clone->rq_disk = bdev->bd_disk;
-       } else if (__must_push_back(m)) {
-               clone->q = NULL;
-               clone->rq_disk = NULL;
+       } else if (__must_push_back(m))
                r = DM_MAPIO_REQUEUE;
-       } else {
-               clone->q = NULL;
-               clone->rq_disk = NULL;
+       else
                r = -EIO;       /* Failed */
-       }
 
        mpio->pgpath = pgpath;
 
@@ -446,38 +393,7 @@ static int queue_if_no_path(struct multipath *m, unsigned queue_if_no_path,
  * The multipath daemon is responsible for resubmitting queued ios.
  *---------------------------------------------------------------*/
 
-static void dispatch_queued_bios(struct multipath *m)
-{
-       int r;
-       unsigned long flags;
-       struct bio *bio = NULL, *next;
-       struct dm_mpath_io *mpio;
-       union map_info *info;
-
-       spin_lock_irqsave(&m->lock, flags);
-       bio = bio_list_get(&m->queued_bios);
-       spin_unlock_irqrestore(&m->lock, flags);
-
-       while (bio) {
-               next = bio->bi_next;
-               bio->bi_next = NULL;
-
-               info = dm_get_mapinfo(bio);
-               mpio = info->ptr;
-
-               r = map_bio(m, bio, mpio, 1);
-               if (r < 0)
-                       bio_endio(bio, r);
-               else if (r == DM_MAPIO_REMAPPED)
-                       generic_make_request(bio);
-               else if (r == DM_MAPIO_REQUEUE)
-                       bio_endio(bio, -EIO);
-
-               bio = next;
-       }
-}
-
-static void dispatch_queued_reqs(struct multipath *m)
+static void dispatch_queued_ios(struct multipath *m)
 {
        int r;
        unsigned long flags;
@@ -487,7 +403,7 @@ static void dispatch_queued_reqs(struct multipath *m)
        LIST_HEAD(cl);
 
        spin_lock_irqsave(&m->lock, flags);
-       list_splice_init(&m->queued_reqs, &cl);
+       list_splice_init(&m->queued_ios, &cl);
        spin_unlock_irqrestore(&m->lock, flags);
 
        list_for_each_entry_safe(clone, n, &cl, queuelist) {
@@ -496,14 +412,16 @@ static void dispatch_queued_reqs(struct multipath *m)
                info = dm_get_rq_mapinfo(clone);
                mpio = info->ptr;
 
-               r = map_req(m, clone, mpio, 1);
-               if (r < 0 || r == DM_MAPIO_REQUEUE) {
+               r = map_io(m, clone, mpio, 1);
+               if (r < 0) {
                        mempool_free(mpio, m->mpio_pool);
-                       if (r == DM_MAPIO_REQUEUE)
-                               r = DM_ENDIO_REQUEUE;
-                       dm_end_request(clone, r);
+                       dm_kill_request(clone, r);
                } else if (r == DM_MAPIO_REMAPPED)
                        dm_dispatch_request(clone);
+               else if (r == DM_MAPIO_REQUEUE) {
+                       mempool_free(mpio, m->mpio_pool);
+                       dm_requeue_request(clone);
+               }
        }
 }
 
@@ -543,12 +461,8 @@ out:
        if (init_required)
                queue_work(kmpath_handlerd, &m->activate_path);
 
-       if (!must_queue) {
-               if (dm_table_request_based(m->ti->table))
-                       dispatch_queued_reqs(m);
-               else
-                       dispatch_queued_bios(m);
-       }
+       if (!must_queue)
+               dispatch_queued_ios(m);
 }
 
 /*
@@ -806,7 +720,7 @@ static int parse_features(struct arg_set *as, struct multipath *m)
        const char *param_name;
 
        static struct param _params[] = {
-               {0, 4, "invalid number of feature args"},
+               {0, 3, "invalid number of feature args"},
                {1, 50, "pg_init_retries must be between 1 and 50"},
        };
 
@@ -834,11 +748,6 @@ static int parse_features(struct arg_set *as, struct multipath *m)
                        continue;
                }
 
-               if (!strnicmp(param_name, MESG_STR("rq_based"))) {
-                       dm_table_set_request_based(ti->table);
-                       continue;
-               }
-
                ti->error = "Unrecognised multipath feature request";
                r = -EINVAL;
        } while (argc && !r);
@@ -929,27 +838,8 @@ static void multipath_dtr(struct dm_target *ti)
 /*
  * Map cloned requests
  */
-static int multipath_map_bio(struct dm_target *ti, struct bio *bio,
-                            union map_info *map_context)
-{
-       int r;
-       struct dm_mpath_io *mpio;
-       struct multipath *m = (struct multipath *) ti->private;
-
-       mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-       dm_bio_record(&mpio->details, bio);
-
-       map_context->ptr = mpio;
-       bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
-       r = map_bio(m, bio, mpio, 0);
-       if (r < 0 || r == DM_MAPIO_REQUEUE)
-               mempool_free(mpio, m->mpio_pool);
-
-       return r;
-}
-
-static int multipath_map_req(struct dm_target *ti, struct request *clone,
-                            union map_info *map_context)
+static int multipath_map(struct dm_target *ti, struct request *clone,
+                        union map_info *map_context)
 {
        int r;
        struct dm_mpath_io *mpio;
@@ -963,8 +853,7 @@ static int multipath_map_req(struct dm_target *ti, struct request *clone,
 
        map_context->ptr = mpio;
        clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-
-       r = map_req(m, clone, mpio, 0);
+       r = map_io(m, clone, mpio, 0);
        if (r < 0 || r == DM_MAPIO_REQUEUE)
                mempool_free(mpio, m->mpio_pool);
 
@@ -981,13 +870,13 @@ static int fail_path(struct pgpath *pgpath)
 
        spin_lock_irqsave(&m->lock, flags);
 
-       if (!pgpath->path.is_active)
+       if (!pgpath->is_active)
                goto out;
 
        DMWARN("Failing path %s.", pgpath->path.dev->name);
 
        pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
-       pgpath->path.is_active = 0;
+       pgpath->is_active = 0;
        pgpath->fail_count++;
 
        m->nr_valid_paths--;
@@ -1018,7 +907,7 @@ static int reinstate_path(struct pgpath *pgpath)
 
        spin_lock_irqsave(&m->lock, flags);
 
-       if (pgpath->path.is_active)
+       if (pgpath->is_active)
                goto out;
 
        if (!pgpath->pg->ps.type->reinstate_path) {
@@ -1032,7 +921,7 @@ static int reinstate_path(struct pgpath *pgpath)
        if (r)
                goto out;
 
-       pgpath->path.is_active = 1;
+       pgpath->is_active = 1;
 
        m->current_pgpath = NULL;
        if (!m->nr_valid_paths++ && m->queue_size)
@@ -1246,104 +1135,41 @@ static void activate_path(struct work_struct *work)
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
                     int error, struct dm_mpath_io *mpio)
 {
+       /*
+        * We don't queue any clone request inside the multipath target
+        * during end I/O handling, since those clone requests don't have
+        * bio clones.  If we queue them inside the multipath target,
+        * we need to make bio clones, which requires memory allocation.
+        * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+        *  don't have bio clones.)
+        * Instead of queueing the clone request here, we queue the original
+        * request into dm core, which will remake a clone request and
+        * clone bios for it and resubmit it later.
+        */
+       int r = DM_ENDIO_REQUEUE;
        unsigned long flags;
 
-       if (!error)
+       if (!error && !clone->errors)
                return 0;       /* I/O complete */
 
-       if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-               return error;
-
        if (error == -EOPNOTSUPP)
                return error;
 
-       spin_lock_irqsave(&m->lock, flags);
-       if (!m->nr_valid_paths) {
-               if (__must_push_back(m)) {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       return DM_ENDIO_REQUEUE;
-               } else if (!m->queue_if_no_path) {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       return -EIO;
-               } else {
-                       spin_unlock_irqrestore(&m->lock, flags);
-                       goto requeue;
-               }
-       }
-       spin_unlock_irqrestore(&m->lock, flags);
-
        if (mpio->pgpath)
                fail_path(mpio->pgpath);
 
-requeue:
-       dm_bio_restore(&mpio->details, bio);
-
-       /* queue for the daemon to resubmit or fail */
        spin_lock_irqsave(&m->lock, flags);
-       bio_list_add(&m->queued_bios, bio);
-       m->queue_size++;
-       if (!m->queue_io)
-               queue_work(kmultipathd, &m->process_queued_ios);
-       spin_unlock_irqrestore(&m->lock, flags);
-
-       return DM_ENDIO_INCOMPLETE;     /* io not complete */
-}
-
-static int do_end_req(struct multipath *m, struct request *clone,
-                    int error, struct dm_mpath_io *mpio)
-{
-       unsigned long flags;
-       int r;
-
-       if (!error && !clone->errors)
-               return 0;       /* I/O complete */
-
-       if (error == -EOPNOTSUPP)
-               return error;
-
-       spin_lock_irqsave(&m->lock, flags);
-       if (!m->nr_valid_paths) {
-               if (__must_push_back(m) || m->queue_if_no_path)
-                       r = DM_ENDIO_REQUEUE;
-               else
-                       r = -EIO;
-
-               spin_unlock_irqrestore(&m->lock, flags);
-               return r;
-       }
+       if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+               r = -EIO;
        spin_unlock_irqrestore(&m->lock, flags);
 
-       if (mpio->pgpath)
-               fail_path(mpio->pgpath);
-
-       return DM_ENDIO_REQUEUE;
-}
-
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
-                           int error, union map_info *map_context)
-{
-       struct multipath *m = ti->private;
-       struct dm_mpath_io *mpio = map_context->ptr;
-       struct pgpath *pgpath = mpio->pgpath;
-       struct path_selector *ps;
-       int r;
-
-       r  = do_end_io(m, bio, error, mpio);
-       if (pgpath) {
-               ps = &pgpath->pg->ps;
-               if (ps->type->end_io)
-                       ps->type->end_io(ps, &pgpath->path);
-       }
-       if (r != DM_ENDIO_INCOMPLETE)
-               mempool_free(mpio, m->mpio_pool);
-
        return r;
 }
 
-static int multipath_end_req(struct dm_target *ti, struct request *clone,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
                            int error, union map_info *map_context)
 {
        struct multipath *m = ti->private;
@@ -1352,7 +1178,7 @@ static int multipath_end_req(struct dm_target *ti, struct request *clone,
        struct path_selector *ps;
        int r;
 
-       r  = do_end_req(m, clone, error, mpio);
+       r  = do_end_io(m, clone, error, mpio);
        if (pgpath) {
                ps = &pgpath->pg->ps;
                if (ps->type->end_io)
@@ -1422,16 +1248,12 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
        if (type == STATUSTYPE_INFO)
                DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
        else {
-               int rq_based = dm_table_request_based(ti->table);
-
-               DMEMIT("%u ", m->queue_if_no_path + rq_based +
+               DMEMIT("%u ", m->queue_if_no_path +
                              (m->pg_init_retries > 0) * 2);
                if (m->queue_if_no_path)
                        DMEMIT("queue_if_no_path ");
                if (m->pg_init_retries)
                        DMEMIT("pg_init_retries %u ", m->pg_init_retries);
-               if (rq_based)
-                       DMEMIT("rq_based ");
        }
 
        if (!m->hw_handler_name || type == STATUSTYPE_INFO)
@@ -1474,7 +1296,7 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
 
                        list_for_each_entry(p, &pg->pgpaths, list) {
                                DMEMIT("%s %s %u ", p->path.dev->name,
-                                      p->path.is_active ? "A" : "F",
+                                      p->is_active ? "A" : "F",
                                       p->fail_count);
                                if (pg->ps.type->status)
                                        sz += pg->ps.type->status(&pg->ps,
@@ -1598,52 +1420,73 @@ static int multipath_ioctl(struct dm_target *ti, struct inode *inode,
                                         bdev->bd_disk, cmd, arg);
 }
 
-static int __pgpath_congested(struct pgpath *pgpath)
+static int __pgpath_busy(struct pgpath *pgpath)
 {
        struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-       if (dm_underlying_device_congested(q))
-               return 1;
-
-       return 0;
+       return dm_underlying_device_busy(q);
 }
 
-static int multipath_congested(struct dm_target *ti)
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy".  Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
 {
-       int congested = 0;
+       int busy = 0, has_active = 0;
        struct multipath *m = (struct multipath *) ti->private;
+       struct priority_group *pg;
+       struct pgpath *pgpath;
        unsigned long flags;
 
        spin_lock_irqsave(&m->lock, flags);
 
-       if (m->current_pgpath && m->repeat_count > 1) {
-               /* m->current_pgpath is surely used at next mapping time. */
-               if (__pgpath_congested(m->current_pgpath))
-                       congested = 1;
-
+       /* Guess which priority_group will be used at next mapping time */
+       if (unlikely(!m->current_pgpath && m->next_pg))
+               pg = m->next_pg;
+       else if (likely(m->current_pg))
+               pg = m->current_pg;
+       else
+               /*
+                * We don't know which pg will be used at next mapping time.
+                * We don't call __choose_pgpath() here to avoid triggering
+                * pg_init just by busy checking.
+                * So we don't know whether underlying devices we will be using
+                * at next mapping time are busy or not. Just try mapping.
+                */
                goto out;
-       }
 
        /*
-        * We are here means that path selection will be executed
-        * at next mapping time.
-        * We run the path selection here and check congestion status
-        * of the next path.
-        * And increment repeat_count to avoid path selection again
-        * in map_io().
+        * If there is at least one non-busy active path, the path selector
+        * will be able to select it. So we consider such a pg as not busy.
         */
-       __choose_pgpath(m);
-       if (m->current_pgpath) {
-               if (__pgpath_congested(m->current_pgpath))
-                       congested = 1;
+       busy = 1;
+       list_for_each_entry(pgpath, &pg->pgpaths, list)
+               if (pgpath->is_active) {
+                       has_active = 1;
+
+                       if (!__pgpath_busy(pgpath)) {
+                               busy = 0;
+                               break;
+                       }
+               }
 
-               m->repeat_count++;
-       }
+       if (!has_active)
+               /*
+                * No active path in this pg, so this pg won't be used and
+                * the current_pg will be changed at next mapping time.
+                * We need to try mapping to determine it.
+                */
+               busy = 0;
 
 out:
        spin_unlock_irqrestore(&m->lock, flags);
 
-       return congested;
+       return busy;
 }
 
 /*-----------------------------------------------------------------
@@ -1655,16 +1498,14 @@ static struct target_type multipath_target = {
        .module = THIS_MODULE,
        .ctr = multipath_ctr,
        .dtr = multipath_dtr,
-       .map = multipath_map_bio,
-       .end_io = multipath_end_io,
-       .map_rq = multipath_map_req,
-       .rq_end_io = multipath_end_req,
+       .map_rq = multipath_map,
+       .rq_end_io = multipath_end_io,
        .presuspend = multipath_presuspend,
        .resume = multipath_resume,
        .status = multipath_status,
        .message = multipath_message,
        .ioctl  = multipath_ioctl,
-       .congested = multipath_congested,
+       .busy = multipath_busy,
 };
 
 static int __init dm_multipath_init(void)
index c198b85..e230f71 100644 (file)
@@ -13,8 +13,6 @@ struct dm_dev;
 
 struct dm_path {
        struct dm_dev *dev;     /* Read-only */
-       unsigned is_active;     /* Read-only */
-
        void *pscontext;        /* For path-selector use */
 };
 
index e0d7f83..49ab737 100644 (file)
@@ -442,12 +442,6 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
                        return r;
        }
 
-       r = dm_init_md(t->md);
-       if (r) {
-               DMWARN("Cannot initialize device %s, error %d", path, r);
-               return r;
-       }
-
        dd = find_device(&t->devices, dev);
        if (!dd) {
                dd = kmalloc(sizeof(*dd), GFP_KERNEL);
@@ -534,8 +528,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 
        rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 
-       if (!q->request_fn)
-               rs->no_request_stacking = 1;
+       rs->no_request_stacking |= !blk_queue_stackable(q);
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
@@ -671,12 +664,8 @@ int dm_split_args(int *argc, char ***argvp, char *input)
        return 0;
 }
 
-static int check_for_valid_limits(struct io_restrictions *rs,
-                                 struct mapped_device *md)
+static void check_for_valid_limits(struct io_restrictions *rs)
 {
-       int r = 0;
-       struct dm_table *t;
-
        if (!rs->max_sectors)
                rs->max_sectors = SAFE_MAX_SECTORS;
        if (!rs->max_hw_sectors)
@@ -693,39 +682,6 @@ static int check_for_valid_limits(struct io_restrictions *rs,
                rs->seg_boundary_mask = -1;
        if (!rs->bounce_pfn)
                rs->bounce_pfn = -1;
-
-       if (!dm_request_based(md))
-               return 0;
-
-       /* Allows to load only request stackable tables */
-       if (rs->no_request_stacking) {
-               DMERR("table load rejected: including non-request-stackable "
-                     "devices");
-               return -EINVAL;
-       }
-
-       t = dm_get_table(md);
-
-       /* Initial table loading must be allowed */
-       if (!t)
-               return 0;
-
-       if ((rs->max_sectors < t->limits.max_sectors) ||
-           (rs->max_hw_sectors < t->limits.max_hw_sectors) ||
-           (rs->max_phys_segments < t->limits.max_phys_segments) ||
-           (rs->max_hw_segments < t->limits.max_hw_segments) ||
-           (rs->hardsect_size > t->limits.hardsect_size) ||
-           (rs->max_segment_size < t->limits.max_segment_size) ||
-           (rs->seg_boundary_mask < t->limits.seg_boundary_mask) ||
-           (rs->bounce_pfn < t->limits.bounce_pfn) ||
-           (rs->no_cluster && !t->limits.no_cluster)) {
-               DMERR("table load rejected: shrinking current restriction");
-               r = -EINVAL;
-       }
-
-       dm_table_put(t);
-
-       return r;
 }
 
 int dm_table_add_target(struct dm_table *t, const char *type,
@@ -791,14 +747,64 @@ int dm_table_add_target(struct dm_table *t, const char *type,
        return r;
 }
 
-void dm_table_set_request_based(struct dm_table *t)
+int dm_table_set_type(struct dm_table *t)
+{
+       int i;
+       int bio_based = 0, request_based = 0;
+       struct dm_target *tgt;
+
+       for (i = 0; i < t->num_targets; i++) {
+               tgt = t->targets + i;
+               if (tgt->type->map_rq)
+                       request_based = 1;
+               else
+                       bio_based = 1;
+
+               if (bio_based && request_based) {
+                       DMWARN("Inconsistent table: different target types"
+                              " can't be mixed up");
+                       return -EINVAL;
+               }
+       }
+
+       if (bio_based) {
+               /* We must use this table as bio-based */
+               t->limits.no_request_stacking = 1;
+               return 0;
+       }
+
+       BUG_ON(!request_based); /* No targets in this table */
+
+       /* Non-request-stackable devices can't be used for request-based dm */
+       if (t->limits.no_request_stacking) {
+               DMWARN("table load rejected: including non-request-stackable"
+                      " devices");
+               return -EINVAL;
+       }
+
+       /*
+        * Request-based dm supports only tables that have a single target now.
+        * To support multiple targets, request splitting support is needed,
+        * and that needs lots of changes in the block-layer.
+        * (e.g. request completion process for partial completion.)
+        */
+       if (t->num_targets > 1) {
+               DMWARN("Request-based dm doesn't support multiple targets yet");
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int dm_table_get_type(struct dm_table *t)
 {
-       dm_set_request_based(t->md);
+       return t->limits.no_request_stacking ?
+               DM_TYPE_BIO_BASED : DM_TYPE_REQUEST_BASED;
 }
 
 int dm_table_request_based(struct dm_table *t)
 {
-       return dm_request_based(t->md);
+       return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
 }
 
 static int setup_indexes(struct dm_table *t)
@@ -835,9 +841,7 @@ int dm_table_complete(struct dm_table *t)
        int r = 0;
        unsigned int leaf_nodes;
 
-       r = check_for_valid_limits(&t->limits, t->md);
-       if (r)
-               return r;
+       check_for_valid_limits(&t->limits);
 
        /* how many indexes will the btree have ? */
        leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
@@ -933,6 +937,10 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
        else
                queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
 
+       if (t->limits.no_request_stacking)
+               queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, q);
+       else
+               queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1021,6 +1029,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
        return r;
 }
 
+int dm_table_any_busy_target(struct dm_table *t)
+{
+       int i;
+       struct dm_target *ti;
+
+       for (i = 0; i < t->num_targets; i++) {
+               ti = t->targets + i;
+               if (ti->type->busy && ti->type->busy(ti))
+                       return 1;
+       }
+
+       return 0;
+}
+
 void dm_table_unplug_all(struct dm_table *t)
 {
        struct dm_dev *dd;
@@ -1062,5 +1084,3 @@ EXPORT_SYMBOL(dm_table_get);
 EXPORT_SYMBOL(dm_table_unplug_all);
 EXPORT_SYMBOL(dm_table_barrier_ok);
 EXPORT_SYMBOL(dm_table_support_barrier);
-EXPORT_SYMBOL(dm_table_request_based);
-EXPORT_SYMBOL(dm_table_set_request_based);
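
dm_table_set_type() above enforces two rules: every target in a table must be of the same kind (request-based if the target type provides map_rq, bio-based otherwise), and a request-based table may contain only a single target. Below is a minimal user-space sketch of just that decision rule; struct fake_target, check_table_type() and the sample targets are hypothetical stand-ins, not kernel code, and the rejection path simply stands in for the -EINVAL returns.

#include <stdio.h>

/* Hypothetical stand-in for a dm target: request-based iff it has map_rq. */
struct fake_target {
	const char *name;
	int has_map_rq;
};

/*
 * Mirrors the rule in dm_table_set_type(): reject mixed tables and
 * request-based tables with more than one target.  Returns 0 on success,
 * -1 on rejection (where the kernel code returns -EINVAL).
 */
static int check_table_type(const struct fake_target *tgts, int num)
{
	int bio_based = 0, request_based = 0, i;

	for (i = 0; i < num; i++) {
		if (tgts[i].has_map_rq)
			request_based = 1;
		else
			bio_based = 1;
		if (bio_based && request_based) {
			printf("rejected: mixed target types\n");
			return -1;
		}
	}
	if (request_based && num > 1) {
		printf("rejected: request-based table with %d targets\n", num);
		return -1;
	}
	printf("accepted: %s table\n",
	       request_based ? "request-based" : "bio-based");
	return 0;
}

int main(void)
{
	struct fake_target mpath  = { "multipath", 1 };
	struct fake_target linear = { "linear", 0 };
	struct fake_target mixed[2] = { { "linear", 0 }, { "multipath", 1 } };

	check_table_type(&mpath, 1);   /* accepted: request-based */
	check_table_type(&linear, 1);  /* accepted: bio-based */
	check_table_type(mixed, 2);    /* rejected: mixed */
	return 0;
}
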
index a311fc3..4004e46 100644 (file)
@@ -32,6 +32,7 @@ static unsigned int _major = 0;
 
 static DEFINE_SPINLOCK(_minor_lock);
 /*
+ * For bio based dm.
  * One of these is allocated per bio.
  */
 struct dm_io {
@@ -43,6 +44,7 @@ struct dm_io {
 };
 
 /*
+ * For bio based dm.
  * One of these is allocated per target within a bio.  Hopefully
  * this will be simplified out one day.
  */
@@ -68,6 +70,10 @@ struct dm_rq_target_io {
        union map_info info;
 };
 
+/*
+ * For request based dm.
+ * One of these is allocated per bio.
+ */
 struct dm_clone_bio_info {
        struct bio *orig;
        struct request *rq;
@@ -99,16 +105,12 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 #define DMF_FREEING 3
 #define DMF_DELETING 4
 #define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_REQUEST_BASED 6
-#define DMF_BIO_BASED 7
-#define DMF_INITIALIZED 8
 
 /*
  * Work processed by per-device workqueue.
  */
 struct dm_wq_req {
        enum {
-               DM_WQ_FLUSH_ALL,
                DM_WQ_FLUSH_DEFERRED,
        } type;
        struct work_struct work;
@@ -158,6 +160,8 @@ struct mapped_device {
 
        struct bio_set *bs;
 
+       unsigned int mempool_type; /* Type of mempools above. */
+
        /*
         * Event handling.
         */
@@ -176,6 +180,9 @@ struct mapped_device {
        /* forced geometry settings */
        struct hd_geometry geometry;
 
+       /* marker of flush suspend for request-based dm */
+       struct request suspend_rq;
+
        /* For saving the address of __make_request for request based dm */
        make_request_fn *saved_make_request_fn;
 };
@@ -644,16 +651,16 @@ static void clone_endio(struct bio *bio, int error)
 /*
  * Partial completion handling for request-based dm
  */
-static void end_clone_bio(struct bio *bio, int error)
+static void end_clone_bio(struct bio *clone, int error)
 {
-       struct dm_clone_bio_info *info = bio->bi_private;
+       struct dm_clone_bio_info *info = clone->bi_private;
        struct dm_rq_target_io *tio = info->rq->end_io_data;
-       struct bio *orig_bio = info->orig;
+       struct bio *bio = info->orig;
        unsigned int nr_bytes = info->orig->bi_size;
 
        free_bio_info(tio->md, info);
-       bio->bi_private = tio->md->bs;
-       bio_put(bio);
+       clone->bi_private = tio->md->bs;
+       bio_put(clone);
 
        if (tio->error) {
                /*
@@ -682,8 +689,8 @@ static void end_clone_bio(struct bio *bio, int error)
         * So the completing bio should always be rq->bio.
         * If it's not, something wrong is happening.
         */
-       if (tio->orig->bio != orig_bio)
-               DMWARN("bio completion is going in the middle of the request");
+       if (tio->orig->bio != bio)
+               DMERR("bio completion is going in the middle of the request");
 
        /*
         * Update the original request.
@@ -718,95 +725,139 @@ static void dec_rq_pending(struct dm_rq_target_io *tio)
                wake_up(&tio->md->wait);
 }
 
-static void __requeue_request(struct request_queue *q, struct request *rq)
+static void dm_unprep_request(struct request *rq)
 {
+       struct request *clone = rq->special;
+       struct dm_rq_target_io *tio = clone->end_io_data;
+
+       rq->special = NULL;
+       rq->cmd_flags &= ~REQ_DONTPREP;
+
+       free_bio_clone(clone);
+       dec_rq_pending(tio);
+       free_rq_tio(tio->md, tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+void dm_requeue_request(struct request *clone)
+{
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       struct request *rq = tio->orig;
+       struct request_queue *q = rq->q;
+       unsigned long flags;
+
+       dm_unprep_request(rq);
+
+       spin_lock_irqsave(q->queue_lock, flags);
        if (elv_queue_empty(q))
                blk_plug_device(q);
        blk_requeue_request(q, rq);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_request);
+
+static inline void __stop_queue(struct request_queue *q)
+{
+       blk_stop_queue(q);
+}
+
+static void stop_queue(struct request_queue *q)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(q->queue_lock, flags);
+       __stop_queue(q);
+       spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static inline void __start_queue(struct request_queue *q)
+{
+       if (blk_queue_stopped(q))
+               blk_start_queue(q);
 }
 
-static void requeue_request(struct request_queue *q, struct request *rq)
+static void start_queue(struct request_queue *q)
 {
        unsigned long flags;
 
        spin_lock_irqsave(q->queue_lock, flags);
-       __requeue_request(q, rq);
+       __start_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /*
  * Complete the clone and the original request
  */
-void dm_end_request(struct request *clone, int error)
+static void dm_end_request(struct request *clone, int error)
 {
        struct dm_rq_target_io *tio = clone->end_io_data;
-       struct request *orig = tio->orig;
-       struct request_queue *q_orig = orig->q;
-       unsigned int nr_bytes = blk_rq_bytes(orig);
-
-       if (error == DM_ENDIO_REQUEUE) {
-               /*
-                * Requeue the original request of the clone.
-                * Don't invoke blk_run_queue() so that the requeued request
-                * won't be dispatched again soon.
-                */
-               free_bio_clone(clone);
-               dec_rq_pending(tio);
-               free_rq_tio(tio->md, tio);
+       struct request *rq = tio->orig;
+       struct request_queue *q = rq->q;
+       unsigned int nr_bytes = blk_rq_bytes(rq);
 
-               requeue_request(q_orig, orig);
-               return;
-       }
+       if (blk_pc_request(rq)) {
+               rq->errors = clone->errors;
+               rq->data_len = clone->data_len;
 
-       if (blk_pc_request(orig)) {
-               orig->errors = clone->errors;
-               orig->data_len = clone->data_len;
-
-               if (orig->sense)
+               if (rq->sense)
                        /*
                         * We are using the sense buffer of the original
                         * request.
                         * So setting the length of the sense data is enough.
                         */
-                       orig->sense_len = clone->sense_len;
+                       rq->sense_len = clone->sense_len;
        }
 
        free_bio_clone(clone);
        dec_rq_pending(tio);
        free_rq_tio(tio->md, tio);
 
-       if (unlikely(blk_end_request(orig, error, nr_bytes)))
+       if (unlikely(blk_end_request(rq, error, nr_bytes)))
                BUG();
 
-       blk_run_queue(q_orig);
+       blk_run_queue(q);
 }
-EXPORT_SYMBOL_GPL(dm_end_request);
 
 /*
  * Request completion handler for request-based dm
  */
-static void dm_softirq_done(struct request *orig)
+static void dm_softirq_done(struct request *rq)
 {
-       struct request *clone = orig->completion_data;
+       struct request *clone = rq->completion_data;
        struct dm_rq_target_io *tio = clone->end_io_data;
        dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
-       int error = tio->error, r;
+       int error = tio->error;
+       int r;
+
+       if (rq->cmd_flags & REQ_FAILED)
+               goto end_request;
 
        if (rq_end_io) {
                r = rq_end_io(tio->ti, clone, error, &tio->info);
-               if (r <= 0 || r == DM_ENDIO_REQUEUE)
-                       /* The target wants to complete or requeue the I/O */
+               if (r <= 0)
+                       /* The target wants to complete the I/O */
                        error = r;
                else if (r == DM_ENDIO_INCOMPLETE)
                        /* The target will handle the I/O */
                        return;
-               else {
+               else if (r == DM_ENDIO_REQUEUE) {
+                       /*
+                        * The target wants to requeue the I/O.
+                        * Don't invoke blk_run_queue() so that the requeued
+                        * request won't be dispatched again soon.
+                        */
+                       dm_requeue_request(clone);
+                       return;
+               } else {
                        DMWARN("unimplemented target endio return value: %d",
                               r);
                        BUG();
                }
        }
 
+end_request:
        dm_end_request(clone, error);
 }
 
@@ -816,7 +867,7 @@ static void dm_softirq_done(struct request *orig)
 static void end_clone_request(struct request *clone, int error)
 {
        struct dm_rq_target_io *tio = clone->end_io_data;
-       struct request *orig = tio->orig;
+       struct request *rq = tio->orig;
 
        /*
         * For just cleaning up the information of the queue in which
@@ -835,9 +886,27 @@ static void end_clone_request(struct request *clone, int error)
         *       against this queue
         */
        tio->error = error;
-       orig->completion_data = clone;
-       blk_complete_request(orig);
+       rq->completion_data = clone;
+       blk_complete_request(rq);
+}
+
+/*
+ * Complete the original request of a clone with an error status.
+ * Target's rq_end_io() function isn't called.
+ * This may be used by target's map_rq() function when the mapping fails.
+ */
+void dm_kill_request(struct request *clone, int error)
+{
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       struct request *rq = tio->orig;
+
+       tio->error = error;
+       /* Avoid printing "I/O error" message, since we didn't actually do any I/O */
+       rq->cmd_flags |= (REQ_FAILED | REQ_QUIET);
+       rq->completion_data = clone;
+       blk_complete_request(rq);
 }
+EXPORT_SYMBOL_GPL(dm_kill_request);
 
 static sector_t max_io_len(struct mapped_device *md,
                           sector_t sector, struct dm_target *ti)
@@ -1218,11 +1287,16 @@ static int dm_make_request(struct request_queue *q, struct bio *bio)
        return md->saved_make_request_fn(q, bio); /* call __make_request() */
 }
 
+static inline int dm_request_based(struct mapped_device *md)
+{
+       return blk_queue_stackable(md->queue);
+}
+
 static int dm_request(struct request_queue *q, struct bio *bio)
 {
        struct mapped_device *md = q->queuedata;
 
-       if (test_bit(DMF_REQUEST_BASED, &md->flags))
+       if (dm_request_based(md))
                return dm_make_request(q, bio);
 
        return _dm_request(q, bio);
@@ -1230,85 +1304,68 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 
 void dm_dispatch_request(struct request *rq)
 {
+       int r;
+
        rq->start_time = jiffies;
-       blk_submit_request(rq->q, rq);
+       r = blk_insert_cloned_request(rq->q, rq);
+       if (r)
+               dm_kill_request(rq, r);
 }
 EXPORT_SYMBOL_GPL(dm_dispatch_request);
 
-static void copy_request_info(struct request *clone, struct request *orig)
-{
-       INIT_LIST_HEAD(&clone->queuelist);
-       INIT_LIST_HEAD(&clone->donelist);
-       clone->q = NULL;
-       clone->cmd_flags = (rq_data_dir(orig) | REQ_NOMERGE);
-       clone->cmd_type = orig->cmd_type;
-       clone->sector = orig->sector;
-       clone->hard_sector = orig->hard_sector;
-       clone->nr_sectors = orig->nr_sectors;
-       clone->hard_nr_sectors = orig->hard_nr_sectors;
-       clone->current_nr_sectors = orig->current_nr_sectors;
-       clone->hard_cur_sectors = orig->hard_cur_sectors;
-       INIT_HLIST_NODE(&clone->hash);
-       clone->completion_data = NULL;
-       clone->elevator_private = NULL;
-       clone->elevator_private2 = NULL;
-       clone->rq_disk = NULL;
-       clone->start_time = jiffies;
-       clone->nr_phys_segments = orig->nr_phys_segments;
-       clone->nr_hw_segments = orig->nr_hw_segments;
-       clone->ioprio = orig->ioprio;
-       clone->special = NULL;
-       clone->buffer = orig->buffer;
-       clone->tag = -1;
-       clone->errors = 0;
-       clone->ref_count = 1;
-       clone->cmd_len = orig->cmd_len;
-       WARN_ON(orig->cmd != orig->__cmd);
-       clone->cmd = clone->__cmd;
-       if (orig->cmd_len) {
-               memcpy(clone->cmd, orig->cmd, sizeof(orig->cmd));
-       }
-       clone->data_len = orig->data_len;
-       clone->sense_len = orig->sense_len;
-       clone->data = orig->data;
-       clone->sense = orig->sense;
-       clone->timeout = 0;
-       clone->retries = 0;
-       clone->end_io = end_clone_request;
-       clone->next_rq = NULL;
-}
-
-static int clone_request_bios(struct request *clone, struct request *orig)
-{
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       struct mapped_device *md = tio->md;
-       struct bio *bio, *orig_bio;
+static void copy_request_info(struct request *clone, struct request *rq)
+{
+       clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE);
+       clone->cmd_type = rq->cmd_type;
+       clone->sector = rq->sector;
+       clone->hard_sector = rq->hard_sector;
+       clone->nr_sectors = rq->nr_sectors;
+       clone->hard_nr_sectors = rq->hard_nr_sectors;
+       clone->current_nr_sectors = rq->current_nr_sectors;
+       clone->hard_cur_sectors = rq->hard_cur_sectors;
+       clone->nr_phys_segments = rq->nr_phys_segments;
+       clone->ioprio = rq->ioprio;
+       clone->buffer = rq->buffer;
+       clone->cmd_len = rq->cmd_len;
+       if (rq->cmd_len)
+               clone->cmd = rq->cmd;
+       clone->data_len = rq->data_len;
+       clone->extra_len = rq->extra_len;
+       clone->sense_len = rq->sense_len;
+       clone->data = rq->data;
+       clone->sense = rq->sense;
+}
+
+static int clone_request_bios(struct request *clone, struct request *rq,
+                             struct mapped_device *md)
+{
+       struct bio *bio, *clone_bio;
        struct dm_clone_bio_info *info;
 
-       for (orig_bio = orig->bio; orig_bio; orig_bio = orig_bio->bi_next) {
+       for (bio = rq->bio; bio; bio = bio->bi_next) {
                info = alloc_bio_info(md);
                if (!info)
                        goto free_and_out;
 
-               bio = bio_alloc_bioset(GFP_ATOMIC, orig_bio->bi_max_vecs,
-                                      md->bs);
-               if (!bio) {
+               clone_bio = bio_alloc_bioset(GFP_ATOMIC, bio->bi_max_vecs,
+                                            md->bs);
+               if (!clone_bio) {
                        free_bio_info(md, info);
                        goto free_and_out;
                }
 
-               __bio_clone(bio, orig_bio);
-               bio->bi_destructor = dm_bio_destructor;
-               bio->bi_end_io = end_clone_bio;
+               __bio_clone(clone_bio, bio);
+               clone_bio->bi_destructor = dm_bio_destructor;
+               clone_bio->bi_end_io = end_clone_bio;
                info->rq = clone;
-               info->orig = orig_bio;
-               bio->bi_private = info;
+               info->orig = bio;
+               clone_bio->bi_private = info;
 
                if (clone->bio) {
-                       clone->biotail->bi_next = bio;
-                       clone->biotail = bio;
+                       clone->biotail->bi_next = clone_bio;
+                       clone->biotail = clone_bio;
                } else
-                       clone->bio = clone->biotail = bio;
+                       clone->bio = clone->biotail = clone_bio;
        }
 
        return 0;
@@ -1319,99 +1376,160 @@ free_and_out:
        return -ENOMEM;
 }
 
-static int setup_clone(struct request *clone, struct request *orig)
+static int setup_clone(struct request *clone, struct request *rq,
+                      struct dm_rq_target_io *tio)
 {
        int r;
 
-       r = clone_request_bios(clone, orig);
+       blk_rq_init(NULL, clone);
+
+       r = clone_request_bios(clone, rq, tio->md);
        if (r)
                return r;
 
-       copy_request_info(clone, orig);
+       copy_request_info(clone, rq);
+       clone->start_time = jiffies;
+       clone->end_io = end_clone_request;
+       clone->end_io_data = tio;
 
        return 0;
 }
 
-static int clone_and_map_request(struct dm_target *ti, struct request *rq,
-                                struct mapped_device *md)
+static inline int dm_flush_suspending(struct mapped_device *md)
 {
-       int r;
-       struct request *clone;
+       return !md->suspend_rq.data;
+}
+
+/*
+ * Called with the queue lock held.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+       struct mapped_device *md = (struct mapped_device *)q->queuedata;
        struct dm_rq_target_io *tio;
+       struct request *clone;
+
+       if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
+               if (dm_flush_suspending(md)) {
+                       if (q->in_flight)
+                               return BLKPREP_DEFER;
+                       else {
+                               /* This device should be quiet now */
+                               __stop_queue(q);
+                               smp_mb();
+                               BUG_ON(atomic_read(&md->pending));
+                               wake_up(&md->wait);
+                               return BLKPREP_KILL;
+                       }
+               } else
+                       /*
+                        * The suspend process was interrupted.
+                        * So no need to suspend now.
+                        */
+                       return BLKPREP_KILL;
+       }
+
+       if (unlikely(rq->special)) {
+               DMWARN("Already has something in rq->special.");
+               return BLKPREP_KILL;
+       }
+
+       if (unlikely(!dm_request_based(md))) {
+               DMWARN("Request was queued into bio-based device");
+               return BLKPREP_KILL;
+       }
 
        tio = alloc_rq_tio(md); /* Only one for each original request */
        if (!tio)
                /* -ENOMEM */
-               goto requeue;
+               return BLKPREP_DEFER;
+
        tio->md = md;
+       tio->ti = NULL;
        tio->orig = rq;
        tio->error = 0;
-       tio->ti = ti;
        memset(&tio->info, 0, sizeof(tio->info));
 
        clone = &tio->clone;
-       clone->end_io_data = tio;
-       clone->bio = clone->biotail = NULL;
-       if (setup_clone(clone, rq))
+       if (setup_clone(clone, rq, tio)) {
                /* -ENOMEM */
-               goto free_rq_tio_and_requeue;
+               free_rq_tio(md, tio);
+               return BLKPREP_DEFER;
+       }
 
+       rq->special = clone;
+       rq->cmd_flags |= REQ_DONTPREP;
+
+       return BLKPREP_OK;
+}
+
+static void map_request(struct dm_target *ti, struct request *rq,
+                       struct mapped_device *md)
+{
+       int r;
+       struct request *clone = rq->special;
+       struct dm_rq_target_io *tio = clone->end_io_data;
+
+       tio->ti = ti;
        atomic_inc(&md->pending);
+
+       /*
+        * Requests submitted to the md->queue are checked against the
+        * table/queue limits at submission time, but those limits may be
+        * changed by a table swap while the already-checked requests are
+        * still sitting in the md->queue.
+        * If the limits have shrunk in the meantime, we may be dispatching
+        * requests here that violate the current limits.
+        * The block layer and device drivers trust that a struct request
+        * honours the queue limits, so dispatching such a request is
+        * dangerous (e.g. it may easily cause a kernel panic).
+        * Avoid dispatching such problematic requests in request-based dm.
+        *
+        * Since dm_kill_request() decrements md->pending, this check has to
+        * be done after incrementing md->pending.
+        */
+       r = blk_rq_check_limits(rq->q, rq);
+       if (unlikely(r)) {
+               DMWARN("violating the queue limitation. the limitation may be"
+                      " shrunk while there are some requests in the queue.");
+               dm_kill_request(clone, r);
+               return;
+       }
+
        r = ti->type->map_rq(ti, clone, &tio->info);
        switch (r) {
        case DM_MAPIO_SUBMITTED:
-               /* The target has taken the request to submit by itself */
+               /* The target has taken the I/O to submit by itself later */
                break;
        case DM_MAPIO_REMAPPED:
-               /* The clone has been remapped so dispatch it */
+               /* The target has remapped the I/O so dispatch it */
                dm_dispatch_request(clone);
                break;
        case DM_MAPIO_REQUEUE:
-               /* The target wants to requeue the original request */
-               goto free_bio_clone_and_requeue;
+               /* The target wants to requeue the I/O */
+               dm_requeue_request(clone);
+               break;
        default:
                if (r > 0) {
                        DMWARN("unimplemented target map return value: %d", r);
                        BUG();
                }
 
-               /*
-                * The target wants to complete the original request.
-                * Avoid printing "I/O error" message, since we didn't I/O.
-                */
-               rq->cmd_flags |= REQ_QUIET;
-               dm_end_request(clone, r);
+               /* The target wants to complete the I/O */
+               dm_kill_request(clone, r);
                break;
        }
-
-       return 0;
-
-free_bio_clone_and_requeue:
-       free_bio_clone(clone);
-       dec_rq_pending(tio);
-
-free_rq_tio_and_requeue:
-       free_rq_tio(md, tio);
-
-requeue:
-       /*
-        * Actual requeue is done in dm_request_fn() after queue lock is held
-        * so that we can avoid to get extra queue lock for the requeue
-        */
-       return 1;
 }
 
 /*
  * q->request_fn for request-based dm.
- * Called with the queue lock held
+ * Called with the queue lock held.
  */
 static void dm_request_fn(struct request_queue *q)
 {
-       int r;
        struct mapped_device *md = (struct mapped_device *)q->queuedata;
        struct dm_table *map = dm_get_table(md);
        struct dm_target *ti;
-       dm_congested_fn congested;
        struct request *rq;
 
        /*
@@ -1426,32 +1544,51 @@ static void dm_request_fn(struct request_queue *q)
        while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
                rq = elv_next_request(q);
                if (!rq)
-                       break;
+                       goto plug_and_out;
 
                ti = dm_table_find_target(map, rq->sector);
-               congested = ti->type->congested;
-               if (congested && congested(ti))
-                       break;
+               if (ti->type->busy && ti->type->busy(ti))
+                       goto plug_and_out;
 
                blkdev_dequeue_request(rq);
                spin_unlock(q->queue_lock);
-               r = clone_and_map_request(ti, rq, md);
+               map_request(ti, rq, md);
                spin_lock_irq(q->queue_lock);
-
-               if (r)
-                       __requeue_request(q, rq);
        }
 
+       goto out;
+
+plug_and_out:
+       if (!elv_queue_empty(q))
+               /* Some requests still remain, retry later */
+               blk_plug_device(q);
+
+out:
        dm_table_put(map);
 
        return;
 }
 
-int dm_underlying_device_congested(struct request_queue *q)
+int dm_underlying_device_busy(struct request_queue *q)
 {
        return blk_lld_busy(q);
 }
-EXPORT_SYMBOL_GPL(dm_underlying_device_congested);
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+
+static int dm_lld_busy(struct request_queue *q)
+{
+       int r;
+       struct mapped_device *md = q->queuedata;
+       struct dm_table *map = dm_get_table(md);
+
+       if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
+               r = 1;
+       else
+               r = dm_table_any_busy_target(map);
+
+       dm_table_put(map);
+       return r;
+}
 
 static void dm_unplug_all(struct request_queue *q)
 {
@@ -1459,7 +1596,7 @@ static void dm_unplug_all(struct request_queue *q)
        struct dm_table *map = dm_get_table(md);
 
        if (map) {
-               if (test_bit(DMF_REQUEST_BASED, &md->flags))
+               if (dm_request_based(md))
                        generic_unplug_device(q);
 
                dm_table_unplug_all(map);
@@ -1475,8 +1612,11 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 
        if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
                r = bdi_bits;
-       else if (test_bit(DMF_REQUEST_BASED, &md->flags))
-               /* Request-based dm cares about only own queue */
+       else if (dm_request_based(md))
+               /*
+                * Request-based dm only needs to check its own request_queue
+                * when queried for congestion status
+                */
                r = md->queue->backing_dev_info.state & bdi_bits;
        else
                r = dm_table_any_congested(map, bdi_bits);
@@ -1560,121 +1700,6 @@ out:
        return r;
 }
 
-static void init_queue(struct request_queue *q, struct mapped_device *md)
-{
-       q->queuedata = md;
-       q->backing_dev_info.congested_fn = dm_any_congested;
-       q->backing_dev_info.congested_data = md;
-       blk_queue_make_request(q, dm_request);
-       blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
-       q->unplug_fn = dm_unplug_all;
-       blk_queue_merge_bvec(q, dm_merge_bvec);
-}
-
-int dm_set_md_request_based(struct mapped_device *md)
-{
-       int r = 0;
-
-       if (test_bit(DMF_INITIALIZED, &md->flags))
-               /* Initialization is already done */
-               return 0;
-
-       md->io_pool = mempool_create_slab_pool(MIN_IOS, _bio_info_cache);
-       if (!md->io_pool)
-               return -ENOMEM;
-
-       md->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
-       if (!md->tio_pool) {
-               r = -ENOMEM;
-               goto out_free_io_pool;
-       }
-
-       md->bs = bioset_create(MIN_IOS, MIN_IOS);
-       if (!md->bs) {
-               r = -ENOMEM;
-               goto out_free_tio_pool;
-       }
-
-       md->queue = blk_init_queue(dm_request_fn, NULL);
-       if (!md->queue) {
-               DMERR("request queue initialization for request-based failed");
-               r = -ENOMEM;
-               goto out_free_bs;
-       }
-
-       DMINFO("%s: activating request-based I/O", md->disk->disk_name);
-       md->saved_make_request_fn = md->queue->make_request_fn;
-       init_queue(md->queue, md);
-       blk_queue_softirq_done(md->queue, dm_softirq_done);
-       md->disk->queue = md->queue;
-       add_disk(md->disk);
-
-       return 0;
-
-out_cleanup_queue:
-       blk_cleanup_queue(md->queue);
-       md->disk->queue = md->queue = NULL;
-       md->saved_make_request_fn = NULL;
-
-out_free_bs:
-       bioset_free(md->bs);
-       md->bs = NULL;
-
-out_free_tio_pool:
-       mempool_destroy(md->tio_pool);
-       md->tio_pool = NULL;
-
-out_free_io_pool:
-       mempool_destroy(md->io_pool);
-       md->io_pool = NULL;
-
-       return r;
-}
-
-int dm_set_md_bio_based(struct mapped_device *md)
-{
-       if (test_bit(DMF_INITIALIZED, &md->flags)) {
-               /* Initialization is already done */
-               return 0;
-       }
-
-       md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
-       if (!md->io_pool)
-               goto out;
-
-       md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
-       if (!md->tio_pool)
-               goto out_free_io_pool;
-
-       md->bs = bioset_create(16, 16);
-       if (!md->bs)
-               goto out_free_tio_pool;
-
-       md->queue = blk_alloc_queue(GFP_KERNEL);
-       if (!md->queue) {
-               DMERR("request queue initialization for bio-based failed");
-               goto out_free_bs;
-       }
-
-       init_queue(md->queue, md);
-       md->disk->queue = md->queue;
-       add_disk(md->disk);
-
-       return 0;
-
-out_free_bs:
-       bioset_free(md->bs);
-       md->bs = NULL;
-out_free_tio_pool:
-       mempool_destroy(md->tio_pool);
-       md->tio_pool = NULL;
-out_free_io_pool:
-       mempool_destroy(md->io_pool);
-       md->io_pool = NULL;
-out:
-       return -ENOMEM;
-}
-
 static struct block_device_operations dm_blk_dops;
 
 /*
@@ -1712,10 +1737,33 @@ static struct mapped_device *alloc_dev(int minor)
        atomic_set(&md->uevent_seq, 0);
        INIT_LIST_HEAD(&md->uevent_list);
        spin_lock_init(&md->uevent_lock);
-       /* Defaults to BIO based */
-       set_bit(DMF_BIO_BASED, &md->flags);
 
-       /* md's queue and mempools will be allocated after the 1st table load */
+       md->queue = blk_init_queue(dm_request_fn, NULL);
+       if (!md->queue)
+               goto bad_queue;
+
+       /*
+        * Request-based dm devices cannot be stacked on top of bio-based dm
+        * devices.  The type of this dm device has not been decided yet,
+        * although we initialized the queue using blk_init_queue().
+        * The type is decided at the first table loading time.
+        * To prevent problematic device stacking, clear the queue flag
+        * for request stacking support until then.
+        *
+        * This queue is new, so no concurrency on the queue_flags.
+        */
+       queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+       md->saved_make_request_fn = md->queue->make_request_fn;
+       md->queue->queuedata = md;
+       md->queue->backing_dev_info.congested_fn = dm_any_congested;
+       md->queue->backing_dev_info.congested_data = md;
+       blk_queue_make_request(md->queue, dm_request);
+       blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
+       md->queue->unplug_fn = dm_unplug_all;
+       blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+       blk_queue_softirq_done(md->queue, dm_softirq_done);
+       blk_queue_prep_rq(md->queue, dm_prep_fn);
+       blk_queue_lld_busy(md->queue, dm_lld_busy);
 
        md->disk = alloc_disk(1);
        if (!md->disk)
@@ -1728,8 +1776,10 @@ static struct mapped_device *alloc_dev(int minor)
        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
+       md->disk->queue = md->queue;
        md->disk->private_data = md;
        sprintf(md->disk->disk_name, "dm-%d", minor);
+       add_disk(md->disk);
        format_dev_t(md->name, MKDEV(_major, minor));
 
        md->wq = create_singlethread_workqueue("kdmflush");
@@ -1748,6 +1798,8 @@ static struct mapped_device *alloc_dev(int minor)
 bad_thread:
        put_disk(md->disk);
 bad_disk:
+       blk_cleanup_queue(md->queue);
+bad_queue:
        free_minor(minor);
 bad_minor:
        module_put(THIS_MODULE);
@@ -1756,26 +1808,6 @@ bad_module_get:
        return NULL;
 }
 
-int dm_init_md(struct mapped_device *md)
-{
-       int r = 0;
-
-       if (test_bit(DMF_INITIALIZED, &md->flags))
-               return 0;
-
-       if(test_bit(DMF_REQUEST_BASED, &md->flags))
-               r = dm_set_md_request_based(md);
-       else if (test_bit(DMF_BIO_BASED, &md->flags))
-               r = dm_set_md_bio_based(md);
-       else
-               r = -EINVAL;
-
-       if (!r)
-               set_bit(DMF_INITIALIZED, &md->flags);
-
-       return r;
-}
-
 static void unlock_fs(struct mapped_device *md);
 
 static void free_dev(struct mapped_device *md)
@@ -1793,8 +1825,7 @@ static void free_dev(struct mapped_device *md)
                mempool_destroy(md->io_pool);
        if (md->bs)
                bioset_free(md->bs);
-       if (test_bit(DMF_INITIALIZED, &md->flags))
-               del_gendisk(md->disk);
+       del_gendisk(md->disk);
        free_minor(minor);
 
        spin_lock(&_minor_lock);
@@ -1802,8 +1833,7 @@ static void free_dev(struct mapped_device *md)
        spin_unlock(&_minor_lock);
 
        put_disk(md->disk);
-       if (md->queue)
-               blk_cleanup_queue(md->queue);
+       blk_cleanup_queue(md->queue);
        module_put(THIS_MODULE);
        kfree(md);
 }
@@ -1857,6 +1887,16 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
        dm_table_get(t);
        dm_table_event_callback(t, event_callback, md);
 
+       /*
+        * If the old table wasn't request-based, the queue has not been
+        * stopped during suspension.  Stop it now to prevent I/O from
+        * being mapped before resume.
+        * This must be done before setting the queue restrictions,
+        * because request-based dm may run right after they are set.
+        */
+       if (dm_table_request_based(t) && !blk_queue_stopped(q))
+               stop_queue(q);
+
        write_lock(&md->map_lock);
        md->map = t;
        dm_table_set_restrictions(t, q);
@@ -1979,7 +2019,11 @@ static int dm_wait_for_completion(struct mapped_device *md)
                set_current_state(TASK_INTERRUPTIBLE);
 
                smp_mb();
-               if (!atomic_read(&md->pending))
+               if (dm_request_based(md)) {
+                       if (!atomic_read(&md->pending) &&
+                           blk_queue_stopped(md->queue))
+                               break;
+               } else if (!atomic_read(&md->pending))
                        break;
 
                if (signal_pending(current)) {
@@ -2002,7 +2046,13 @@ static void __flush_deferred_io(struct mapped_device *md)
        struct bio *c;
 
        while ((c = bio_list_pop(&md->deferred))) {
-               if (__split_bio(md, c))
+               /*
+                * Some bios might have been queued here during suspension,
+                * before the device was switched to request-based dm on resume
+                */
+               if (dm_request_based(md))
+                       generic_make_request(c);
+               else if (__split_bio(md, c))
                        bio_io_error(c);
        }
 
@@ -2027,9 +2077,6 @@ static void dm_wq_work(struct work_struct *work)
 
        down_write(&md->io_lock);
        switch (req->type) {
-       case DM_WQ_FLUSH_ALL:
-               __merge_pushback_list(md);
-               /* pass through */
        case DM_WQ_FLUSH_DEFERRED:
                __flush_deferred_io(md);
                break;
@@ -2084,23 +2131,86 @@ out:
        return r;
 }
 
-static void stop_queue(struct request_queue *q)
+static inline void dm_invalidate_flush_suspend(struct mapped_device *md)
 {
+       md->suspend_rq.data = (void *)0x1;
+}
+
+static void dm_abort_suspend(struct mapped_device *md, int noflush)
+{
+       struct request_queue *q = md->queue;
        unsigned long flags;
 
+       /*
+        * For flush suspend, invalidation and queue restart must be protected
+        * by a single queue lock to prevent a race with dm_prep_fn().
+        */
        spin_lock_irqsave(q->queue_lock, flags);
-       blk_stop_queue(q);
+       if (!noflush)
+               dm_invalidate_flush_suspend(md);
+       __start_queue(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void start_queue(struct request_queue *q)
+/*
+ * Additional suspend work for request-based dm.
+ *
+ * In request-based dm, stopping request_queue prevents mapping.
+ * Even after the request_queue is stopped, requests submitted by the upper
+ * layer can still be inserted into it.  So original (unmapped) requests are
+ * kept in the request_queue during suspension.
+ */
+static void dm_start_suspend(struct mapped_device *md, int noflush)
 {
+       struct request *rq = &md->suspend_rq;
+       struct request_queue *q = md->queue;
        unsigned long flags;
 
+       if (noflush) {
+               stop_queue(q);
+               return;
+       }
+
+       /*
+        * For flush suspend, we need a marker to indicate the boundary
+        * between I/Os that need flushing and deferred I/Os, since all
+        * I/Os are queued in the request_queue during suspension.
+        *
+        * This marker must be inserted after setting DMF_BLOCK_IO,
+        * because dm_prep_fn() treats a cleared DMF_BLOCK_IO as an
+        * interrupted suspend.
+        */
        spin_lock_irqsave(q->queue_lock, flags);
-       if (blk_queue_stopped(q))
-               blk_start_queue(q);
+       if (unlikely(rq->ref_count)) {
+               /*
+                * This can happen when the previous suspend was interrupted:
+                * the suspend_rq inserted for that suspend is still sitting
+                * in the queue when this suspend is invoked.
+                *
+                * We could re-insert the suspend_rq by forcibly deleting it
+                * from the queue with list_del_init(&rq->queuelist), but
+                * doing so could easily corrupt block-layer state.
+                * So we don't re-insert the suspend_rq in this case.
+                * The suspend_rq should already have been invalidated when
+                * the previous suspend was interrupted, so just wait for it
+                * to complete.
+                *
+                * This suspend will never complete, so warn the user to
+                * interrupt this suspend and retry later.
+                */
+               BUG_ON(!rq->data);
+               spin_unlock_irqrestore(q->queue_lock, flags);
+
+               DMWARN("Invalidating the previous suspend is still in"
+                      " progress.  This suspend will be never done."
+                      " Please interrupt this suspend and retry later.");
+               return;
+       }
        spin_unlock_irqrestore(q->queue_lock, flags);
+
+       /* Now no user of the suspend_rq */
+       blk_rq_init(q, rq);
+       blk_insert_request(q, rq, 0, NULL);
 }
 
 /*
@@ -2201,15 +2311,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        add_wait_queue(&md->wait, &wait);
        up_write(&md->io_lock);
 
-       /*
-        * In request-based dm, stopping request_queue prevents mapping.
-        * Even after stopping the request_queue, submitted requests from
-        * upper-layer can be inserted to the request_queue.
-        * So original (unmapped) requests are kept in the request_queue
-        * during suspension.
-        */
-       if (test_bit(DMF_REQUEST_BASED, &md->flags))
-               stop_queue(md->queue);
+       if (dm_request_based(md))
+               dm_start_suspend(md, noflush);
 
        /* unplug */
        if (map)
@@ -2224,8 +2327,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        remove_wait_queue(&md->wait, &wait);
 
        if (noflush) {
-               if (test_bit(DMF_REQUEST_BASED, &md->flags))
-                       /* Request-based dm uses md->queue for noflush */
+               if (dm_request_based(md))
+                       /* All requeued requests are already in md->queue */
                        clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
                else
                        __merge_pushback_list(md);
@@ -2236,9 +2339,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
        if (r < 0) {
                dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
 
-               if (test_bit(DMF_REQUEST_BASED, &md->flags))
-                       /* Request-based dm uses md->queue for deferred I/Os */
-                       start_queue(md->queue);
+               if (dm_request_based(md))
+                       dm_abort_suspend(md, noflush);
 
                unlock_fs(md);
                goto out; /* pushback list is already flushed, so skip flush */
@@ -2278,20 +2380,16 @@ int dm_resume(struct mapped_device *md)
        if (r)
                goto out;
 
+       dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+
        /*
         * Flushing deferred I/Os must be done after targets are resumed
         * so that mapping of targets can work correctly.
-        *
-        * Resuming request_queue earlier than clear_bit(DMF_BLOCK_IO) means
-        * starting to flush requests before upper-layer starts to submit bios.
-        * It may be better because llds should be empty and no need to wait
-        * for bio merging so strictly at this time.
+        * Request-based dm queues the deferred I/Os in its request_queue.
         */
-       if (test_bit(DMF_REQUEST_BASED, &md->flags))
+       if (dm_request_based(md))
                start_queue(md->queue);
 
-       dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
-
        unlock_fs(md);
 
        if (md->suspended_bdev) {
@@ -2362,40 +2460,75 @@ int dm_suspended(struct mapped_device *md)
        return test_bit(DMF_SUSPENDED, &md->flags);
 }
 
-int dm_request_based(struct mapped_device *md)
+int dm_noflush_suspending(struct dm_target *ti)
 {
-       return test_bit(DMF_REQUEST_BASED, &md->flags);
-}
+       struct mapped_device *md = dm_table_get_md(ti->table);
+       int r = __noflush_suspending(md);
 
-int dm_bio_based(struct mapped_device *md)
-{
-       return test_bit(DMF_BIO_BASED, &md->flags);
+       dm_put(md);
+
+       return r;
 }
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-void dm_set_request_based(struct mapped_device *md)
+int dm_init_md_mempool(struct mapped_device *md, int type)
 {
-       if (test_bit(DMF_REQUEST_BASED, &md->flags))
-               return;
+       if (unlikely(type == DM_TYPE_NONE)) {
+               DMWARN("no type is specified, can't initialize mempool");
+               return -EINVAL;
+       }
 
-       if (test_bit(DMF_INITIALIZED, &md->flags)) {
-               DMERR("Cannot change to request based, already initialized");
-               return;
+       if (md->mempool_type == type)
+               return 0;
+
+       if (md->map) {
+               /* The md is already in use; the mempool type can't be changed */
+               DMWARN("can't change mempool type after a table is bound");
+               return -EINVAL;
        }
 
-       set_bit(DMF_REQUEST_BASED, &md->flags);
-       clear_bit(DMF_BIO_BASED, &md->flags);
-}
+       /* Not using the md yet, we can still change the mempool type */
+       if (md->mempool_type != DM_TYPE_NONE) {
+               mempool_destroy(md->io_pool);
+               md->io_pool = NULL;
+               mempool_destroy(md->tio_pool);
+               md->tio_pool = NULL;
+               bioset_free(md->bs);
+               md->bs = NULL;
+               md->mempool_type = DM_TYPE_NONE;
+       }
 
-int dm_noflush_suspending(struct dm_target *ti)
-{
-       struct mapped_device *md = dm_table_get_md(ti->table);
-       int r = __noflush_suspending(md);
+       md->io_pool = (type == DM_TYPE_BIO_BASED) ?
+                     mempool_create_slab_pool(MIN_IOS, _io_cache) :
+                     mempool_create_slab_pool(MIN_IOS, _bio_info_cache);
+       if (!md->io_pool)
+               return -ENOMEM;
 
-       dm_put(md);
+       md->tio_pool = (type == DM_TYPE_BIO_BASED) ?
+                      mempool_create_slab_pool(MIN_IOS, _tio_cache) :
+                      mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+       if (!md->tio_pool)
+               goto free_io_pool_and_out;
 
-       return r;
+       md->bs = (type == DM_TYPE_BIO_BASED) ?
+                bioset_create(16, 16) : bioset_create(MIN_IOS, MIN_IOS);
+       if (!md->bs)
+               goto free_tio_pool_and_out;
+
+       md->mempool_type = type;
+
+       return 0;
+
+free_tio_pool_and_out:
+       mempool_destroy(md->tio_pool);
+       md->tio_pool = NULL;
+
+free_io_pool_and_out:
+       mempool_destroy(md->io_pool);
+       md->io_pool = NULL;
+
+       return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
 static struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
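
The map_request() switch above is the contract between dm core and a request-based target's map_rq(): DM_MAPIO_SUBMITTED means the target will submit the clone itself later, DM_MAPIO_REMAPPED tells dm core to dispatch the clone now, DM_MAPIO_REQUEUE sends the original request back to md->queue via dm_requeue_request(), and a negative value ends the original with an error via dm_kill_request(). The following is a small user-space sketch of that dispatch table; the enum values and the dispatch_action() helper are illustrative stand-ins, not the kernel's actual constants or code.

#include <stdio.h>

/* Illustrative stand-ins for the DM_MAPIO_* return codes; the names follow
 * the kernel's, but the numeric values here are arbitrary. */
enum { MAPIO_SUBMITTED, MAPIO_REMAPPED, MAPIO_REQUEUE };

/* Mirrors the switch in map_request(): what dm core does with the clone
 * for each map_rq() return value. */
static const char *dispatch_action(int r)
{
	switch (r) {
	case MAPIO_SUBMITTED:
		return "target submits the clone itself later";
	case MAPIO_REMAPPED:
		return "dm core dispatches the clone (dm_dispatch_request)";
	case MAPIO_REQUEUE:
		return "requeue the original request (dm_requeue_request)";
	default:
		if (r > 0)
			return "BUG: unimplemented return value";
		return "complete the original with an error (dm_kill_request)";
	}
}

int main(void)
{
	int codes[] = { MAPIO_SUBMITTED, MAPIO_REMAPPED, MAPIO_REQUEUE, -5, 7 };

	for (int i = 0; i < 5; i++)
		printf("map_rq -> %d: %s\n", codes[i], dispatch_action(codes[i]));
	return 0;
}
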
index 7110c69..6bd61b9 100644 (file)
 #define DM_SUSPEND_NOFLUSH_FLAG                (1 << 1)
 
 /*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE           0
+#define DM_TYPE_BIO_BASED      1
+#define DM_TYPE_REQUEST_BASED  2
+
+/*
  * List of devices that a metadevice uses and should open/close.
  */
 struct dm_dev {
@@ -49,9 +56,11 @@ void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-void dm_table_unplug_all(struct dm_table *t);
-void dm_table_set_request_based(struct dm_table *t);
+int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+int dm_table_get_type(struct dm_table *t);
 int dm_table_request_based(struct dm_table *t);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
@@ -71,13 +80,6 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
                                        void *param), void *param);
 
 /*-----------------------------------------------------------------
- * Helper for block layer and dm core operations
- *---------------------------------------------------------------*/
-void dm_dispatch_request(struct request *rq);
-void dm_end_request(struct request *rq, int error);
-int dm_underlying_device_congested(struct request_queue *q);
-
-/*-----------------------------------------------------------------
  * Useful inlines.
  *---------------------------------------------------------------*/
 static inline int array_too_big(unsigned long fixed, unsigned long obj,
@@ -106,9 +108,9 @@ void dm_stripe_exit(void);
 
 void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
 union map_info *dm_get_mapinfo(struct bio *bio);
-union map_info *dm_get_rq_mapinfo(struct request *rq);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 void dm_kobject_uevent(struct mapped_device *md);
 
@@ -116,11 +118,8 @@ int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
 
 /*
- * Initializer for request-based/bio-based device
+ * Mempool initializer for a mapped_device
  */
-int dm_set_md_request_based(struct mapped_device *md);
-int dm_set_md_bio_based(struct mapped_device *md);
-void dm_set_request_based(struct mapped_device *md);
-int dm_init_md(struct mapped_device *md);
+int dm_init_md_mempool(struct mapped_device *md, int type);
 
 #endif
index 03a5ab7..0b82030 100644 (file)
@@ -1302,9 +1302,6 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                                        sbio->bi_size = r1_bio->sectors << 9;
                                        sbio->bi_idx = 0;
                                        sbio->bi_phys_segments = 0;
-                                       sbio->bi_hw_segments = 0;
-                                       sbio->bi_hw_front_size = 0;
-                                       sbio->bi_hw_back_size = 0;
                                        sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
                                        sbio->bi_flags |= 1 << BIO_UPTODATE;
                                        sbio->bi_next = NULL;
@@ -1790,7 +1787,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                bio->bi_vcnt = 0;
                bio->bi_idx = 0;
                bio->bi_phys_segments = 0;
-               bio->bi_hw_segments = 0;
                bio->bi_size = 0;
                bio->bi_end_io = NULL;
                bio->bi_private = NULL;
index e34cd0e..d3b9aa0 100644 (file)
@@ -1345,9 +1345,6 @@ static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
                tbio->bi_size = r10_bio->sectors << 9;
                tbio->bi_idx = 0;
                tbio->bi_phys_segments = 0;
-               tbio->bi_hw_segments = 0;
-               tbio->bi_hw_front_size = 0;
-               tbio->bi_hw_back_size = 0;
                tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
                tbio->bi_flags |= 1 << BIO_UPTODATE;
                tbio->bi_next = NULL;
@@ -1947,7 +1944,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                bio->bi_vcnt = 0;
                bio->bi_idx = 0;
                bio->bi_phys_segments = 0;
-               bio->bi_hw_segments = 0;
                bio->bi_size = 0;
        }
 
index 224de02..37e5465 100644 (file)
 const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
 #endif
 
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_phys_segments(struct bio *bio)
+{
+       return bio->bi_phys_segments & 0xffff;
+}
+
+static inline int raid5_bi_hw_segments(struct bio *bio)
+{
+       return (bio->bi_phys_segments >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+{
+       --bio->bi_phys_segments;
+       return raid5_bi_phys_segments(bio);
+}
+
+static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+{
+       unsigned short val = raid5_bi_hw_segments(bio);
+
+       --val;
+       bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
+       return val;
+}
+
+static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+{
+       bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+}
+
 static inline int raid6_next_disk(int disk, int raid_disks)
 {
        disk++;
@@ -507,7 +541,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
                        while (rbi && rbi->bi_sector <
                                dev->sector + STRIPE_SECTORS) {
                                rbi2 = r5_next_bio(rbi, dev->sector);
-                               if (--rbi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(rbi)) {
                                        rbi->bi_next = return_bi;
                                        return_bi = rbi;
                                }
@@ -1725,7 +1759,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        if (*bip)
                bi->bi_next = *bip;
        *bip = bi;
-       bi->bi_phys_segments ++;
+       bi->bi_phys_segments++;
        spin_unlock_irq(&conf->device_lock);
        spin_unlock(&sh->lock);
 
@@ -1819,7 +1853,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                        sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (--bi->bi_phys_segments == 0) {
+                       if (!raid5_dec_bi_phys_segments(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
@@ -1834,7 +1868,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                       sh->dev[i].sector + STRIPE_SECTORS) {
                        struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
                        clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                       if (--bi->bi_phys_segments == 0) {
+                       if (!raid5_dec_bi_phys_segments(bi)) {
                                md_write_end(conf->mddev);
                                bi->bi_next = *return_bi;
                                *return_bi = bi;
@@ -1858,7 +1892,7 @@ handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
                                struct bio *nextbi =
                                        r5_next_bio(bi, sh->dev[i].sector);
                                clear_bit(BIO_UPTODATE, &bi->bi_flags);
-                               if (--bi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(bi)) {
                                        bi->bi_next = *return_bi;
                                        *return_bi = bi;
                                }
@@ -2033,7 +2067,7 @@ static void handle_stripe_clean_event(raid5_conf_t *conf,
                                while (wbi && wbi->bi_sector <
                                        dev->sector + STRIPE_SECTORS) {
                                        wbi2 = r5_next_bio(wbi, dev->sector);
-                                       if (--wbi->bi_phys_segments == 0) {
+                                       if (!raid5_dec_bi_phys_segments(wbi)) {
                                                md_write_end(conf->mddev);
                                                wbi->bi_next = *return_bi;
                                                *return_bi = wbi;
@@ -2814,7 +2848,7 @@ static bool handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                                copy_data(0, rbi, dev->page, dev->sector);
                                rbi2 = r5_next_bio(rbi, dev->sector);
                                spin_lock_irq(&conf->device_lock);
-                               if (--rbi->bi_phys_segments == 0) {
+                               if (!raid5_dec_bi_phys_segments(rbi)) {
                                        rbi->bi_next = return_bi;
                                        return_bi = rbi;
                                }
@@ -3155,8 +3189,11 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
        if(bi) {
                conf->retry_read_aligned_list = bi->bi_next;
                bi->bi_next = NULL;
+               /*
+                * this sets the active stripe count to 1 and the processed
+                * stripe count to zero (upper 16 bits)
+                */
                bi->bi_phys_segments = 1; /* biased count of active stripes */
-               bi->bi_hw_segments = 0; /* count of processed stripes */
        }
 
        return bi;
@@ -3206,8 +3243,7 @@ static int bio_fits_rdev(struct bio *bi)
        if ((bi->bi_size>>9) > q->max_sectors)
                return 0;
        blk_recount_segments(q, bi);
-       if (bi->bi_phys_segments > q->max_phys_segments ||
-           bi->bi_hw_segments > q->max_hw_segments)
+       if (bi->bi_phys_segments > q->max_phys_segments)
                return 0;
 
        if (q->merge_bvec_fn)
@@ -3468,7 +3504,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
                        
        }
        spin_lock_irq(&conf->device_lock);
-       remaining = --bi->bi_phys_segments;
+       remaining = raid5_dec_bi_phys_segments(bi);
        spin_unlock_irq(&conf->device_lock);
        if (remaining == 0) {
 
@@ -3752,7 +3788,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                     sector += STRIPE_SECTORS,
                     scnt++) {
 
-               if (scnt < raid_bio->bi_hw_segments)
+               if (scnt < raid5_bi_hw_segments(raid_bio))
                        /* already done this stripe */
                        continue;
 
@@ -3760,7 +3796,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 
                if (!sh) {
                        /* failed to get a stripe - must wait */
-                       raid_bio->bi_hw_segments = scnt;
+                       raid5_set_bi_hw_segments(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
@@ -3768,7 +3804,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
                if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
                        release_stripe(sh);
-                       raid_bio->bi_hw_segments = scnt;
+                       raid5_set_bi_hw_segments(raid_bio, scnt);
                        conf->retry_read_aligned = raid_bio;
                        return handled;
                }
@@ -3778,7 +3814,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
                handled++;
        }
        spin_lock_irq(&conf->device_lock);
-       remaining = --raid_bio->bi_phys_segments;
+       remaining = raid5_dec_bi_phys_segments(raid_bio);
        spin_unlock_irq(&conf->device_lock);
        if (remaining == 0)
                bio_endio(raid_bio, 0);
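
The raid5 hunks above replace the open-coded bi_phys_segments/bi_hw_segments arithmetic with accessor helpers defined earlier in the raid5.c diff. As a reference point, a minimal sketch of the packing those helpers imply, with the 16/16 bit split taken from the upstream patchset rather than restated from this tree:

#include <linux/bio.h>

/*
 * Assumed packing of the widened bi_phys_segments field: active stripe
 * count in the low 16 bits, processed stripe count in the high 16 bits.
 */
static inline int raid5_bi_phys_segments(struct bio *bio)
{
	return bio->bi_phys_segments & 0xffff;
}

static inline int raid5_bi_hw_segments(struct bio *bio)
{
	return (bio->bi_phys_segments >> 16) & 0xffff;
}

static inline int raid5_dec_bi_phys_segments(struct bio *bio)
{
	/* decrement the low 16 bits, return the remaining active count */
	--bio->bi_phys_segments;
	return raid5_bi_phys_segments(bio);
}

static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
{
	/* replace the high 16 bits with cnt, keep the active count */
	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
}

With that split, "--bi->bi_phys_segments == 0" becomes "!raid5_dec_bi_phys_segments(bi)" without disturbing the processed-stripe counter in the upper half.
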
index f34f20c..9bf581c 100644 (file)
@@ -1005,6 +1005,29 @@ static int ftl_writesect(struct mtd_blktrans_dev *dev,
        return ftl_write((void *)dev, buf, block, 1);
 }
 
+static int ftl_discardsect(struct mtd_blktrans_dev *dev,
+                          unsigned long sector, unsigned nr_sects)
+{
+       partition_t *part = (void *)dev;
+       uint32_t bsize = 1 << part->header.EraseUnitSize;
+
+       DEBUG(1, "FTL erase sector %lu for %u sectors\n",
+             sector, nr_sects);
+
+       while (nr_sects) {
+               uint32_t old_addr = part->VirtualBlockMap[sector];
+               if (old_addr != 0xffffffff) {
+                       part->VirtualBlockMap[sector] = 0xffffffff;
+                       part->EUNInfo[old_addr/bsize].Deleted++;
+                       if (set_bam_entry(part, old_addr, 0))
+                               return -EIO;
+               }
+               nr_sects--;
+               sector++;
+       }
+
+       return 0;
+}
 /*====================================================================*/
 
 static void ftl_freepart(partition_t *part)
@@ -1069,6 +1092,7 @@ static struct mtd_blktrans_ops ftl_tr = {
        .blksize        = SECTOR_SIZE,
        .readsect       = ftl_readsect,
        .writesect      = ftl_writesect,
+       .discard        = ftl_discardsect,
        .getgeo         = ftl_getgeo,
        .add_mtd        = ftl_add_mtd,
        .remove_dev     = ftl_remove_dev,
index 9ff007c..681d5ac 100644 (file)
@@ -32,6 +32,14 @@ struct mtd_blkcore_priv {
        spinlock_t queue_lock;
 };
 
+static int blktrans_discard_request(struct request_queue *q,
+                                   struct request *req)
+{
+       req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+       req->cmd[0] = REQ_LB_OP_DISCARD;
+       return 0;
+}
+
 static int do_blktrans_request(struct mtd_blktrans_ops *tr,
                               struct mtd_blktrans_dev *dev,
                               struct request *req)
@@ -44,6 +52,10 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
        buf = req->buffer;
 
+       if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+           req->cmd[0] == REQ_LB_OP_DISCARD)
+               return !tr->discard(dev, block, nsect);
+
        if (!blk_fs_request(req))
                return 0;
 
@@ -367,6 +379,10 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 
        tr->blkcore_priv->rq->queuedata = tr;
        blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+       if (tr->discard)
+               blk_queue_set_discard(tr->blkcore_priv->rq,
+                                     blktrans_discard_request);
+
        tr->blkshift = ffs(tr->blksize) - 1;
 
        tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr,
index 8c01a3e..b8a52c0 100644 (file)
@@ -529,6 +529,14 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
        spin_unlock_irqrestore(shost->host_lock, flags);
 }
 
+static inline int scsi_device_is_busy(struct scsi_device *sdev)
+{
+       if (sdev->device_busy >= sdev->queue_depth || sdev->device_blocked)
+               return 1;
+
+       return 0;
+}
+
 static inline int scsi_target_is_busy(struct scsi_target *starget)
 {
        return ((starget->can_queue > 0 &&
@@ -536,6 +544,15 @@ static inline int scsi_target_is_busy(struct scsi_target *starget)
                 starget->target_blocked);
 }
 
+static inline int scsi_host_is_busy(struct Scsi_Host *shost)
+{
+       if ((shost->can_queue > 0 && shost->host_busy >= shost->can_queue) ||
+           shost->host_blocked || shost->host_self_blocked)
+               return 1;
+
+       return 0;
+}
+
 /*
  * Function:   scsi_run_queue()
  *
@@ -558,11 +575,7 @@ static void scsi_run_queue(struct request_queue *q)
                scsi_single_lun_run(sdev);
 
        spin_lock_irqsave(shost->host_lock, flags);
-       while (!list_empty(&shost->starved_list) &&
-              !shost->host_blocked && !shost->host_self_blocked &&
-               !((shost->can_queue > 0) &&
-                 (shost->host_busy >= shost->can_queue))) {
-
+       while (!list_empty(&shost->starved_list) && !scsi_host_is_busy(shost)) {
                int flagset;
 
                /*
@@ -1349,8 +1362,6 @@ EXPORT_SYMBOL(scsi_prep_fn);
 static inline int scsi_dev_queue_ready(struct request_queue *q,
                                  struct scsi_device *sdev)
 {
-       if (sdev->device_busy >= sdev->queue_depth)
-               return 0;
        if (sdev->device_busy == 0 && sdev->device_blocked) {
                /*
                 * unblock after device_blocked iterates to zero
@@ -1364,7 +1375,7 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
                        return 0;
                }
        }
-       if (sdev->device_blocked)
+       if (scsi_device_is_busy(sdev))
                return 0;
 
        return 1;
@@ -1441,8 +1452,7 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
                        return 0;
                }
        }
-       if ((shost->can_queue > 0 && shost->host_busy >= shost->can_queue) ||
-           shost->host_blocked || shost->host_self_blocked) {
+       if (scsi_host_is_busy(shost)) {
                if (list_empty(&sdev->starved_entry))
                        list_add_tail(&sdev->starved_entry, &shost->starved_list);
                return 0;
@@ -1456,6 +1466,37 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
 }
 
 /*
+ * Busy state exporting function for request stacking drivers.
+ *
+ * For efficiency, no lock is taken to check the busy state of
+ * shost/starget/sdev, since the returned value is not guaranteed and
+ * may be changed after request stacking drivers call the function,
+ * regardless of taking lock or not.
+ *
+ * When scsi can't dispatch I/Os anymore and needs to kill I/Os
+ * (e.g. !sdev), scsi needs to return 'not busy'.
+ * Otherwise, request stacking drivers may hold requests forever.
+ */
+static int scsi_lld_busy(struct request_queue *q)
+{
+       struct scsi_device *sdev = q->queuedata;
+       struct Scsi_Host *shost;
+       struct scsi_target *starget;
+
+       if (!sdev)
+               return 0;
+
+       shost = sdev->host;
+       starget = scsi_target(sdev);
+
+       if (scsi_host_in_recovery(shost) || scsi_host_is_busy(shost) ||
+           scsi_target_is_busy(starget) || scsi_device_is_busy(sdev))
+               return 1;
+
+       return 0;
+}
+
+/*
  * Kill a request for a dead device
  */
 static void scsi_kill_request(struct request *req, struct request_queue *q)
@@ -1579,14 +1620,9 @@ static void scsi_request_fn(struct request_queue *q)
                 * accept it.
                 */
                req = elv_next_request(q);
-               if (!req)
+               if (!req || !scsi_dev_queue_ready(q, sdev))
                        break;
 
-               if (!scsi_dev_queue_ready(q, sdev)) {
-                       blk_set_lld_busy(q);
-                       break;
-               }
-
                if (unlikely(!scsi_device_online(sdev))) {
                        sdev_printk(KERN_ERR, sdev,
                                    "rejecting I/O to offline device\n");
@@ -1656,8 +1692,6 @@ static void scsi_request_fn(struct request_queue *q)
                rtn = scsi_dispatch_cmd(cmd);
                spin_lock_irq(q->queue_lock);
                if(rtn) {
-                       blk_set_lld_busy(q);
-
                        /* we're refusing the command; because of
                         * the way locks get dropped, we need to 
                         * check here if plugging is required */
@@ -1666,7 +1700,6 @@ static void scsi_request_fn(struct request_queue *q)
 
                        break;
                }
-               blk_clear_lld_busy(q);
        }
 
        goto out;
@@ -1683,7 +1716,6 @@ static void scsi_request_fn(struct request_queue *q)
         * later time.
         */
        spin_lock_irq(q->queue_lock);
-       blk_set_lld_busy(q);
        blk_requeue_request(q, req);
        sdev->device_busy--;
        if(sdev->device_busy == 0)
@@ -1767,6 +1799,7 @@ struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
        blk_queue_prep_rq(q, scsi_prep_fn);
        blk_queue_softirq_done(q, scsi_softirq_done);
        blk_queue_rq_timed_out(q, scsi_times_out);
+       blk_queue_lld_busy(q, scsi_lld_busy);
        return q;
 }
 
index ab2a322..6398489 100644 (file)
@@ -65,7 +65,6 @@ int scsi_noretry_cmd(struct scsi_cmnd *scmd);
 extern int scsi_maybe_unblock_host(struct scsi_device *sdev);
 extern void scsi_device_unbusy(struct scsi_device *sdev);
 extern int scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
-extern void scsi_queue_retry(struct scsi_cmnd *cmd, int reason);
 extern void scsi_next_command(struct scsi_cmnd *cmd);
 extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
 extern void scsi_run_host_queues(struct Scsi_Host *shost);
index 8df10aa..7a0283d 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -111,6 +111,7 @@ void bio_init(struct bio *bio)
 {
        memset(bio, 0, sizeof(*bio));
        bio->bi_flags = 1 << BIO_UPTODATE;
+       bio->bi_comp_cpu = -1;
        atomic_set(&bio->bi_cnt, 1);
 }
 
@@ -208,14 +209,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
        return bio->bi_phys_segments;
 }
 
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
-       if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-               blk_recount_segments(q, bio);
-
-       return bio->bi_hw_segments;
-}
-
 /**
  *     __bio_clone     -       clone a bio
  *     @bio: destination bio
@@ -350,8 +343,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         */
 
        while (bio->bi_phys_segments >= q->max_phys_segments
-              || bio->bi_hw_segments >= q->max_hw_segments
-              || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+              || bio->bi_phys_segments >= q->max_hw_segments) {
 
                if (retried_segments)
                        return 0;
@@ -395,13 +387,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
        }
 
        /* If we may be able to merge these biovecs, force a recount */
-       if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
-           BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+       if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
                bio->bi_flags &= ~(1 << BIO_SEG_VALID);
 
        bio->bi_vcnt++;
        bio->bi_phys_segments++;
-       bio->bi_hw_segments++;
  done:
        bio->bi_size += len;
        return len;
@@ -1393,7 +1383,6 @@ EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
 EXPORT_SYMBOL(bio_clone);
 EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
 EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_add_pc_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
index 302e95c..fb98b3d 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
 
 struct fatent_operations {
        void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
        struct fat_entry fatent;
        struct buffer_head *bhs[MAX_BUF_PER_PAGE];
        int i, err, nr_bhs;
+       int first_cl = cluster;
 
        nr_bhs = 0;
        fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
                        goto error;
                }
 
+               /* 
+                * Issue discard for the sectors we no longer care about,
+                * batching contiguous clusters into one request
+                */
+               if (cluster != fatent.entry + 1) {
+                       int nr_clus = fatent.entry - first_cl + 1;
+
+                       sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+                                        nr_clus * sbi->sec_per_clus);
+                       first_cl = cluster;
+               }
+
                ops->ent_put(&fatent, FAT_ENT_FREE);
                if (sbi->free_clusters != -1) {
                        sbi->free_clusters++;
index 55e2fd7..0c0c634 100644 (file)
 
 #ifdef CONFIG_BLOCK
 
-/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
 #include <asm/io.h>
 
-#if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY)
-#define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1))
-#define BIOVEC_VIRT_OVERSIZE(x)        ((x) > BIO_VMERGE_MAX_SIZE)
-#else
-#define BIOVEC_VIRT_START_SIZE(x)      0
-#define BIOVEC_VIRT_OVERSIZE(x)                0
-#endif
-
-#ifndef BIO_VMERGE_BOUNDARY
-#define BIO_VMERGE_BOUNDARY    0
-#endif
-
 #define BIO_DEBUG
 
 #ifdef BIO_DEBUG
@@ -88,25 +75,14 @@ struct bio {
        /* Number of segments in this BIO after
         * physical address coalescing is performed.
         */
-       unsigned short          bi_phys_segments;
-
-       /* Number of segments after physical and DMA remapping
-        * hardware coalescing is performed.
-        */
-       unsigned short          bi_hw_segments;
+       unsigned int            bi_phys_segments;
 
        unsigned int            bi_size;        /* residual I/O count */
 
-       /*
-        * To keep track of the max hw size, we account for the
-        * sizes of the first and last virtually mergeable segments
-        * in this bio
-        */
-       unsigned int            bi_hw_front_size;
-       unsigned int            bi_hw_back_size;
-
        unsigned int            bi_max_vecs;    /* max bvl_vecs we can hold */
 
+       unsigned int            bi_comp_cpu;    /* completion CPU */
+
        struct bio_vec          *bi_io_vec;     /* the actual vec list */
 
        bio_end_io_t            *bi_end_io;
@@ -126,11 +102,12 @@ struct bio {
 #define BIO_UPTODATE   0       /* ok after I/O completion */
 #define BIO_RW_BLOCK   1       /* RW_AHEAD set, and read/write would block */
 #define BIO_EOF                2       /* out-of-bounds error */
-#define BIO_SEG_VALID  3       /* nr_hw_seg valid */
+#define BIO_SEG_VALID  3       /* bi_phys_segments valid */
 #define BIO_CLONED     4       /* doesn't own data */
 #define BIO_BOUNCED    5       /* bio is a bounce bio */
 #define BIO_USER_MAPPED 6      /* contains user pages */
 #define BIO_EOPNOTSUPP 7       /* not supported */
+#define BIO_CPU_AFFINE 8       /* complete bio on same CPU as submitted */
 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
 
 /*
@@ -144,18 +121,29 @@ struct bio {
 /*
  * bio bi_rw flags
  *
- * bit 0 -- read (not set) or write (set)
+ * bit 0 -- data direction
+ *     If not set, bio is a read from device. If set, it's a write to device.
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
+ *     Insert a serialization point in the IO queue, forcing previously
+ *     submitted IO to be completed before this one is issued.
  * bit 3 -- synchronous I/O hint: the block layer will unplug immediately
+ *     Note that this does NOT indicate that the IO itself is sync, just
+ *     that the block layer will not postpone issue of this IO by plugging.
  * bit 4 -- metadata request
+ *     Used for tracing to differentiate metadata and data IO. May also
+ *     get some preferential treatment in the IO scheduler
  * bit 5 -- discard sectors
+ *     Informs the lower level device that this range of sectors is no longer
+ *     used by the file system and may thus be freed by the device. Used
+ *     for flash based storage.
  * bit 6 -- fail fast device errors
  * bit 7 -- fail fast transport errors
  * bit 8 -- fail fast driver errors
+ *     Don't want driver retries for any fast fail whatever the reason.
  */
-#define BIO_RW         0
-#define BIO_RW_AHEAD   1
+#define BIO_RW         0       /* Must match RW in req flags (blkdev.h) */
+#define BIO_RW_AHEAD   1       /* Must match FAILFAST in req flags */
 #define BIO_RW_BARRIER 2
 #define BIO_RW_SYNC    3
 #define BIO_RW_META    4
@@ -195,14 +183,15 @@ struct bio {
 #define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
 #define bio_rw_ahead(bio)      ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
 #define bio_rw_meta(bio)       ((bio)->bi_rw & (1 << BIO_RW_META))
-#define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size)
+#define bio_discard(bio)       ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
+#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
 
 static inline unsigned int bio_cur_sectors(struct bio *bio)
 {
        if (bio->bi_vcnt)
                return bio_iovec(bio)->bv_len >> 9;
-
-       return 0;
+       else /* dataless requests such as discard */
+               return bio->bi_size >> 9;
 }
 
 static inline void *bio_data(struct bio *bio)
@@ -246,8 +235,6 @@ static inline void *bio_data(struct bio *bio)
        ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
 #endif
 
-#define BIOVEC_VIRT_MERGEABLE(vec1, vec2)      \
-       ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
        (((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
@@ -345,7 +332,6 @@ extern void bio_free(struct bio *, struct bio_set *);
 extern void bio_endio(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
-extern int bio_hw_segments(struct request_queue *, struct bio *);
 
 extern void __bio_clone(struct bio *, struct bio *);
 extern struct bio *bio_clone(struct bio *, gfp_t);
@@ -378,6 +364,14 @@ extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 /*
+ * Allow queuer to specify a completion CPU for this bio
+ */
+static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
+{
+       bio->bi_comp_cpu = cpu;
+}
+
+/*
  * bio_set is used to allow other portions of the IO system to
  * allocate their own private memory pools for bio and iovec structures.
  * These memory pools in turn all allocate from the bio_slab
@@ -455,6 +449,14 @@ static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
        __bio_kmap_irq((bio), (bio)->bi_idx, (flags))
 #define bio_kunmap_irq(buf,flags)      __bio_kunmap_irq(buf, flags)
 
+/*
+ * Check whether this bio carries any data or not. A NULL bio is allowed.
+ */
+static inline int bio_has_data(struct bio *bio)
+{
+       return bio && bio->bi_io_vec != NULL;
+}
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 #define bip_vec_idx(bip, idx)  (&(bip->bip_vec[(idx)]))
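
The bi_comp_cpu field, BIO_CPU_AFFINE flag and bio_set_completion_cpu() helper added above let a submitter ask for completion on a particular CPU. A hedged sketch of how a caller might arm the hint; setting the flag by hand here is an assumption about the intended protocol, not something shown in this diff:

#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/smp.h>

/*
 * Illustrative submitter, not part of the patch: request completion on
 * the submitting CPU via bi_comp_cpu, and mark the bio BIO_CPU_AFFINE so
 * the block layer (or QUEUE_FLAG_SAME_COMP) can honour the hint when it
 * picks rq->cpu.
 */
static void example_submit_cpu_affine(int rw, struct bio *bio)
{
	bio_set_completion_cpu(bio, get_cpu());
	bio->bi_flags |= 1 << BIO_CPU_AFFINE;
	put_cpu();
	submit_bio(rw, bio);
}
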
index 518d01d..19d1098 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/bsg.h>
+#include <linux/smp.h>
 
 #include <asm/scatterlist.h>
 
@@ -54,7 +55,6 @@ enum rq_cmd_type_bits {
        REQ_TYPE_PM_SUSPEND,            /* suspend request */
        REQ_TYPE_PM_RESUME,             /* resume request */
        REQ_TYPE_PM_SHUTDOWN,           /* shutdown request */
-       REQ_TYPE_FLUSH,                 /* flush request */
        REQ_TYPE_SPECIAL,               /* driver defined type */
        REQ_TYPE_LINUX_BLOCK,           /* generic block layer message */
        /*
@@ -76,21 +76,20 @@ enum rq_cmd_type_bits {
  *
  */
 enum {
-       /*
-        * just examples for now
-        */
        REQ_LB_OP_EJECT = 0x40,         /* eject request */
-       REQ_LB_OP_FLUSH = 0x41,         /* flush device */
+       REQ_LB_OP_FLUSH = 0x41,         /* flush request */
+       REQ_LB_OP_DISCARD = 0x42,       /* discard sectors */
 };
 
 /*
- * request type modified bits. first three bits match BIO_RW* bits, important
+ * request type modified bits. first two bits match BIO_RW* bits, important
  */
 enum rq_flag_bits {
        __REQ_RW,               /* not set, read. set, write */
        __REQ_FAILFAST_DEV,     /* no driver retries of device errors */
        __REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
        __REQ_FAILFAST_DRIVER,  /* no driver retries of driver errors */
+       __REQ_DISCARD,          /* request to discard sectors */
        __REQ_SORTED,           /* elevator knows about this request */
        __REQ_SOFTBARRIER,      /* may not be passed by ioscheduler */
        __REQ_HARDBARRIER,      /* may not be passed by drive either */
@@ -116,6 +115,7 @@ enum rq_flag_bits {
 #define REQ_FAILFAST_DEV       (1 << __REQ_FAILFAST_DEV)
 #define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT)
 #define REQ_FAILFAST_DRIVER    (1 << __REQ_FAILFAST_DRIVER)
+#define REQ_DISCARD    (1 << __REQ_DISCARD)
 #define REQ_SORTED     (1 << __REQ_SORTED)
 #define REQ_SOFTBARRIER        (1 << __REQ_SOFTBARRIER)
 #define REQ_HARDBARRIER        (1 << __REQ_HARDBARRIER)
@@ -144,7 +144,8 @@ enum rq_flag_bits {
  */
 struct request {
        struct list_head queuelist;
-       struct list_head donelist;
+       struct call_single_data csd;
+       int cpu;
 
        struct request_queue *q;
 
@@ -195,13 +196,6 @@ struct request {
         */
        unsigned short nr_phys_segments;
 
-       /* Number of scatter-gather addr+len pairs after
-        * physical and DMA remapping hardware coalescing is performed.
-        * This is the number of scatter-gather entries the driver
-        * will actually have to deal with after DMA mapping is done.
-        */
-       unsigned short nr_hw_segments;
-
        unsigned short ioprio;
 
        void *special;
@@ -240,6 +234,11 @@ struct request {
        struct request *next_rq;
 };
 
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+       return req->ioprio;
+}
+
 /*
  * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
  * requests. Some step values could eventually be made generic.
@@ -259,6 +258,7 @@ typedef void (request_fn_proc) (struct request_queue *q);
 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unplug_fn) (struct request_queue *);
+typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 struct bvec_merge_data {
@@ -272,6 +272,7 @@ typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
+typedef int (lld_busy_fn) (struct request_queue *q);
 
 enum blk_eh_timer_return {
        BLK_EH_NOT_HANDLED,
@@ -322,11 +323,13 @@ struct request_queue
        make_request_fn         *make_request_fn;
        prep_rq_fn              *prep_rq_fn;
        unplug_fn               *unplug_fn;
+       prepare_discard_fn      *prepare_discard_fn;
        merge_bvec_fn           *merge_bvec_fn;
        prepare_flush_fn        *prepare_flush_fn;
        softirq_done_fn         *softirq_done_fn;
        rq_timed_out_fn         *rq_timed_out_fn;
        dma_drain_needed_fn     *dma_drain_needed;
+       lld_busy_fn             *lld_busy_fn;
 
        /*
         * Dispatch queue sorting
@@ -441,7 +444,8 @@ struct request_queue
 #define QUEUE_FLAG_ELVSWITCH   8       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI                9       /* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES    10      /* disable merge attempts */
-#define QUEUE_FLAG_BUSY               11       /* device/host under queue is busy */
+#define QUEUE_FLAG_SAME_COMP   11      /* force complete on same CPU */
+#define QUEUE_FLAG_STACKABLE   13      /* supports request stacking */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -548,9 +552,8 @@ enum {
 #define blk_queue_stopped(q)   test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_nomerges(q)  test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_flushing(q)  ((q)->ordseq)
-#define blk_lld_busy(q)                test_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
-#define blk_set_lld_busy(q)    set_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
-#define blk_clear_lld_busy(q)  clear_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
+#define blk_queue_stackable(q) \
+       test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 
 #define blk_fs_request(rq)     ((rq)->cmd_type == REQ_TYPE_FS)
 #define blk_pc_request(rq)     ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
@@ -565,16 +568,18 @@ enum {
                                 blk_failfast_driver(rq))
 #define blk_rq_started(rq)     ((rq)->cmd_flags & REQ_STARTED)
 
-#define blk_account_rq(rq)     (blk_rq_started(rq) && blk_fs_request(rq))
+#define blk_account_rq(rq)     (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq))) 
 
 #define blk_pm_suspend_request(rq)     ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
 #define blk_pm_resume_request(rq)      ((rq)->cmd_type == REQ_TYPE_PM_RESUME)
 #define blk_pm_request(rq)     \
        (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
 
+#define blk_rq_cpu_valid(rq)   ((rq)->cpu != -1)
 #define blk_sorted_rq(rq)      ((rq)->cmd_flags & REQ_SORTED)
 #define blk_barrier_rq(rq)     ((rq)->cmd_flags & REQ_HARDBARRIER)
 #define blk_fua_rq(rq)         ((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq)     ((rq)->cmd_flags & REQ_DISCARD)
 #define blk_bidi_rq(rq)                ((rq)->next_rq != NULL)
 #define blk_empty_barrier(rq)  (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
 /* rq->queuelist of dequeued request must be list_empty() */
@@ -621,7 +626,8 @@ static inline void blk_clear_queue_full(struct request_queue *q, int rw)
 #define RQ_NOMERGE_FLAGS       \
        (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
 #define rq_mergeable(rq)       \
-       (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
+       (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
+        (blk_discard_rq(rq) || blk_fs_request((rq))))
 
 /*
  * q->prep_rq_fn return values
@@ -693,7 +699,10 @@ extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
-extern void blk_submit_request(struct request_queue *q, struct request *rq);
+extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
+extern int blk_lld_busy(struct request_queue *q);
+extern int blk_insert_cloned_request(struct request_queue *q,
+                                    struct request *rq);
 extern void blk_plug_device(struct request_queue *);
 extern void blk_plug_device_unlocked(struct request_queue *);
 extern int blk_remove_plug(struct request_queue *);
@@ -825,12 +834,14 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
                               dma_drain_needed_fn *dma_drain_needed,
                               void *buf, unsigned int size);
+extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
@@ -874,6 +885,16 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 }
 
 extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+                               unsigned nr_sects);
+
+static inline int sb_issue_discard(struct super_block *sb,
+                                  sector_t block, unsigned nr_blocks)
+{
+       block <<= (sb->s_blocksize_bits - 9);
+       nr_blocks <<= (sb->s_blocksize_bits - 9);
+       return blkdev_issue_discard(sb->s_bdev, block, nr_blocks);
+}
 
 /*
 * command filter functions
@@ -937,7 +958,7 @@ static inline void put_dev_sector(Sector p)
 }
 
 struct work_struct;
-int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 void kblockd_flush_work(struct work_struct *work);
 
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
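
blk_lld_busy(), blk_rq_check_limits() and blk_insert_cloned_request() declared above form the interface a request-stacking driver uses to push an already prepared clone to a lower queue. A hedged sketch of that call sequence; the function name and error handling are illustrative only, the real user is request-based dm in this series:

#include <linux/blkdev.h>
#include <linux/errno.h>

/*
 * Check the lower device's busy state, validate the clone against the
 * lower queue's limits, then insert it directly.
 */
static int example_dispatch_clone(struct request_queue *lower_q,
				  struct request *clone)
{
	if (blk_lld_busy(lower_q))
		return -EBUSY;	/* caller should requeue and retry later */

	if (blk_rq_check_limits(lower_q, clone))
		return -EIO;	/* clone exceeds the lower queue's limits */

	return blk_insert_cloned_request(lower_q, clone);
}
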
index d40178b..fcc4f5e 100644 (file)
@@ -23,7 +23,8 @@ enum blktrace_cat {
        BLK_TC_NOTIFY   = 1 << 10,      /* special message */
        BLK_TC_AHEAD    = 1 << 11,      /* readahead */
        BLK_TC_META     = 1 << 12,      /* metadata */
-       BLK_TC_DRV_DATA = 1 << 13,      /* binary per-driver data */
+       BLK_TC_DISCARD  = 1 << 13,      /* discard requests */
+       BLK_TC_DRV_DATA = 1 << 14,      /* binary per-driver data */
 
        BLK_TC_END      = 1 << 15,      /* only 16-bits, reminder */
 };
@@ -83,7 +84,7 @@ enum blktrace_notify {
 #define BLK_TA_BOUNCE          (__BLK_TA_BOUNCE)
 #define BLK_TA_REMAP           (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
 #define BLK_TA_ABORT           (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
-#define BLK_TA_DRV_DATA                (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
+#define BLK_TA_DRV_DATA        (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
 
 #define BLK_TN_PROCESS         (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
 #define BLK_TN_TIMESTAMP       (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
@@ -206,6 +207,9 @@ static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
        if (likely(!bt))
                return;
 
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
        if (blk_pc_request(rq)) {
                what |= BLK_TC_ACT(BLK_TC_PC);
                __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
index befedb2..6b3be1a 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 
-struct request;
 struct dm_target;
 struct dm_table;
 struct dm_dev;
@@ -47,7 +46,6 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
  */
 typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
                          union map_info *map_context);
-
 typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
                                  union map_info *map_context);
 
@@ -62,7 +60,6 @@ typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
 typedef int (*dm_endio_fn) (struct dm_target *ti,
                            struct bio *bio, int error,
                            union map_info *map_context);
-
 typedef int (*dm_request_endio_fn) (struct dm_target *ti,
                                    struct request *clone, int error,
                                    union map_info *map_context);
@@ -81,11 +78,17 @@ typedef int (*dm_message_fn) (struct dm_target *ti, unsigned argc, char **argv);
 typedef int (*dm_ioctl_fn) (struct dm_target *ti, struct inode *inode,
                            struct file *filp, unsigned int cmd,
                            unsigned long arg);
-typedef int (*dm_congested_fn) (struct dm_target *ti);
 
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
                            struct bio_vec *biovec, int max_size);
 
+/*
+ * Returns:
+ *    0: The target can handle the next I/O immediately.
+ *    1: The target can't handle the next I/O immediately.
+ */
+typedef int (*dm_busy_fn) (struct dm_target *ti);
+
 void dm_error(const char *message);
 
 /*
@@ -124,7 +127,7 @@ struct target_type {
        dm_message_fn message;
        dm_ioctl_fn ioctl;
        dm_merge_fn merge;
-       dm_congested_fn congested;
+       dm_busy_fn busy;
 };
 
 struct io_restrictions {
@@ -137,7 +140,7 @@ struct io_restrictions {
        unsigned short max_hw_segments;
        unsigned short max_phys_segments;
        unsigned char no_cluster; /* inverted so that 0 is default */
-       unsigned char no_request_stacking; /* inverted so that 0 is default */
+       unsigned char no_request_stacking;
 };
 
 struct dm_target {
@@ -214,7 +217,6 @@ const char *dm_device_name(struct mapped_device *md);
 int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
 struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
-int dm_request_based(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 
 /*
@@ -362,4 +364,12 @@ static inline unsigned long to_bytes(sector_t n)
        return (n << SECTOR_SHIFT);
 }
 
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_request(struct request *rq);
+void dm_kill_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
+
 #endif /* _LINUX_DEVICE_MAPPER_H */
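
The dm_busy_fn typedef and the busy member replacing congested in struct target_type give request-based targets a way to report transient congestion. A hedged sketch of where the hook sits in a target declaration; the "example" names are placeholders and the mandatory ctr/dtr/map hooks are omitted for brevity:

#include <linux/device-mapper.h>
#include <linux/module.h>

/* Return 0 if the target can take the next I/O now, 1 if it is busy. */
static int example_busy(struct dm_target *ti)
{
	return 0;
}

static struct target_type example_target = {
	.name    = "example",
	.version = {1, 0, 0},
	.module  = THIS_MODULE,
	.busy    = example_busy,
};
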
index dca63d6..92f6f63 100644 (file)
@@ -174,15 +174,15 @@ enum {
 #define rb_entry_rq(node)      rb_entry((node), struct request, rb_node)
 
 /*
- * Hack to reuse the donelist list_head as the fifo time holder while
+ * Hack to reuse the csd.list list_head as the fifo time holder while
  * the request is in the io scheduler. Saves an unsigned long in rq.
  */
-#define rq_fifo_time(rq)       ((unsigned long) (rq)->donelist.next)
-#define rq_set_fifo_time(rq,exp)       ((rq)->donelist.next = (void *) (exp))
+#define rq_fifo_time(rq)       ((unsigned long) (rq)->csd.list.next)
+#define rq_set_fifo_time(rq,exp)       ((rq)->csd.list.next = (void *) (exp))
 #define rq_entry_fifo(ptr)     list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)      do {            \
        list_del_init(&(rq)->queuelist);        \
-       INIT_LIST_HEAD(&(rq)->donelist);        \
+       INIT_LIST_HEAD(&(rq)->csd.list);        \
        } while (0)
 
 /*
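
With struct request's donelist gone, the fifo-time macros above are rebased onto csd.list. A hedged sketch of how an I/O scheduler uses them, patterned on the deadline scheduler; the example_ names are hypothetical:

#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>

/* Stamp a request with its FIFO expiry when it is inserted. */
static void example_stamp_fifo(struct request *rq, unsigned long fifo_expire)
{
	rq_set_fifo_time(rq, jiffies + fifo_expire);
}

/* Test the stamp when deciding whether to force FIFO dispatch. */
static int example_fifo_expired(struct request *rq)
{
	return time_after_eq(jiffies, rq_fifo_time(rq));
}
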
index dc1ce6c..c278993 100644 (file)
@@ -86,7 +86,9 @@ extern int dir_notify_enable;
 #define READ_META      (READ | (1 << BIO_RW_META))
 #define WRITE_SYNC     (WRITE | (1 << BIO_RW_SYNC))
 #define SWRITE_SYNC    (SWRITE | (1 << BIO_RW_SYNC))
-#define WRITE_BARRIER  ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
+#define WRITE_BARRIER  (WRITE | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
+#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN         1
 #define SEL_OUT                2
@@ -224,6 +226,7 @@ extern int dir_notify_enable;
 #define BLKTRACESTART _IO(0x12,116)
 #define BLKTRACESTOP _IO(0x12,117)
 #define BLKTRACETEARDOWN _IO(0x12,118)
+#define BLKDISCARD _IO(0x12,119)
 
 #define BMAP_IOCTL 1           /* obsolete - kept for compatibility */
 #define FIBMAP    _IO(0x00,1)  /* bmap access */
index 310e616..8b4aa05 100644 (file)
@@ -41,6 +41,8 @@ struct mtd_blktrans_ops {
                    unsigned long block, char *buffer);
        int (*writesect)(struct mtd_blktrans_dev *dev,
                     unsigned long block, char *buffer);
+       int (*discard)(struct mtd_blktrans_dev *dev,
+                      unsigned long block, unsigned nr_blocks);
 
        /* Block layer ioctls */
        int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);
index fa7a6a2..e46690a 100644 (file)
@@ -251,10 +251,9 @@ print_request(unsigned long addr)
                        rq.errors,
                        (unsigned long long)rq.sector, rq.nr_sectors);
 
-       kdb_printf("  hsect %llu hnrsect %lu nrseg %u nrhwseg %u currnrsect %u\n",
+       kdb_printf("  hsect %llu hnrsect %lu nrseg %u currnrsect %u\n",
                        (unsigned long long)rq.hard_sector, rq.hard_nr_sectors,
-                       rq.nr_phys_segments, rq.nr_hw_segments,
-                       rq.current_nr_sectors);
+                       rq.nr_phys_segments, rq.current_nr_sectors);
 
        return (unsigned long) rq.queuelist.next;
 }
index b6d2d0f..06722c4 100644 (file)
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
        /*
         * Data-less bio, nothing to bounce
         */
-       if (bio_empty_barrier(*bio_orig))
+       if (!bio_has_data(*bio_orig))
                return;
 
        /*