!Eblock/blk-barrier.c
!Eblock/blk-tag.c
!Iblock/blk-tag.c
+!Eblock/blk-integrity.c
+!Iblock/blktrace.c
+!Iblock/genhd.c
+!Eblock/genhd.c
</chapter>
<chapter id="chrdev">
Similar to read_expire mentioned above, but for writes.
-fifo_batch
+fifo_batch (number of requests)
----------
-When a read request expires its deadline, we must move some requests from
-the sorted io scheduler list to the block device dispatch queue. fifo_batch
-controls how many requests we move.
+Requests are grouped into ``batches'' of a particular data direction (read or
+write) which are serviced in increasing sector order. To limit extra seeking,
+deadline expiries are only checked between batches. fifo_batch controls the
+maximum number of requests per batch.
+
+This parameter tunes the balance between per-request latency and aggregate
+throughput. When low latency is the primary concern, smaller is better (where
+a value of 1 yields first-come first-served behaviour). Increasing fifo_batch
+generally improves throughput, at the cost of latency variation.
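+
For illustration only (the device name and the value below are assumptions of this
sketch, not recommendations), fifo_batch can be adjusted at runtime through the
deadline scheduler's sysfs directory, for example from a small C helper:

	/*
	 * Rough sketch: write a new fifo_batch value for /dev/sda, assuming
	 * the deadline scheduler is active on that device.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/block/sda/queue/iosched/fifo_batch";
		const char *val = "32\n";	/* larger batch: favour throughput */
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, val, strlen(val)) < 0)
			perror("write");
		close(fd);
		return 0;
	}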
writes_starved (number of dispatches)
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-timeout.o ioctl.o genhd.o \
- scsi_ioctl.o cmd-filter.o
+ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+ ioctl.o genhd.o scsi_ioctl.o cmd-filter.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
del_timer(&ad->antic_timer);
ad->antic_status = ANTIC_FINISHED;
/* see as_work_handler */
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(ad->q, &ad->antic_work);
}
}
aic = ad->io_context->aic;
ad->antic_status = ANTIC_FINISHED;
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(q, &ad->antic_work);
if (aic->ttime_samples == 0) {
/* process anticipated on has exited or timed out */
if (ad->changed_batch && ad->nr_dispatched == 1) {
ad->current_batch_expires = jiffies +
ad->batch_expire[ad->batch_data_dir];
- kblockd_schedule_work(&ad->antic_work);
+ kblockd_schedule_work(q, &ad->antic_work);
ad->changed_batch = 0;
if (ad->batch_data_dir == REQ_SYNC)
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+
+ bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ *
+ * Description:
+ * Issue a discard request for the sectors in question. Does not wait.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ unsigned nr_sects)
+{
+ struct request_queue *q;
+ struct bio *bio;
+ int ret = 0;
+
+ if (bdev->bd_disk == NULL)
+ return -ENXIO;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return -ENXIO;
+
+ if (!q->prepare_discard_fn)
+ return -EOPNOTSUPP;
+
+ while (nr_sects && !ret) {
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = blkdev_discard_end_io;
+ bio->bi_bdev = bdev;
+
+ bio->bi_sector = sector;
+
+ if (nr_sects > q->max_hw_sectors) {
+ bio->bi_size = q->max_hw_sectors << 9;
+ nr_sects -= q->max_hw_sectors;
+ sector += q->max_hw_sectors;
+ } else {
+ bio->bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+ bio_get(bio);
+ submit_bio(DISCARD_BARRIER, bio);
+
+ /* Check if it failed immediately */
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ else if (!bio_flagged(bio, BIO_UPTODATE))
+ ret = -EIO;
+ bio_put(bio);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
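
A minimal usage sketch (the caller and extent are hypothetical; only
blkdev_issue_discard() itself comes from the code above): a filesystem that has
just freed an extent can ask the device to discard it, ignoring the case where
the queue does not support discards.

	/* Sketch, not part of the patch: issue an asynchronous discard. */
	#include <linux/blkdev.h>

	static void example_discard_extent(struct block_device *bdev,
					   sector_t start, unsigned nr_sects)
	{
		int err = blkdev_issue_discard(bdev, start, nr_sects);

		if (err && err != -EOPNOTSUPP)
			printk(KERN_WARNING "discard of %u sectors failed: %d\n",
			       nr_sects, err);
	}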
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
*/
static struct workqueue_struct *kblockd_workqueue;
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
static void drive_stat_acct(struct request *rq, int new_io)
{
struct hd_struct *part;
memset(rq, 0, sizeof(*rq));
INIT_LIST_HEAD(&rq->queuelist);
- INIT_LIST_HEAD(&rq->donelist);
INIT_LIST_HEAD(&rq->timeout_list);
+ rq->cpu = -1;
rq->q = q;
rq->sector = rq->hard_sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash);
blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
- kblockd_schedule_work(&q->unplug_work);
+ kblockd_schedule_work(q, &q->unplug_work);
}
void blk_unplug(struct request_queue *q)
}
EXPORT_SYMBOL(blk_unplug);
+static void blk_invoke_request_fn(struct request_queue *q)
+{
+ /*
+ * one level of recursion is ok and is much faster than kicking
+ * the unplug handling
+ */
+ if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+ q->request_fn(q);
+ queue_flag_clear(QUEUE_FLAG_REENTER, q);
+ } else {
+ queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+ kblockd_schedule_work(q, &q->unplug_work);
+ }
+}
+
/**
* blk_start_queue - restart a previously stopped queue
* @q: The &struct request_queue in question
WARN_ON(!irqs_disabled());
queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-
- /*
- * one level of recursion is ok and is much faster than kicking
- * the unplug handling
- */
- if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
- q->request_fn(q);
- queue_flag_clear(QUEUE_FLAG_REENTER, q);
- } else {
- blk_plug_device(q);
- kblockd_schedule_work(&q->unplug_work);
- }
+ blk_invoke_request_fn(q);
}
EXPORT_SYMBOL(blk_start_queue);
* Only recurse once to avoid overrunning the stack, let the unplug
* handling reinvoke the handler shortly if we already got there.
*/
- if (!elv_queue_empty(q)) {
- if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
- q->request_fn(q);
- queue_flag_clear(QUEUE_FLAG_REENTER, q);
- } else {
- blk_plug_device(q);
- kblockd_schedule_work(&q->unplug_work);
- }
- }
+ if (!elv_queue_empty(q))
+ blk_invoke_request_fn(q);
}
EXPORT_SYMBOL(__blk_run_queue);
void blk_cleanup_queue(struct request_queue *q)
{
+ /*
+ * We know we have process context here, so we can be a little
+ * cautious and ensure that pending block actions on this device
+ * are done before moving on. Going into this function, we should
+ * not have processes doing IO to this device.
+ */
+ blk_sync_queue(q);
+
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
* request queue; this lock will be taken also from interrupt context, so irq
* disabling is needed for it.
*
- * Function returns a pointer to the initialized request queue, or NULL if
+ * Function returns a pointer to the initialized request queue, or %NULL if
* it didn't succeed.
*
* Note:
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unplug_fn = generic_unplug_device;
- q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
+ q->queue_flags = (1 << QUEUE_FLAG_CLUSTER |
+ 1 << QUEUE_FLAG_STACKABLE);
q->queue_lock = lock;
blk_queue_segment_boundary(q, 0xffffffff);
blk_rq_init(q, rq);
- /*
- * first three bits are identical in rq->cmd_flags and bio->bi_rw,
- * see bio.h and blkdev.h
- */
rq->cmd_flags = rw | REQ_ALLOCED;
if (priv) {
EXPORT_SYMBOL(blk_requeue_request);
/**
- * blk_insert_request - insert a special request in to a request queue
+ * blk_insert_request - insert a special request into a request queue
* @q: request queue where request should be inserted
* @rq: request to be inserted
* @at_head: insert request at head or tail of queue
* Many block devices need to execute commands asynchronously, so they don't
* block the whole kernel from preemption during request execution. This is
* accomplished normally by inserting artificial requests tagged as
- * REQ_SPECIAL in to the corresponding request queue, and letting them be
- * scheduled for actual execution by the request queue.
+ * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
+ * be scheduled for actual execution by the request queue.
*
* We have the option of inserting the head or the tail of the queue.
* Typically we use the tail for new ioctls and so forth. We use the head
void init_request_from_bio(struct request *req, struct bio *bio)
{
+ req->cpu = bio->bi_comp_cpu;
req->cmd_type = REQ_TYPE_FS;
/*
/*
* REQ_BARRIER implies no merging, but lets make it explicit
*/
- if (unlikely(bio_barrier(bio)))
+ if (unlikely(bio_discard(bio))) {
+ req->cmd_flags |= REQ_DISCARD;
+ if (bio_barrier(bio))
+ req->cmd_flags |= REQ_SOFTBARRIER;
+ req->q->prepare_discard_fn(req->q, req);
+ } else if (unlikely(bio_barrier(bio)))
req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
if (bio_sync(bio))
static int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
- int el_ret, nr_sectors, barrier, err;
+ int el_ret, nr_sectors, barrier, discard, err;
const unsigned short prio = bio_prio(bio);
const int sync = bio_sync(bio);
int rw_flags;
blk_queue_bounce(q, &bio);
barrier = bio_barrier(bio);
- if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
+ if (unlikely(barrier) && bio_has_data(bio) &&
+ (q->next_ordered == QUEUE_ORDERED_NONE)) {
+ err = -EOPNOTSUPP;
+ goto end_io;
+ }
+
+ discard = bio_discard(bio);
+ if (unlikely(discard) && !q->prepare_discard_fn) {
err = -EOPNOTSUPP;
goto end_io;
}
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
+ if (!blk_rq_cpu_valid(req))
+ req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
req->sector = req->hard_sector = bio->bi_sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
+ if (!blk_rq_cpu_valid(req))
+ req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
init_request_from_bio(req, bio);
spin_lock_irq(q->queue_lock);
+ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
+ bio_flagged(bio, BIO_CPU_AFFINE))
+ req->cpu = blk_cpu_to_group(smp_processor_id());
if (elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
if (sync)
__generic_unplug_device(q);
-
spin_unlock_irq(q->queue_lock);
return 0;
}
/**
- * generic_make_request: hand a buffer to its device driver for I/O
+ * generic_make_request - hand a buffer to its device driver for I/O
* @bio: The bio describing the location in memory and on the device.
*
* generic_make_request() is used to make I/O requests of block
if (bio_check_eod(bio, nr_sectors))
goto end_io;
- if (bio_empty_barrier(bio) && !q->prepare_flush_fn) {
+ if ((bio_empty_barrier(bio) && !q->prepare_flush_fn) ||
+ (bio_discard(bio) && !q->prepare_discard_fn)) {
err = -EOPNOTSUPP;
goto end_io;
}
EXPORT_SYMBOL(generic_make_request);
/**
- * submit_bio: submit a bio to the block device layer for I/O
+ * submit_bio - submit a bio to the block device layer for I/O
* @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
* @bio: The &struct bio which describes the I/O
*
* submit_bio() is very similar in purpose to generic_make_request(), and
* uses that function to do most of the work. Both are fairly rough
- * interfaces, @bio must be presetup and ready for I/O.
+ * interfaces; @bio must be presetup and ready for I/O.
*
*/
void submit_bio(int rw, struct bio *bio)
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
- if (!bio_empty_barrier(bio)) {
-
- BIO_BUG_ON(!bio->bi_size);
- BIO_BUG_ON(!bio->bi_io_vec);
-
+ if (bio_has_data(bio)) {
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
}
EXPORT_SYMBOL(submit_bio);
-/*
- * Check a request for queue limits
+/**
+ * blk_rq_check_limits - Helper function to check a request for the queue limit
+ * @q: the queue
+ * @rq: the request being checked
+ *
+ * Description:
+ * @rq may have been made based on weaker limitations of upper-level queues
+ * in request stacking drivers, and it may violate the limitation of @q.
+ * Since the block layer and the underlying device driver trust @rq
+ * after it is inserted into @q, it should be checked against @q before
+ * the insertion using this generic function.
+ *
+ * This function should also be useful for request stacking drivers
+ * in some cases below, so export this function.
+ * Request stacking drivers like request-based dm may change the queue
+ * limits while requests are in the queue (e.g. dm's table swapping).
+ * Such request stacking drivers should check those requests against
+ * the new queue limits again when they dispatch those requests,
+ * although such checks are also done against the old queue limits
+ * when submitting requests.
*/
-static int check_queue_limit(struct request_queue *q, struct request *rq)
+int blk_rq_check_limits(struct request_queue *q, struct request *rq)
{
if (rq->nr_sectors > q->max_sectors ||
- rq->data_len >> 9 > q->max_hw_sectors) {
+ rq->data_len > q->max_hw_sectors << 9) {
printk(KERN_ERR "%s: over max size limit.\n", __func__);
- return 1;
+ return -EIO;
}
/*
*/
blk_recalc_rq_segments(rq);
if (rq->nr_phys_segments > q->max_phys_segments ||
- rq->nr_hw_segments > q->max_hw_segments) {
+ rq->nr_phys_segments > q->max_hw_segments) {
printk(KERN_ERR "%s: over max segments limit.\n", __func__);
- return 1;
+ return -EIO;
}
return 0;
}
+EXPORT_SYMBOL_GPL(blk_rq_check_limits);
/**
- * blk_submit_request - Helper for stacking drivers to submit the request
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
* @q: the queue to submit the request
* @rq: the request being queued
- **/
-void blk_submit_request(struct request_queue *q, struct request *rq)
+ */
+int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{
unsigned long flags;
- if (check_queue_limit(q, rq))
- goto end_io;
+ if (blk_rq_check_limits(q, rq))
+ return -EIO;
#ifdef CONFIG_FAIL_MAKE_REQUEST
- if (rq->rq_disk && rq->rq_disk->flags & GENHD_FL_FAIL &&
+ if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
should_fail(&fail_make_request, blk_rq_bytes(rq)))
- goto end_io;
+ return -EIO;
#endif
spin_lock_irqsave(q->queue_lock, flags);
spin_unlock_irqrestore(q->queue_lock, flags);
- return;
-
-end_io:
- blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+ return 0;
}
-EXPORT_SYMBOL_GPL(blk_submit_request);
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
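
A hypothetical dispatch path of a request-based stacking driver (names are
illustrative, not from the patch) would hand a prepared clone to the lower
queue and let blk_insert_cloned_request() re-validate it via
blk_rq_check_limits():

	/* Sketch: dispatch a cloned request to an underlying queue. */
	#include <linux/blkdev.h>

	static int example_dispatch_clone(struct request_queue *lower_q,
					  struct request *clone)
	{
		int err = blk_insert_cloned_request(lower_q, clone);

		if (err)
			/* over the lower queue's limits: the caller should
			 * requeue or fail the original request */
			return err;

		return 0;
	}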
/**
* __end_that_request_first - end I/O on a request
* @req: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
* for the next range of segments (if any) in the cluster.
*
* Return:
- * 0 - we are done with this request, call end_that_request_last()
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request, call end_that_request_last()
+ * %1 - still buffers pending for this request
**/
static int __end_that_request_first(struct request *req, int error,
int nr_bytes)
blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
/*
- * for a REQ_BLOCK_PC request, we want to carry any eventual
+ * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
* sense key with us all the way through
*/
if (!blk_pc_request(req))
}
/*
- * splice the completion data to a local structure and hand off to
- * process_completion_queue() to complete the requests
- */
-static void blk_done_softirq(struct softirq_action *h)
-{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, donelist);
- list_del_init(&rq->donelist);
- rq->q->softirq_done_fn(rq);
- }
-}
-
-static int __cpuinit blk_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
- int cpu = (unsigned long) hcpu;
-
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- &__get_cpu_var(blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
- }
-
- return NOTIFY_OK;
-}
-
-
-static struct notifier_block blk_cpu_notifier __cpuinitdata = {
- .notifier_call = blk_cpu_notify,
-};
-
-/**
- * blk_complete_request - end I/O on a request
- * @req: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions,
- * unless the driver actually implements this in its completion callback
- * through requeueing. The actual completion happens out-of-order,
- * through a softirq handler. The user must have registered a completion
- * callback through blk_queue_softirq_done().
- **/
-
-void __blk_complete_request(struct request *req)
-{
- struct list_head *cpu_list;
- unsigned long flags;
-
- BUG_ON(!req->q->softirq_done_fn);
-
- local_irq_save(flags);
-
- cpu_list = &__get_cpu_var(blk_cpu_done);
- list_add_tail(&req->donelist, cpu_list);
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
- local_irq_restore(flags);
-}
-
-/**
- * blk_complete_request - end I/O on a request
- * @req: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions,
- * unless the driver actually implements this in its completion callback
- * through requeueing. The actual completion happens out-of-order,
- * through a softirq handler. The user must have registered a completion
- * callback through blk_queue_softirq_done().
- **/
-
-void blk_complete_request(struct request *req)
-{
- if (!blk_mark_rq_complete(req))
- __blk_complete_request(req);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
-/*
* queue lock must be held
*/
static void end_that_request_last(struct request *req, int error)
/**
* end_queued_request - end all I/O on a queued request
* @rq: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
+ * @uptodate: error value or %0/%1 uptodate flag
*
* Description:
* Ends all I/O on a request, and removes it from the block layer queues.
- * Not suitable for normal IO completion, unless the driver still has
+ * Not suitable for normal I/O completion, unless the driver still has
* the request attached to the block layer.
*
**/
/**
* end_dequeued_request - end all I/O on a dequeued request
* @rq: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
+ * @uptodate: error value or %0/%1 uptodate flag
*
* Description:
* Ends all I/O on a request. The request must already have been
/**
* end_request - end I/O on the current segment of the request
* @req: the request being processed
- * @uptodate: error value or 0/1 uptodate flag
+ * @uptodate: error value or %0/%1 uptodate flag
*
* Description:
* Ends I/O on the current segment of a request. If that is the only
* remaining segment, the request is also completed and freed.
*
- * This is a remnant of how older block drivers handled IO completions.
- * Modern drivers typically end IO on the full request in one go, unless
+ * This is a remnant of how older block drivers handled I/O completions.
+ * Modern drivers typically end I/O on the full request in one go, unless
* they have a residual value to account for. For that case this function
* isn't really useful, unless the residual just happens to be the
* full current segment. In other words, don't use this function in new
static int end_that_request_data(struct request *rq, int error,
unsigned int nr_bytes, unsigned int bidi_bytes)
{
- if (blk_fs_request(rq) || blk_pc_request(rq)) {
+ if (rq->bio) {
if (__end_that_request_first(rq, error, nr_bytes))
return 1;
/**
* blk_end_io - Generic end_io function to complete a request.
* @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete @rq
* @bidi_bytes: number of bytes to complete @rq->next_rq
* @drv_callback: function called between completion of bios in the request
* and completion of the request.
- * If the callback returns non 0, this helper returns without
+ * If the callback returns non %0, this helper returns without
* completion of the request.
*
* Description:
* If @rq has leftover, sets it up for the next range of segments.
*
* Return:
- * 0 - we are done with this request
- * 1 - this request is not freed yet, it still has pending buffers.
+ * %0 - we are done with this request
+ * %1 - this request is not freed yet, it still has pending buffers.
**/
static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
unsigned int bidi_bytes,
/**
* blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
* If @rq has leftover, sets it up for the next range of segments.
*
* Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
**/
int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
/**
* __blk_end_request - Helper function for drivers to complete the request.
* @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete
*
* Description:
* Must be called with queue lock held unlike blk_end_request().
*
* Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
**/
int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
{
- if (blk_fs_request(rq) || blk_pc_request(rq)) {
- if (__end_that_request_first(rq, error, nr_bytes))
- return 1;
- }
+ if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
+ return 1;
add_disk_randomness(rq->rq_disk);
/**
* blk_end_bidi_request - Helper function for drivers to complete bidi request.
* @rq: the bidi request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete @rq
* @bidi_bytes: number of bytes to complete @rq->next_rq
*
* Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
*
* Return:
- * 0 - we are done with this request
- * 1 - still buffers pending for this request
+ * %0 - we are done with this request
+ * %1 - still buffers pending for this request
**/
int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
unsigned int bidi_bytes)
/**
* blk_update_request - Special helper function for request stacking drivers
* @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete @rq
*
* Description:
* This special helper function is only for request stacking drivers
* (e.g. request-based dm) so that they can handle partial completion.
* Actual device drivers should use blk_end_request instead.
- **/
+ */
void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
{
if (!end_that_request_data(rq, error, nr_bytes, 0)) {
/*
- * All bios in the request have been completed.
- * Then, members of the request are not updated.
- * Update those members to avoid double charge of diskstat
- * when the stacking driver calls blk_end_request()
- * to complete the request actually.
+ * These members are not updated in end_that_request_data()
+ * when all bios are completed.
+ * Update them so that the request stacking driver can find
+ * how many bytes remain in the request later.
*/
rq->nr_sectors = rq->hard_nr_sectors = 0;
rq->current_nr_sectors = rq->hard_cur_sectors = 0;
- rq->nr_phys_segments = rq->nr_hw_segments = 0;
}
}
EXPORT_SYMBOL_GPL(blk_update_request);
/**
* blk_end_request_callback - Special helper function for tricky drivers
* @rq: the request being processed
- * @error: 0 for success, < 0 for error
+ * @error: %0 for success, < %0 for error
* @nr_bytes: number of bytes to complete
* @drv_callback: function called between completion of bios in the request
* and completion of the request.
- * If the callback returns non 0, this helper returns without
+ * If the callback returns non %0, this helper returns without
* completion of the request.
*
* Description:
* Don't use this interface in other places anymore.
*
* Return:
- * 0 - we are done with this request
- * 1 - this request is not freed yet.
- * this request still has pending buffers or
- * the driver doesn't want to finish this request yet.
+ * %0 - we are done with this request
+ * %1 - this request is not freed yet.
+ * this request still has pending buffers or
+ * the driver doesn't want to finish this request yet.
**/
int blk_end_request_callback(struct request *rq, int error,
unsigned int nr_bytes,
void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
+ /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
+ we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
rq->cmd_flags |= (bio->bi_rw & 3);
- rq->nr_phys_segments = bio_phys_segments(q, bio);
- rq->nr_hw_segments = bio_hw_segments(q, bio);
+ if (bio_has_data(bio)) {
+ rq->nr_phys_segments = bio_phys_segments(q, bio);
+ rq->buffer = bio_data(bio);
+ }
rq->current_nr_sectors = bio_cur_sectors(bio);
rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
- rq->buffer = bio_data(bio);
rq->data_len = bio->bi_size;
rq->bio = rq->biotail = bio;
rq->rq_disk = bio->bi_bdev->bd_disk;
}
-int kblockd_schedule_work(struct work_struct *work)
+/**
+ * blk_lld_busy - Check if underlying low-level drivers of a device are busy
+ * @q : the queue of the device being checked
+ *
+ * Description:
+ * Check if underlying low-level drivers of a device are busy.
+ * If the drivers want to export their busy state, they must set own
+ * exporting function using blk_queue_lld_busy() first.
+ *
+ * Basically, this function is used only by request stacking drivers
+ * to stop dispatching requests to underlying devices when underlying
+ * devices are busy. This behavior allows more I/O merging on the queue
+ * of the request stacking driver and prevents I/O throughput regressions
+ * under bursty I/O load.
+ *
+ * Return:
+ * 0 - Not busy (The request stacking driver should dispatch request)
+ * 1 - Busy (The request stacking driver should stop dispatching request)
+ */
+int blk_lld_busy(struct request_queue *q)
+{
+ if (q->lld_busy_fn)
+ return q->lld_busy_fn(q);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_lld_busy);
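
A sketch of how a stacking driver might consult blk_lld_busy() (passing
lower_q explicitly is an illustration; a real request_fn only receives its own
queue):

	/* Sketch: back off dispatch while the underlying device is busy,
	 * so further requests keep merging on the stacking queue. */
	#include <linux/blkdev.h>

	static void example_dispatch(struct request_queue *q,
				     struct request_queue *lower_q)
	{
		struct request *rq;

		while ((rq = elv_next_request(q)) != NULL) {
			if (blk_lld_busy(lower_q))
				break;		/* try again on the next unplug */

			blkdev_dequeue_request(rq);
			/* ... clone rq and hand it to lower_q here ... */
		}
	}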
+
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
}
int __init blk_dev_init(void)
{
- int i;
-
kblockd_workqueue = create_workqueue("kblockd");
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("blkdev_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- register_hotcpu_notifier(&blk_cpu_notifier);
-
return 0;
}
/**
* blk_end_sync_rq - executes a completion event on a request
* @rq: request to complete
- * @error: end io status of the request
+ * @error: end I/O status of the request
*/
static void blk_end_sync_rq(struct request *rq, int error)
{
* @done: I/O completion handler
*
* Description:
- * Insert a fully prepared request at the back of the io scheduler queue
+ * Insert a fully prepared request at the back of the I/O scheduler queue
* for execution. Don't wait for completion.
*/
void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
* @at_head: insert request at head or tail of queue
*
* Description:
- * Insert a fully prepared request at the back of the io scheduler queue
+ * Insert a fully prepared request at the back of the I/O scheduler queue
* for execution and wait for completion.
*/
int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
/**
* blk_integrity_compare - Compare integrity profile of two block devices
- * @b1: Device to compare
- * @b2: Device to compare
+ * @bd1: Device to compare
+ * @bd2: Device to compare
*
* Description: Meta-devices like DM and MD need to verify that all
* sub-devices use the same integrity format before advertising to
}
/**
- * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request structure to fill
* @ubuf: the user buffer
* @len: length of user data
*
* Description:
- * Data will be mapped directly for zero copy io, if possible. Otherwise
+ * Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
- * A matching blk_rq_unmap_user() must be issued at the end of io, while
+ * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
EXPORT_SYMBOL(blk_rq_map_user);
/**
- * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to map data to
* @iov: pointer to the iovec
* @len: I/O byte count
*
* Description:
- * Data will be mapped directly for zero copy io, if possible. Otherwise
+ * Data will be mapped directly for zero copy I/O, if possible. Otherwise
* a kernel bounce buffer is used.
*
- * A matching blk_rq_unmap_user() must be issued at the end of io, while
+ * A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
*
* Note: The mapped bio may need to be bounced through blk_queue_bounce()
* Description:
* Unmap a rq previously mapped by blk_rq_map_user(). The caller must
* supply the original rq->bio from the blk_rq_map_user() return, since
- * the io completion may have changed rq->bio.
+ * the I/O completion may have changed rq->bio.
*/
int blk_rq_unmap_user(struct bio *bio)
{
EXPORT_SYMBOL(blk_rq_unmap_user);
/**
- * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
+ * blk_rq_map_kern - map kernel data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
* @rq: request to fill
* @kbuf: the kernel buffer
void blk_recalc_rq_sectors(struct request *rq, int nsect)
{
- if (blk_fs_request(rq)) {
+ if (blk_fs_request(rq) || blk_discard_rq(rq)) {
rq->hard_sector += nsect;
rq->hard_nr_sectors -= nsect;
void blk_recalc_rq_segments(struct request *rq)
{
int nr_phys_segs;
- int nr_hw_segs;
unsigned int phys_size;
- unsigned int hw_size;
struct bio_vec *bv, *bvprv = NULL;
int seg_size;
- int hw_seg_size;
int cluster;
struct req_iterator iter;
int high, highprv = 1;
return;
cluster = test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
- hw_seg_size = seg_size = 0;
- phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+ seg_size = 0;
+ phys_size = nr_phys_segs = 0;
rq_for_each_segment(bv, rq, iter) {
/*
* the trick here is making sure that a high page is never
*/
high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
if (high || highprv)
- goto new_hw_segment;
+ goto new_segment;
if (cluster) {
if (seg_size + bv->bv_len > q->max_segment_size)
goto new_segment;
goto new_segment;
if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
goto new_segment;
- if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
- goto new_hw_segment;
seg_size += bv->bv_len;
- hw_seg_size += bv->bv_len;
bvprv = bv;
continue;
}
new_segment:
- if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
- !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
- hw_seg_size += bv->bv_len;
- else {
-new_hw_segment:
- if (nr_hw_segs == 1 &&
- hw_seg_size > rq->bio->bi_hw_front_size)
- rq->bio->bi_hw_front_size = hw_seg_size;
- hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
- nr_hw_segs++;
- }
-
nr_phys_segs++;
bvprv = bv;
seg_size = bv->bv_len;
highprv = high;
}
- if (nr_hw_segs == 1 &&
- hw_seg_size > rq->bio->bi_hw_front_size)
- rq->bio->bi_hw_front_size = hw_seg_size;
- if (hw_seg_size > rq->biotail->bi_hw_back_size)
- rq->biotail->bi_hw_back_size = hw_seg_size;
rq->nr_phys_segments = nr_phys_segs;
- rq->nr_hw_segments = nr_hw_segs;
}
void blk_recount_segments(struct request_queue *q, struct bio *bio)
blk_recalc_rq_segments(&rq);
bio->bi_next = nxt;
bio->bi_phys_segments = rq.nr_phys_segments;
- bio->bi_hw_segments = rq.nr_hw_segments;
bio->bi_flags |= (1 << BIO_SEG_VALID);
}
EXPORT_SYMBOL(blk_recount_segments);
if (!test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags))
return 0;
- if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
- return 0;
if (bio->bi_size + nxt->bi_size > q->max_segment_size)
return 0;
+ if (!bio_has_data(bio))
+ return 1;
+
+ if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
+ return 0;
+
/*
- * bio and nxt are contigous in memory, check if the queue allows
+ * bio and nxt are contiguous in memory; check if the queue allows
* these two to be merged into one
*/
if (BIO_SEG_BOUNDARY(q, bio, nxt))
return 0;
}
-static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
- struct bio *nxt)
-{
- if (!bio_flagged(bio, BIO_SEG_VALID))
- blk_recount_segments(q, bio);
- if (!bio_flagged(nxt, BIO_SEG_VALID))
- blk_recount_segments(q, nxt);
- if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
- BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
- return 0;
- if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
- return 0;
-
- return 1;
-}
-
/*
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
struct request *req,
struct bio *bio)
{
- int nr_hw_segs = bio_hw_segments(q, bio);
int nr_phys_segs = bio_phys_segments(q, bio);
- if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
+ if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
|| req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
req->cmd_flags |= REQ_NOMERGE;
if (req == q->last_merge)
* This will form the start of a new hw segment. Bump both
* counters.
*/
- req->nr_hw_segments += nr_hw_segs;
req->nr_phys_segments += nr_phys_segs;
return 1;
}
struct bio *bio)
{
unsigned short max_sectors;
- int len;
if (unlikely(blk_pc_request(req)))
max_sectors = q->max_hw_sectors;
blk_recount_segments(q, req->biotail);
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
- len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
- if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio))
- && !BIOVEC_VIRT_OVERSIZE(len)) {
- int mergeable = ll_new_mergeable(q, req, bio);
-
- if (mergeable) {
- if (req->nr_hw_segments == 1)
- req->bio->bi_hw_front_size = len;
- if (bio->bi_hw_segments == 1)
- bio->bi_hw_back_size = len;
- }
- return mergeable;
- }
return ll_new_hw_segment(q, req, bio);
}
struct bio *bio)
{
unsigned short max_sectors;
- int len;
if (unlikely(blk_pc_request(req)))
max_sectors = q->max_hw_sectors;
q->last_merge = NULL;
return 0;
}
- len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
if (!bio_flagged(bio, BIO_SEG_VALID))
blk_recount_segments(q, bio);
if (!bio_flagged(req->bio, BIO_SEG_VALID))
blk_recount_segments(q, req->bio);
- if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
- !BIOVEC_VIRT_OVERSIZE(len)) {
- int mergeable = ll_new_mergeable(q, req, bio);
-
- if (mergeable) {
- if (bio->bi_hw_segments == 1)
- bio->bi_hw_front_size = len;
- if (req->nr_hw_segments == 1)
- req->biotail->bi_hw_back_size = len;
- }
- return mergeable;
- }
return ll_new_hw_segment(q, req, bio);
}
struct request *next)
{
int total_phys_segments;
- int total_hw_segments;
/*
* First check if the either of the requests are re-queued
if (total_phys_segments > q->max_phys_segments)
return 0;
- total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
- if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
- int len = req->biotail->bi_hw_back_size +
- next->bio->bi_hw_front_size;
- /*
- * propagate the combined length to the end of the requests
- */
- if (req->nr_hw_segments == 1)
- req->bio->bi_hw_front_size = len;
- if (next->nr_hw_segments == 1)
- next->biotail->bi_hw_back_size = len;
- total_hw_segments--;
- }
-
- if (total_hw_segments > q->max_hw_segments)
+ if (total_phys_segments > q->max_hw_segments)
return 0;
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
- req->nr_hw_segments = total_hw_segments;
return 1;
}
}
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+ if (blk_rq_cpu_valid(next))
+ req->cpu = next->cpu;
__blk_put_request(q, next);
return 1;
EXPORT_SYMBOL(blk_queue_prep_rq);
/**
+ * blk_queue_set_discard - set a discard_sectors function for queue
+ * @q: queue
+ * @dfn: prepare_discard function
+ *
+ * It's possible for a queue to register a discard callback which is used
+ * to transform a discard request into the appropriate type for the
+ * hardware. If none is registered, then discard requests are failed
+ * with %EOPNOTSUPP.
+ *
+ */
+void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn)
+{
+ q->prepare_discard_fn = dfn;
+}
+EXPORT_SYMBOL(blk_queue_set_discard);
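+
A sketch of a driver registering such a callback at queue setup time (the
driver-specific details are placeholders, not from the patch):

	/* Sketch: rewrite discard requests into whatever the hardware expects. */
	#include <linux/blkdev.h>

	static int example_prepare_discard(struct request_queue *q,
					   struct request *rq)
	{
		/* e.g. turn rq into the device's native trim/unmap command */
		rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
		return 0;
	}

	static void example_setup_queue(struct request_queue *q)
	{
		blk_queue_set_discard(q, example_prepare_discard);
	}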
+
+/**
* blk_queue_merge_bvec - set a merge_bvec function for queue
* @q: queue
* @mbfn: merge_bvec_fn
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
+void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
+{
+ q->lld_busy_fn = fn;
+}
+EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
+
/**
* blk_queue_make_request - define an alternate make_request function for a device
* @q: the request queue for the device to be affected
* Different hardware can have different requirements as to what pages
* it can do I/O directly to. A low level driver can call
* blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @page.
+ * buffers for doing I/O to pages residing above @dma_addr.
**/
void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
{
* Description:
* Enables a low level driver to set an upper limit on the number of
* hw data segments in a request. This would be the largest number of
- * address/length pairs the host adapter can actually give as once
+ * address/length pairs the host adapter can actually give at once
* to the device.
**/
void blk_queue_max_hw_segments(struct request_queue *q,
* @mask: alignment mask
*
* description:
- * set required memory and length aligment for direct dma transactions.
+ * set required memory and length alignment for direct dma transactions.
* this is used when building direct io requests for the queue.
*
**/
* @mask: alignment mask
*
* description:
- * update required memory and length aligment for direct dma transactions.
+ * update required memory and length alignment for direct dma transactions.
* If the requested alignment is larger than the current alignment, then
* the current queue alignment is updated to the new value, otherwise it
* is left alone. The design of this is to allow multiple objects
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
-static int __init blk_settings_init(void)
+int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
blk_max_pfn = max_pfn - 1;
--- /dev/null
+/*
+ * Functions related to softirq rq completions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include "blk.h"
+
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+ struct list_head *cpu_list, local_list;
+
+ local_irq_disable();
+ cpu_list = &__get_cpu_var(blk_cpu_done);
+ list_replace_init(cpu_list, &local_list);
+ local_irq_enable();
+
+ while (!list_empty(&local_list)) {
+ struct request *rq;
+
+ rq = list_entry(local_list.next, struct request, csd.list);
+ list_del_init(&rq->csd.list);
+ rq->q->softirq_done_fn(rq);
+ }
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+static void trigger_softirq(void *data)
+{
+ struct request *rq = data;
+ unsigned long flags;
+ struct list_head *list;
+
+ local_irq_save(flags);
+ list = &__get_cpu_var(blk_cpu_done);
+ list_add_tail(&rq->csd.list, list);
+
+ if (list->next == &rq->csd.list)
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+ local_irq_restore(flags);
+}
+
+/*
+ * Set up and invoke a run of 'trigger_softirq' on the given cpu.
+ */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+ if (cpu_online(cpu)) {
+ struct call_single_data *data = &rq->csd;
+
+ data->func = trigger_softirq;
+ data->info = rq;
+ data->flags = 0;
+
+ __smp_call_function_single(cpu, data);
+ return 0;
+ }
+
+ return 1;
+}
+#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */
+static int raise_blk_irq(int cpu, struct request *rq)
+{
+ return 1;
+}
+#endif
+
+static int __cpuinit blk_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+ int cpu = (unsigned long) hcpu;
+
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ &__get_cpu_var(blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata blk_cpu_notifier = {
+ .notifier_call = blk_cpu_notify,
+};
+
+void __blk_complete_request(struct request *req)
+{
+ struct request_queue *q = req->q;
+ unsigned long flags;
+ int ccpu, cpu, group_cpu;
+
+ BUG_ON(!q->softirq_done_fn);
+
+ local_irq_save(flags);
+ cpu = smp_processor_id();
+ group_cpu = blk_cpu_to_group(cpu);
+
+ /*
+ * Select completion CPU
+ */
+ if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+ ccpu = req->cpu;
+ else
+ ccpu = cpu;
+
+ if (ccpu == cpu || ccpu == group_cpu) {
+ struct list_head *list;
+do_local:
+ list = &__get_cpu_var(blk_cpu_done);
+ list_add_tail(&req->csd.list, list);
+
+ /*
+ * if the list only contains our just added request,
+ * signal a raise of the softirq. If there are already
+ * entries there, someone already raised the irq but it
+ * hasn't run yet.
+ */
+ if (list->next == &req->csd.list)
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ } else if (raise_blk_irq(ccpu, req))
+ goto do_local;
+
+ local_irq_restore(flags);
+}
+
+/**
+ * blk_complete_request - end I/O on a request
+ * @req: the request being processed
+ *
+ * Description:
+ * Ends all I/O on a request. It does not handle partial completions,
+ * unless the driver actually implements this in its completion callback
+ * through requeueing. The actual completion happens out-of-order,
+ * through a softirq handler. The user must have registered a completion
+ * callback through blk_queue_softirq_done().
+ **/
+void blk_complete_request(struct request *req)
+{
+ if (!blk_mark_rq_complete(req))
+ __blk_complete_request(req);
+}
+EXPORT_SYMBOL(blk_complete_request);
+
+__init int blk_softirq_init(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+ register_hotcpu_notifier(&blk_cpu_notifier);
+ return 0;
+}
+subsys_initcall(blk_softirq_init);
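
A sketch of the completion pattern a driver would use with this code (the
error handling and hardware status are placeholders): the interrupt handler
only calls blk_complete_request(); the registered callback then finishes the
request from BLOCK_SOFTIRQ, on the submitting CPU when rq_affinity is enabled.

	#include <linux/blkdev.h>

	static void example_softirq_done(struct request *rq)
	{
		int error = 0;	/* derive from the hardware completion status */

		blk_end_request(rq, error, blk_rq_bytes(rq));
	}

	static void example_init_queue(struct request_queue *q)
	{
		blk_queue_softirq_done(q, example_softirq_done);
	}

	/* called from the driver's interrupt handler once the hardware is done */
	static void example_hw_irq_done(struct request *rq)
	{
		blk_complete_request(rq);
	}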
return ret;
}
+static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
+{
+ unsigned int set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+
+ return queue_var_show(set != 0, page);
+}
+
+static ssize_t
+queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
+{
+ ssize_t ret = -EINVAL;
+#if defined(CONFIG_USE_GENERIC_SMP_HELPERS)
+ unsigned long val;
+
+ ret = queue_var_store(&val, page, count);
+ spin_lock_irq(q->queue_lock);
+ if (val)
+ queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+ else
+ queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+ spin_unlock_irq(q->queue_lock);
+#endif
+ return ret;
+}
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.store = queue_nomerges_store,
};
+static struct queue_sysfs_entry queue_rq_affinity_entry = {
+ .attr = {.name = "rq_affinity", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_rq_affinity_show,
+ .store = queue_rq_affinity_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_nomerges_entry.attr,
+ &queue_rq_affinity_entry.attr,
NULL,
};
return 0;
}
-EXPORT_SYMBOL_GPL(blk_register_queue);
void blk_unregister_queue(struct gendisk *disk)
{
* __blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
- * Tries to free the specified @bqt@. Returns true if it was
+ * Tries to free the specified @bqt. Returns true if it was
* actually freed and false if there are still references using it
*/
static int __blk_free_tags(struct blk_queue_tag *bqt)
* blk_free_tags - release a given set of tag maintenance info
* @bqt: the tag map to free
*
- * For externally managed @bqt@ frees the map. Callers of this
+ * For externally managed @bqt frees the map. Callers of this
* function must guarantee to have released all the queues that
* might have been using this tag map.
*/
* @q: the request queue for the device
*
* Notes:
- * This is used to disabled tagged queuing to a device, yet leave
+ * This is used to disable tagged queuing to a device, yet leave
* queue in function.
**/
void blk_queue_free_tags(struct request_queue *q)
* @rq: the request that has completed
*
* Description:
- * Typically called when end_that_request_first() returns 0, meaning
+ * Typically called when end_that_request_first() returns %0, meaning
* all transfers have been done for a request. It's important to call
* this function before end_that_request_last(), as that will put the
* request back on the free list thus corrupting the internal tag list.
#endif /* BLK_DEV_INTEGRITY */
+static inline int blk_cpu_to_group(int cpu)
+{
+#ifdef CONFIG_SCHED_MC
+ cpumask_t mask = cpu_coregroup_map(cpu);
+ return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
+ return first_cpu(per_cpu(cpu_sibling_map, cpu));
+#else
+ return cpu;
+#endif
+}
+
#endif
*/
static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
-/*
- * Bio action bits of interest
- */
-static u32 bio_act[9] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC), 0, BLK_TC_ACT(BLK_TC_AHEAD), 0, 0, 0, BLK_TC_ACT(BLK_TC_META) };
-
-/*
- * More could be added as needed, taking care to increment the decrementer
- * to get correct indexing
- */
-#define trace_barrier_bit(rw) \
- (((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
-#define trace_sync_bit(rw) \
- (((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
-#define trace_ahead_bit(rw) \
- (((rw) & (1 << BIO_RW_AHEAD)) << (2 - BIO_RW_AHEAD))
-#define trace_meta_bit(rw) \
- (((rw) & (1 << BIO_RW_META)) >> (BIO_RW_META - 3))
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ( (rw & (1 << BIO_RW_ ## __name)) << \
+ (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name) )
/*
* The worker for the various blk_add_trace*() types. Fills out a
return;
what |= ddir_act[rw & WRITE];
- what |= bio_act[trace_barrier_bit(rw)];
- what |= bio_act[trace_sync_bit(rw)];
- what |= bio_act[trace_ahead_bit(rw)];
- what |= bio_act[trace_meta_bit(rw)];
+ what |= MASK_TC_BIT(rw, BARRIER);
+ what |= MASK_TC_BIT(rw, SYNC);
+ what |= MASK_TC_BIT(rw, AHEAD);
+ what |= MASK_TC_BIT(rw, META);
+ what |= MASK_TC_BIT(rw, DISCARD);
pid = tsk->pid;
if (unlikely(act_log_check(bt, what, sector, pid)))
#define CFQ_MIN_TT (2)
#define CFQ_SLICE_SCALE (5)
+#define CFQ_HW_QUEUE_MIN (5)
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
int rq_in_driver;
int sync_flight;
+
+ /*
+ * queue-depth detection
+ */
+ int rq_queued;
int hw_tag;
+ int hw_tag_samples;
+ int rq_in_driver_peak;
/*
* idle window management
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
- kblockd_schedule_work(&cfqd->unplug_work);
+ kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
}
}
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
cfqd->rq_in_driver);
- /*
- * If the depth is larger 1, it really could be queueing. But lets
- * make the mark a little higher - idling could still be good for
- * low queueing, and a low queueing number could also just indicate
- * a SCSI mid layer like behaviour where limit+1 is often seen.
- */
- if (!cfqd->hw_tag && cfqd->rq_in_driver > 4)
- cfqd->hw_tag = 1;
-
cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
}
list_del_init(&rq->queuelist);
cfq_del_rq_rb(rq);
+ cfqq->cfqd->rq_queued--;
if (rq_is_meta(rq)) {
WARN_ON(!cfqq->meta_pending);
cfqq->meta_pending--;
{
struct cfq_io_context *cic = RQ_CIC(rq);
+ cfqd->rq_queued++;
if (rq_is_meta(rq))
cfqq->meta_pending++;
cfq_rq_enqueued(cfqd, cfqq, rq);
}
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void cfq_update_hw_tag(struct cfq_data *cfqd)
+{
+ if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
+ cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+
+ if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
+ cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+ return;
+
+ if (cfqd->hw_tag_samples++ < 50)
+ return;
+
+ if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN)
+ cfqd->hw_tag = 1;
+ else
+ cfqd->hw_tag = 0;
+
+ cfqd->hw_tag_samples = 0;
+ cfqd->rq_in_driver_peak = 0;
+}
+
static void cfq_completed_request(struct request_queue *q, struct request *rq)
{
struct cfq_queue *cfqq = RQ_CFQQ(rq);
now = jiffies;
cfq_log_cfqq(cfqd, cfqq, "complete");
+ cfq_update_hw_tag(cfqd);
+
WARN_ON(!cfqd->rq_in_driver);
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
cfqd->cfq_slice[1] = cfq_slice_sync;
cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
cfqd->cfq_slice_idle = cfq_slice_idle;
+ cfqd->hw_tag = 1;
return cfqd;
}
return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
case BLKFLSBUF:
case BLKROSET:
+ case BLKDISCARD:
/*
* the ones below are implemented in blkdev_locked_ioctl,
* but we call blkdev_ioctl, which gets the lock for us
*/
struct rb_root sort_list[2];
struct list_head fifo_list[2];
-
+
/*
* next in sort order. read, write or both are NULL
*/
static void deadline_move_request(struct deadline_data *, struct request *);
-#define RQ_RB_ROOT(dd, rq) (&(dd)->sort_list[rq_data_dir((rq))])
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+ return &dd->sort_list[rq_data_dir(rq)];
+}
/*
* get the request after `rq' in sector-sorted order
static void
deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
{
- struct rb_root *root = RQ_RB_ROOT(dd, rq);
+ struct rb_root *root = deadline_rb_root(dd, rq);
struct request *__alias;
-retry:
- __alias = elv_rb_add(root, rq);
- if (unlikely(__alias)) {
+ while (unlikely(__alias = elv_rb_add(root, rq)))
deadline_move_request(dd, __alias);
- goto retry;
- }
}
static inline void
if (dd->next_rq[data_dir] == rq)
dd->next_rq[data_dir] = deadline_latter_request(rq);
- elv_rb_del(RQ_RB_ROOT(dd, rq), rq);
+ elv_rb_del(deadline_rb_root(dd, rq), rq);
}
/*
deadline_add_rq_rb(dd, rq);
/*
- * set expire time (only used for reads) and add to fifo list
+ * set expire time and add to fifo list
*/
rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]);
list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
- elv_rb_del(RQ_RB_ROOT(dd, req), req);
+ elv_rb_del(deadline_rb_root(dd, req), req);
deadline_add_rq_rb(dd, req);
}
}
dd->next_rq[WRITE] = NULL;
dd->next_rq[data_dir] = deadline_latter_request(rq);
- dd->last_sector = rq->sector + rq->nr_sectors;
+ dd->last_sector = rq_end_sector(rq);
/*
* take it off the sort and fifo list, move
}
/*
- * deadline_check_fifo returns 0 if there are no expired reads on the fifo,
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
*/
static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
else
rq = dd->next_rq[READ];
- if (rq) {
- /* we have a "next request" */
-
- if (dd->last_sector != rq->sector)
- /* end the batch on a non sequential request */
- dd->batching += dd->fifo_batch;
-
- if (dd->batching < dd->fifo_batch)
- /* we are still entitled to batch */
- goto dispatch_request;
- }
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request and are still entitled to batch */
+ goto dispatch_request;
/*
* at this point we are not running a batch. select the appropriate
#include <linux/delay.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
-
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include "blk.h"
return 0;
/*
+ * Don't merge file system requests and discard requests
+ */
+ if (bio_discard(bio) != bio_discard(rq->bio))
+ return 0;
+
+ /*
* different data direction or already started, don't merge
*/
if (bio_data_dir(bio) != rq_data_dir(rq))
if (e->ops->elevator_activate_req_fn)
e->ops->elevator_activate_req_fn(q, rq);
-
- if (q->rq_timed_out_fn)
- blk_add_timer(rq);
}
static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
if (e->ops->elevator_deactivate_req_fn)
e->ops->elevator_deactivate_req_fn(q, rq);
-
- if (q->rq_timed_out_fn)
- blk_delete_timer(rq);
}
static inline void __elv_rqhash_del(struct request *rq)
list_for_each_prev(entry, &q->queue_head) {
struct request *pos = list_entry_rq(entry);
+ if (blk_discard_rq(rq) != blk_discard_rq(pos))
+ break;
if (rq_data_dir(rq) != rq_data_dir(pos))
break;
if (pos->cmd_flags & stop_flags)
break;
case ELEVATOR_INSERT_SORT:
- BUG_ON(!blk_fs_request(rq));
+ BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
rq->cmd_flags |= REQ_SORTED;
q->nr_sorted++;
if (rq_mergeable(rq)) {
* this request is scheduling boundary, update
* end_sector
*/
- if (blk_fs_request(rq)) {
+ if (blk_fs_request(rq) || blk_discard_rq(rq)) {
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
}
*/
rq->cmd_flags |= REQ_STARTED;
blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+
+ /*
+ * We are now handing the request to the hardware,
+ * add the timeout handler
+ */
+ blk_add_timer(rq);
}
if (!q->boundary_rq || q->boundary_rq == rq) {
* device can handle
*/
rq->nr_phys_segments++;
- rq->nr_hw_segments++;
}
if (!q->prep_rq_fn)
* so that we don't add it again
*/
--rq->nr_phys_segments;
- --rq->nr_hw_segments;
}
rq = NULL;
/**
* get_gendisk - get partitioning information for a given device
- * @dev: device to get partitioning information for
+ * @devt: device to get partitioning information for
+ * @part: returned partition index
*
* This function gets the structure containing partitioning
- * information for the given device @dev.
+ * information for the given device @devt.
*/
struct gendisk *get_gendisk(dev_t devt, int *part)
{
return res;
}
+static void blk_ioc_discard_endio(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+ complete(bio->bi_private);
+}
+
+static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
+ uint64_t len)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ int ret = 0;
+
+ if (start & 511)
+ return -EINVAL;
+ if (len & 511)
+ return -EINVAL;
+ start >>= 9;
+ len >>= 9;
+
+ if (start + len > (bdev->bd_inode->i_size >> 9))
+ return -EINVAL;
+
+ if (!q->prepare_discard_fn)
+ return -EOPNOTSUPP;
+
+ while (len && !ret) {
+ DECLARE_COMPLETION_ONSTACK(wait);
+ struct bio *bio;
+
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (!bio)
+ return -ENOMEM;
+
+ bio->bi_end_io = blk_ioc_discard_endio;
+ bio->bi_bdev = bdev;
+ bio->bi_private = &wait;
+ bio->bi_sector = start;
+
+ if (len > q->max_hw_sectors) {
+ bio->bi_size = q->max_hw_sectors << 9;
+ len -= q->max_hw_sectors;
+ start += q->max_hw_sectors;
+ } else {
+ bio->bi_size = len << 9;
+ len = 0;
+ }
+ submit_bio(DISCARD_NOBARRIER, bio);
+
+ wait_for_completion(&wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ else if (!bio_flagged(bio, BIO_UPTODATE))
+ ret = -EIO;
+ bio_put(bio);
+ }
+ return ret;
+}
+
static int put_ushort(unsigned long arg, unsigned short val)
{
return put_user(val, (unsigned short __user *)arg);
set_device_ro(bdev, n);
unlock_kernel();
return 0;
+
+ case BLKDISCARD: {
+ uint64_t range[2];
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (copy_from_user(range, (void __user *)arg, sizeof(range)))
+ return -EFAULT;
+
+ return blk_ioctl_discard(bdev, range[0], range[1]);
+ }
+
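
For reference, a minimal user-space sketch of driving the new BLKDISCARD ioctl (assuming the constant is exported through <linux/fs.h>; the device path and discard range below are purely illustrative):

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>			/* BLKDISCARD */

int main(int argc, char **argv)
{
	/* range[0] = start offset, range[1] = length, both in bytes and
	 * both 512-byte aligned, otherwise the ioctl fails with EINVAL. */
	uint64_t range[2] = { 0, 1024 * 1024 };
	int fd;

	if (argc < 2)
		return 1;

	fd = open(argv[1], O_WRONLY);	/* FMODE_WRITE is required */
	if (fd < 0)
		return 1;

	if (ioctl(fd, BLKDISCARD, range) < 0)
		perror("BLKDISCARD");	/* EOPNOTSUPP without a prepare_discard_fn */

	close(fd);
	return 0;
}
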
case HDIO_GETGEO: {
struct hd_geometry geo;
if (blk_fs_request(req)) {
if (ps3disk_submit_request_sg(dev, req))
break;
- } else if (req->cmd_type == REQ_TYPE_FLUSH) {
+ } else if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+ req->cmd[0] == REQ_LB_OP_FLUSH) {
if (ps3disk_submit_flush_request(dev, req))
break;
} else {
return IRQ_HANDLED;
}
- if (req->cmd_type == REQ_TYPE_FLUSH) {
+ if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+ req->cmd[0] == REQ_LB_OP_FLUSH) {
read = 0;
num_sectors = req->hard_cur_sectors;
op = "flush";
dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
- req->cmd_type = REQ_TYPE_FLUSH;
+ req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+ req->cmd[0] = REQ_LB_OP_FLUSH;
}
static unsigned long ps3disk_mask;
if (blk_fs_request(vbr->req)) {
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = vbr->req->sector;
- vbr->out_hdr.ioprio = vbr->req->ioprio;
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
} else if (blk_pc_request(vbr->req)) {
vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
vbr->out_hdr.sector = 0;
- vbr->out_hdr.ioprio = vbr->req->ioprio;
+ vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
} else {
/* We don't put anything else in the queue. */
BUG();
next = spec->next;
}
+ r = dm_table_set_type(table);
+ if (r) {
+ DMWARN("unable to set table type");
+ return r;
+ }
+
return dm_table_complete(table);
}
goto out;
}
+ r = dm_init_md_mempool(md, dm_table_get_type(t));
+ if (r) {
+ DMWARN("unable to initialize the md mempools for this table");
+ dm_table_put(t);
+ goto out;
+ }
+
down_write(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
#include "dm.h"
#include "dm-path-selector.h"
-#include "dm-bio-list.h"
-#include "dm-bio-record.h"
#include "dm-uevent.h"
#include <linux/ctype.h>
struct list_head list;
struct priority_group *pg; /* Owning PG */
+ unsigned is_active; /* Path status */
unsigned fail_count; /* Cumulative failure count */
struct dm_path path;
unsigned pg_init_count; /* Number of times pg_init called */
struct work_struct process_queued_ios;
- struct bio_list queued_bios;
- struct list_head queued_reqs;
+ struct list_head queued_ios;
unsigned queue_size;
struct work_struct trigger_event;
*/
struct dm_mpath_io {
struct pgpath *pgpath;
- struct dm_bio_details details;
};
typedef int (*action_fn) (struct pgpath *pgpath);
struct pgpath *pgpath = kzalloc(sizeof(*pgpath), GFP_KERNEL);
if (pgpath) {
- pgpath->path.is_active = 1;
+ pgpath->is_active = 1;
INIT_WORK(&pgpath->deactivate_path, deactivate_path);
}
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
INIT_LIST_HEAD(&m->priority_groups);
- INIT_LIST_HEAD(&m->queued_reqs);
+ INIT_LIST_HEAD(&m->queued_ios);
spin_lock_init(&m->lock);
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
dm_noflush_suspending(m->ti));
}
-static int map_bio(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
struct dm_mpath_io *mpio, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
unsigned long flags;
struct pgpath *pgpath;
-
- spin_lock_irqsave(&m->lock, flags);
-
- /* Do we need to select a new pgpath? */
- if (!m->current_pgpath ||
- (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
- __choose_pgpath(m);
-
- pgpath = m->current_pgpath;
-
- if (was_queued)
- m->queue_size--;
-
- if ((pgpath && m->queue_io) ||
- (!pgpath && m->queue_if_no_path)) {
- /* Queue for the daemon to resubmit */
- bio_list_add(&m->queued_bios, bio);
- m->queue_size++;
- if ((m->pg_init_required && !m->pg_init_in_progress) ||
- !m->queue_io)
- queue_work(kmultipathd, &m->process_queued_ios);
- pgpath = NULL;
- r = DM_MAPIO_SUBMITTED;
- } else if (pgpath)
- bio->bi_bdev = pgpath->path.dev->bdev;
- else if (__must_push_back(m))
- r = DM_MAPIO_REQUEUE;
- else
- r = -EIO; /* Failed */
-
- mpio->pgpath = pgpath;
-
- spin_unlock_irqrestore(&m->lock, flags);
-
- return r;
-}
-
-static int map_req(struct multipath *m, struct request *clone,
- struct dm_mpath_io *mpio, unsigned was_queued)
-{
- int r = DM_MAPIO_REMAPPED;
- unsigned long flags;
- struct pgpath *pgpath;
struct block_device *bdev;
spin_lock_irqsave(&m->lock, flags);
if ((pgpath && m->queue_io) ||
(!pgpath && m->queue_if_no_path)) {
/* Queue for the daemon to resubmit */
- list_add_tail(&clone->queuelist, &m->queued_reqs);
+ list_add_tail(&clone->queuelist, &m->queued_ios);
m->queue_size++;
if ((m->pg_init_required && !m->pg_init_in_progress) ||
!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
- clone->q = NULL;
- clone->rq_disk = NULL;
r = DM_MAPIO_SUBMITTED;
} else if (pgpath) {
bdev = pgpath->path.dev->bdev;
clone->q = bdev_get_queue(bdev);
clone->rq_disk = bdev->bd_disk;
- } else if (__must_push_back(m)) {
- clone->q = NULL;
- clone->rq_disk = NULL;
+ } else if (__must_push_back(m))
r = DM_MAPIO_REQUEUE;
- } else {
- clone->q = NULL;
- clone->rq_disk = NULL;
+ else
r = -EIO; /* Failed */
- }
mpio->pgpath = pgpath;
* The multipath daemon is responsible for resubmitting queued ios.
*---------------------------------------------------------------*/
-static void dispatch_queued_bios(struct multipath *m)
-{
- int r;
- unsigned long flags;
- struct bio *bio = NULL, *next;
- struct dm_mpath_io *mpio;
- union map_info *info;
-
- spin_lock_irqsave(&m->lock, flags);
- bio = bio_list_get(&m->queued_bios);
- spin_unlock_irqrestore(&m->lock, flags);
-
- while (bio) {
- next = bio->bi_next;
- bio->bi_next = NULL;
-
- info = dm_get_mapinfo(bio);
- mpio = info->ptr;
-
- r = map_bio(m, bio, mpio, 1);
- if (r < 0)
- bio_endio(bio, r);
- else if (r == DM_MAPIO_REMAPPED)
- generic_make_request(bio);
- else if (r == DM_MAPIO_REQUEUE)
- bio_endio(bio, -EIO);
-
- bio = next;
- }
-}
-
-static void dispatch_queued_reqs(struct multipath *m)
+static void dispatch_queued_ios(struct multipath *m)
{
int r;
unsigned long flags;
LIST_HEAD(cl);
spin_lock_irqsave(&m->lock, flags);
- list_splice_init(&m->queued_reqs, &cl);
+ list_splice_init(&m->queued_ios, &cl);
spin_unlock_irqrestore(&m->lock, flags);
list_for_each_entry_safe(clone, n, &cl, queuelist) {
info = dm_get_rq_mapinfo(clone);
mpio = info->ptr;
- r = map_req(m, clone, mpio, 1);
- if (r < 0 || r == DM_MAPIO_REQUEUE) {
+ r = map_io(m, clone, mpio, 1);
+ if (r < 0) {
mempool_free(mpio, m->mpio_pool);
- if (r == DM_MAPIO_REQUEUE)
- r = DM_ENDIO_REQUEUE;
- dm_end_request(clone, r);
+ dm_kill_request(clone, r);
} else if (r == DM_MAPIO_REMAPPED)
dm_dispatch_request(clone);
+ else if (r == DM_MAPIO_REQUEUE) {
+ mempool_free(mpio, m->mpio_pool);
+ dm_requeue_request(clone);
+ }
}
}
if (init_required)
queue_work(kmpath_handlerd, &m->activate_path);
- if (!must_queue) {
- if (dm_table_request_based(m->ti->table))
- dispatch_queued_reqs(m);
- else
- dispatch_queued_bios(m);
- }
+ if (!must_queue)
+ dispatch_queued_ios(m);
}
/*
const char *param_name;
static struct param _params[] = {
- {0, 4, "invalid number of feature args"},
+ {0, 3, "invalid number of feature args"},
{1, 50, "pg_init_retries must be between 1 and 50"},
};
continue;
}
- if (!strnicmp(param_name, MESG_STR("rq_based"))) {
- dm_table_set_request_based(ti->table);
- continue;
- }
-
ti->error = "Unrecognised multipath feature request";
r = -EINVAL;
} while (argc && !r);
/*
* Map cloned requests
*/
-static int multipath_map_bio(struct dm_target *ti, struct bio *bio,
- union map_info *map_context)
-{
- int r;
- struct dm_mpath_io *mpio;
- struct multipath *m = (struct multipath *) ti->private;
-
- mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
- dm_bio_record(&mpio->details, bio);
-
- map_context->ptr = mpio;
- bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
- r = map_bio(m, bio, mpio, 0);
- if (r < 0 || r == DM_MAPIO_REQUEUE)
- mempool_free(mpio, m->mpio_pool);
-
- return r;
-}
-
-static int multipath_map_req(struct dm_target *ti, struct request *clone,
- union map_info *map_context)
+static int multipath_map(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
{
int r;
struct dm_mpath_io *mpio;
map_context->ptr = mpio;
clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-
- r = map_req(m, clone, mpio, 0);
+ r = map_io(m, clone, mpio, 0);
if (r < 0 || r == DM_MAPIO_REQUEUE)
mempool_free(mpio, m->mpio_pool);
spin_lock_irqsave(&m->lock, flags);
- if (!pgpath->path.is_active)
+ if (!pgpath->is_active)
goto out;
DMWARN("Failing path %s.", pgpath->path.dev->name);
pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
- pgpath->path.is_active = 0;
+ pgpath->is_active = 0;
pgpath->fail_count++;
m->nr_valid_paths--;
spin_lock_irqsave(&m->lock, flags);
- if (pgpath->path.is_active)
+ if (pgpath->is_active)
goto out;
if (!pgpath->pg->ps.type->reinstate_path) {
if (r)
goto out;
- pgpath->path.is_active = 1;
+ pgpath->is_active = 1;
m->current_pgpath = NULL;
if (!m->nr_valid_paths++ && m->queue_size)
/*
* end_io handling
*/
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
int error, struct dm_mpath_io *mpio)
{
+ /*
+ * We don't queue any clone request inside the multipath target
+ * during end I/O handling, since those clone requests don't have
+	 * bio clones. If we queued them inside the multipath target,
+	 * we would need to make bio clones, which requires memory allocation.
+	 * (See drivers/md/dm.c:end_clone_bio() for why the clone requests
+	 * don't have bio clones.)
+	 * Instead of queueing the clone request here, we queue the original
+	 * request into dm core, which will remake a clone request,
+	 * clone new bios for it and resubmit it later.
+ */
+ int r = DM_ENDIO_REQUEUE;
unsigned long flags;
- if (!error)
+ if (!error && !clone->errors)
return 0; /* I/O complete */
- if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
- return error;
-
if (error == -EOPNOTSUPP)
return error;
- spin_lock_irqsave(&m->lock, flags);
- if (!m->nr_valid_paths) {
- if (__must_push_back(m)) {
- spin_unlock_irqrestore(&m->lock, flags);
- return DM_ENDIO_REQUEUE;
- } else if (!m->queue_if_no_path) {
- spin_unlock_irqrestore(&m->lock, flags);
- return -EIO;
- } else {
- spin_unlock_irqrestore(&m->lock, flags);
- goto requeue;
- }
- }
- spin_unlock_irqrestore(&m->lock, flags);
-
if (mpio->pgpath)
fail_path(mpio->pgpath);
-requeue:
- dm_bio_restore(&mpio->details, bio);
-
- /* queue for the daemon to resubmit or fail */
spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_bios, bio);
- m->queue_size++;
- if (!m->queue_io)
- queue_work(kmultipathd, &m->process_queued_ios);
- spin_unlock_irqrestore(&m->lock, flags);
-
- return DM_ENDIO_INCOMPLETE; /* io not complete */
-}
-
-static int do_end_req(struct multipath *m, struct request *clone,
- int error, struct dm_mpath_io *mpio)
-{
- unsigned long flags;
- int r;
-
- if (!error && !clone->errors)
- return 0; /* I/O complete */
-
- if (error == -EOPNOTSUPP)
- return error;
-
- spin_lock_irqsave(&m->lock, flags);
- if (!m->nr_valid_paths) {
- if (__must_push_back(m) || m->queue_if_no_path)
- r = DM_ENDIO_REQUEUE;
- else
- r = -EIO;
-
- spin_unlock_irqrestore(&m->lock, flags);
- return r;
- }
+ if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+ r = -EIO;
spin_unlock_irqrestore(&m->lock, flags);
- if (mpio->pgpath)
- fail_path(mpio->pgpath);
-
- return DM_ENDIO_REQUEUE;
-}
-
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
-{
- struct multipath *m = ti->private;
- struct dm_mpath_io *mpio = map_context->ptr;
- struct pgpath *pgpath = mpio->pgpath;
- struct path_selector *ps;
- int r;
-
- r = do_end_io(m, bio, error, mpio);
- if (pgpath) {
- ps = &pgpath->pg->ps;
- if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path);
- }
- if (r != DM_ENDIO_INCOMPLETE)
- mempool_free(mpio, m->mpio_pool);
-
return r;
}
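
To make the end-I/O contract above concrete, a hypothetical request-based target's rq_end_io (example_end_io is invented for this sketch and is not part of the patch) only needs to report one of three outcomes and leave the actual requeueing to dm core:

static int example_end_io(struct dm_target *ti, struct request *clone,
			  int error, union map_info *map_context)
{
	if (!error)
		return 0;		/* complete the original request */

	if (error == -EOPNOTSUPP)
		return error;		/* pass unsupported operations straight up */

	/*
	 * Ask dm core to unprep and requeue the original request;
	 * the target never keeps the clone around itself.
	 */
	return DM_ENDIO_REQUEUE;
}
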
-static int multipath_end_req(struct dm_target *ti, struct request *clone,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
int error, union map_info *map_context)
{
struct multipath *m = ti->private;
struct path_selector *ps;
int r;
- r = do_end_req(m, clone, error, mpio);
+ r = do_end_io(m, clone, error, mpio);
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
if (type == STATUSTYPE_INFO)
DMEMIT("2 %u %u ", m->queue_size, m->pg_init_count);
else {
- int rq_based = dm_table_request_based(ti->table);
-
- DMEMIT("%u ", m->queue_if_no_path + rq_based +
+ DMEMIT("%u ", m->queue_if_no_path +
(m->pg_init_retries > 0) * 2);
if (m->queue_if_no_path)
DMEMIT("queue_if_no_path ");
if (m->pg_init_retries)
DMEMIT("pg_init_retries %u ", m->pg_init_retries);
- if (rq_based)
- DMEMIT("rq_based ");
}
if (!m->hw_handler_name || type == STATUSTYPE_INFO)
list_for_each_entry(p, &pg->pgpaths, list) {
DMEMIT("%s %s %u ", p->path.dev->name,
- p->path.is_active ? "A" : "F",
+ p->is_active ? "A" : "F",
p->fail_count);
if (pg->ps.type->status)
sz += pg->ps.type->status(&pg->ps,
bdev->bd_disk, cmd, arg);
}
-static int __pgpath_congested(struct pgpath *pgpath)
+static int __pgpath_busy(struct pgpath *pgpath)
{
struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
- if (dm_underlying_device_congested(q))
- return 1;
-
- return 0;
+ return dm_underlying_device_busy(q);
}
-static int multipath_congested(struct dm_target *ti)
+/*
+ * We return "busy" only when we can map I/Os but the underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy". Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
{
- int congested = 0;
+ int busy = 0, has_active = 0;
struct multipath *m = (struct multipath *) ti->private;
+ struct priority_group *pg;
+ struct pgpath *pgpath;
unsigned long flags;
spin_lock_irqsave(&m->lock, flags);
- if (m->current_pgpath && m->repeat_count > 1) {
- /* m->current_pgpath is surely used at next mapping time. */
- if (__pgpath_congested(m->current_pgpath))
- congested = 1;
-
+ /* Guess which priority_group will be used at next mapping time */
+ if (unlikely(!m->current_pgpath && m->next_pg))
+ pg = m->next_pg;
+ else if (likely(m->current_pg))
+ pg = m->current_pg;
+ else
+ /*
+ * We don't know which pg will be used at next mapping time.
+		 * We don't call __choose_pgpath() here to avoid triggering
+		 * pg_init just for a busy check.
+		 * So we don't know whether the underlying devices we will be
+		 * using at the next mapping time are busy. Just try mapping.
+ */
goto out;
- }
/*
- * We are here means that path selection will be executed
- * at next mapping time.
- * We run the path selection here and check congestion status
- * of the next path.
- * And increment repeat_count to avoid path selection again
- * in map_io().
+	 * If there is at least one non-busy active path, the path selector
+ * will be able to select it. So we consider such a pg as not busy.
*/
- __choose_pgpath(m);
- if (m->current_pgpath) {
- if (__pgpath_congested(m->current_pgpath))
- congested = 1;
+ busy = 1;
+ list_for_each_entry(pgpath, &pg->pgpaths, list)
+ if (pgpath->is_active) {
+ has_active = 1;
+
+ if (!__pgpath_busy(pgpath)) {
+ busy = 0;
+ break;
+ }
+ }
- m->repeat_count++;
- }
+ if (!has_active)
+ /*
+ * No active path in this pg, so this pg won't be used and
+ * the current_pg will be changed at next mapping time.
+ * We need to try mapping to determine it.
+ */
+ busy = 0;
out:
spin_unlock_irqrestore(&m->lock, flags);
- return congested;
+ return busy;
}
/*-----------------------------------------------------------------
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
- .map = multipath_map_bio,
- .end_io = multipath_end_io,
- .map_rq = multipath_map_req,
- .rq_end_io = multipath_end_req,
+ .map_rq = multipath_map,
+ .rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,
.ioctl = multipath_ioctl,
- .congested = multipath_congested,
+ .busy = multipath_busy,
};
static int __init dm_multipath_init(void)
struct dm_path {
struct dm_dev *dev; /* Read-only */
- unsigned is_active; /* Read-only */
-
void *pscontext; /* For path-selector use */
};
return r;
}
- r = dm_init_md(t->md);
- if (r) {
- DMWARN("Cannot initialize device %s, error %d", path, r);
- return r;
- }
-
dd = find_device(&t->devices, dev);
if (!dd) {
dd = kmalloc(sizeof(*dd), GFP_KERNEL);
rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
- if (!q->request_fn)
- rs->no_request_stacking = 1;
+ rs->no_request_stacking |= !blk_queue_stackable(q);
}
EXPORT_SYMBOL_GPL(dm_set_device_limits);
return 0;
}
-static int check_for_valid_limits(struct io_restrictions *rs,
- struct mapped_device *md)
+static void check_for_valid_limits(struct io_restrictions *rs)
{
- int r = 0;
- struct dm_table *t;
-
if (!rs->max_sectors)
rs->max_sectors = SAFE_MAX_SECTORS;
if (!rs->max_hw_sectors)
rs->seg_boundary_mask = -1;
if (!rs->bounce_pfn)
rs->bounce_pfn = -1;
-
- if (!dm_request_based(md))
- return 0;
-
- /* Allows to load only request stackable tables */
- if (rs->no_request_stacking) {
- DMERR("table load rejected: including non-request-stackable "
- "devices");
- return -EINVAL;
- }
-
- t = dm_get_table(md);
-
- /* Initial table loading must be allowed */
- if (!t)
- return 0;
-
- if ((rs->max_sectors < t->limits.max_sectors) ||
- (rs->max_hw_sectors < t->limits.max_hw_sectors) ||
- (rs->max_phys_segments < t->limits.max_phys_segments) ||
- (rs->max_hw_segments < t->limits.max_hw_segments) ||
- (rs->hardsect_size > t->limits.hardsect_size) ||
- (rs->max_segment_size < t->limits.max_segment_size) ||
- (rs->seg_boundary_mask < t->limits.seg_boundary_mask) ||
- (rs->bounce_pfn < t->limits.bounce_pfn) ||
- (rs->no_cluster && !t->limits.no_cluster)) {
- DMERR("table load rejected: shrinking current restriction");
- r = -EINVAL;
- }
-
- dm_table_put(t);
-
- return r;
}
int dm_table_add_target(struct dm_table *t, const char *type,
return r;
}
-void dm_table_set_request_based(struct dm_table *t)
+int dm_table_set_type(struct dm_table *t)
+{
+ int i;
+ int bio_based = 0, request_based = 0;
+ struct dm_target *tgt;
+
+ for (i = 0; i < t->num_targets; i++) {
+ tgt = t->targets + i;
+ if (tgt->type->map_rq)
+ request_based = 1;
+ else
+ bio_based = 1;
+
+ if (bio_based && request_based) {
+ DMWARN("Inconsistent table: different target types"
+ " can't be mixed up");
+ return -EINVAL;
+ }
+ }
+
+ if (bio_based) {
+ /* We must use this table as bio-based */
+ t->limits.no_request_stacking = 1;
+ return 0;
+ }
+
+ BUG_ON(!request_based); /* No targets in this table */
+
+ /* Non-request-stackable devices can't be used for request-based dm */
+ if (t->limits.no_request_stacking) {
+ DMWARN("table load rejected: including non-request-stackable"
+ " devices");
+ return -EINVAL;
+ }
+
+ /*
+	 * Request-based dm currently supports only single-target tables.
+	 * Supporting multiple targets would require request splitting,
+	 * which needs substantial block-layer changes (e.g. a request
+	 * completion path that can handle partial completions).
+ */
+ if (t->num_targets > 1) {
+ DMWARN("Request-based dm doesn't support multiple targets yet");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dm_table_get_type(struct dm_table *t)
{
- dm_set_request_based(t->md);
+ return t->limits.no_request_stacking ?
+ DM_TYPE_BIO_BASED : DM_TYPE_REQUEST_BASED;
}
int dm_table_request_based(struct dm_table *t)
{
- return dm_request_based(t->md);
+ return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
}
static int setup_indexes(struct dm_table *t)
int r = 0;
unsigned int leaf_nodes;
- r = check_for_valid_limits(&t->limits, t->md);
- if (r)
- return r;
+ check_for_valid_limits(&t->limits);
/* how many indexes will the btree have ? */
leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
else
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
+ if (t->limits.no_request_stacking)
+ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, q);
+ else
+ queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
return r;
}
+int dm_table_any_busy_target(struct dm_table *t)
+{
+ int i;
+ struct dm_target *ti;
+
+ for (i = 0; i < t->num_targets; i++) {
+ ti = t->targets + i;
+ if (ti->type->busy && ti->type->busy(ti))
+ return 1;
+ }
+
+ return 0;
+}
+
void dm_table_unplug_all(struct dm_table *t)
{
struct dm_dev *dd;
EXPORT_SYMBOL(dm_table_unplug_all);
EXPORT_SYMBOL(dm_table_barrier_ok);
EXPORT_SYMBOL(dm_table_support_barrier);
-EXPORT_SYMBOL(dm_table_request_based);
-EXPORT_SYMBOL(dm_table_set_request_based);
static DEFINE_SPINLOCK(_minor_lock);
/*
+ * For bio-based dm.
* One of these is allocated per bio.
*/
struct dm_io {
};
/*
+ * For bio-based dm.
* One of these is allocated per target within a bio. Hopefully
* this will be simplified out one day.
*/
union map_info info;
};
+/*
+ * For request-based dm.
+ * One of these is allocated per bio.
+ */
struct dm_clone_bio_info {
struct bio *orig;
struct request *rq;
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
-#define DMF_REQUEST_BASED 6
-#define DMF_BIO_BASED 7
-#define DMF_INITIALIZED 8
/*
* Work processed by per-device workqueue.
*/
struct dm_wq_req {
enum {
- DM_WQ_FLUSH_ALL,
DM_WQ_FLUSH_DEFERRED,
} type;
struct work_struct work;
struct bio_set *bs;
+ unsigned int mempool_type; /* Type of mempools above. */
+
/*
* Event handling.
*/
/* forced geometry settings */
struct hd_geometry geometry;
+ /* marker of flush suspend for request-based dm */
+ struct request suspend_rq;
+
/* For saving the address of __make_request for request based dm */
make_request_fn *saved_make_request_fn;
};
/*
* Partial completion handling for request-based dm
*/
-static void end_clone_bio(struct bio *bio, int error)
+static void end_clone_bio(struct bio *clone, int error)
{
- struct dm_clone_bio_info *info = bio->bi_private;
+ struct dm_clone_bio_info *info = clone->bi_private;
struct dm_rq_target_io *tio = info->rq->end_io_data;
- struct bio *orig_bio = info->orig;
+ struct bio *bio = info->orig;
unsigned int nr_bytes = info->orig->bi_size;
free_bio_info(tio->md, info);
- bio->bi_private = tio->md->bs;
- bio_put(bio);
+ clone->bi_private = tio->md->bs;
+ bio_put(clone);
if (tio->error) {
/*
* So the completing bio should always be rq->bio.
* If it's not, something wrong is happening.
*/
- if (tio->orig->bio != orig_bio)
- DMWARN("bio completion is going in the middle of the request");
+ if (tio->orig->bio != bio)
+ DMERR("bio completion is going in the middle of the request");
/*
* Update the original request.
wake_up(&tio->md->wait);
}
-static void __requeue_request(struct request_queue *q, struct request *rq)
+static void dm_unprep_request(struct request *rq)
{
+ struct request *clone = rq->special;
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ rq->special = NULL;
+ rq->cmd_flags &= ~REQ_DONTPREP;
+
+ free_bio_clone(clone);
+ dec_rq_pending(tio);
+ free_rq_tio(tio->md, tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+void dm_requeue_request(struct request *clone)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+ struct request *rq = tio->orig;
+ struct request_queue *q = rq->q;
+ unsigned long flags;
+
+ dm_unprep_request(rq);
+
+ spin_lock_irqsave(q->queue_lock, flags);
if (elv_queue_empty(q))
blk_plug_device(q);
blk_requeue_request(q, rq);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_request);
+
+static inline void __stop_queue(struct request_queue *q)
+{
+ blk_stop_queue(q);
+}
+
+static void stop_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __stop_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static inline void __start_queue(struct request_queue *q)
+{
+ if (blk_queue_stopped(q))
+ blk_start_queue(q);
}
-static void requeue_request(struct request_queue *q, struct request *rq)
+static void start_queue(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(q->queue_lock, flags);
- __requeue_request(q, rq);
+ __start_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
/*
* Complete the clone and the original request
*/
-void dm_end_request(struct request *clone, int error)
+static void dm_end_request(struct request *clone, int error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *orig = tio->orig;
- struct request_queue *q_orig = orig->q;
- unsigned int nr_bytes = blk_rq_bytes(orig);
-
- if (error == DM_ENDIO_REQUEUE) {
- /*
- * Requeue the original request of the clone.
- * Don't invoke blk_run_queue() so that the requeued request
- * won't be dispatched again soon.
- */
- free_bio_clone(clone);
- dec_rq_pending(tio);
- free_rq_tio(tio->md, tio);
+ struct request *rq = tio->orig;
+ struct request_queue *q = rq->q;
+ unsigned int nr_bytes = blk_rq_bytes(rq);
- requeue_request(q_orig, orig);
- return;
- }
+ if (blk_pc_request(rq)) {
+ rq->errors = clone->errors;
+ rq->data_len = clone->data_len;
- if (blk_pc_request(orig)) {
- orig->errors = clone->errors;
- orig->data_len = clone->data_len;
-
- if (orig->sense)
+ if (rq->sense)
/*
* We are using the sense buffer of the original
* request.
* So setting the length of the sense data is enough.
*/
- orig->sense_len = clone->sense_len;
+ rq->sense_len = clone->sense_len;
}
free_bio_clone(clone);
dec_rq_pending(tio);
free_rq_tio(tio->md, tio);
- if (unlikely(blk_end_request(orig, error, nr_bytes)))
+ if (unlikely(blk_end_request(rq, error, nr_bytes)))
BUG();
- blk_run_queue(q_orig);
+ blk_run_queue(q);
}
-EXPORT_SYMBOL_GPL(dm_end_request);
/*
* Request completion handler for request-based dm
*/
-static void dm_softirq_done(struct request *orig)
+static void dm_softirq_done(struct request *rq)
{
- struct request *clone = orig->completion_data;
+ struct request *clone = rq->completion_data;
struct dm_rq_target_io *tio = clone->end_io_data;
dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
- int error = tio->error, r;
+ int error = tio->error;
+ int r;
+
+ if (rq->cmd_flags & REQ_FAILED)
+ goto end_request;
if (rq_end_io) {
r = rq_end_io(tio->ti, clone, error, &tio->info);
- if (r <= 0 || r == DM_ENDIO_REQUEUE)
- /* The target wants to complete or requeue the I/O */
+ if (r <= 0)
+ /* The target wants to complete the I/O */
error = r;
else if (r == DM_ENDIO_INCOMPLETE)
/* The target will handle the I/O */
return;
- else {
+ else if (r == DM_ENDIO_REQUEUE) {
+ /*
+ * The target wants to requeue the I/O.
+ * Don't invoke blk_run_queue() so that the requeued
+ * request won't be dispatched again soon.
+ */
+ dm_requeue_request(clone);
+ return;
+ } else {
DMWARN("unimplemented target endio return value: %d",
r);
BUG();
}
}
+end_request:
dm_end_request(clone, error);
}
static void end_clone_request(struct request *clone, int error)
{
struct dm_rq_target_io *tio = clone->end_io_data;
- struct request *orig = tio->orig;
+ struct request *rq = tio->orig;
/*
* For just cleaning up the information of the queue in which
* against this queue
*/
tio->error = error;
- orig->completion_data = clone;
- blk_complete_request(orig);
+ rq->completion_data = clone;
+ blk_complete_request(rq);
+}
+
+/*
+ * Complete the original request of a clone with an error status.
+ * Target's rq_end_io() function isn't called.
+ * This may be used by target's map_rq() function when the mapping fails.
+ */
+void dm_kill_request(struct request *clone, int error)
+{
+ struct dm_rq_target_io *tio = clone->end_io_data;
+ struct request *rq = tio->orig;
+
+ tio->error = error;
+	/* Avoid printing an "I/O error" message, since we didn't actually do any I/O */
+ rq->cmd_flags |= (REQ_FAILED | REQ_QUIET);
+ rq->completion_data = clone;
+ blk_complete_request(rq);
}
+EXPORT_SYMBOL_GPL(dm_kill_request);
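
To illustrate the map_rq contract these helpers back, a hypothetical minimal request-based target might look like the sketch below (example_map_rq, example_ctx and its fields are invented for illustration and are not part of this patch):

struct example_ctx {
	struct dm_dev *dev;		/* hypothetical single underlying device */
	int dev_available;		/* hypothetical availability flag */
};

static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	struct example_ctx *c = ti->private;

	/*
	 * DM_MAPIO_REQUEUE makes dm core call dm_requeue_request();
	 * returning a negative errno would instead make it call
	 * dm_kill_request() and fail the original request outright.
	 */
	if (!c->dev_available)
		return DM_MAPIO_REQUEUE;

	/* Point the clone at the underlying queue; dm core dispatches it. */
	clone->q = bdev_get_queue(c->dev->bdev);
	clone->rq_disk = c->dev->bdev->bd_disk;

	return DM_MAPIO_REMAPPED;
}
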
static sector_t max_io_len(struct mapped_device *md,
sector_t sector, struct dm_target *ti)
return md->saved_make_request_fn(q, bio); /* call __make_request() */
}
+static inline int dm_request_based(struct mapped_device *md)
+{
+ return blk_queue_stackable(md->queue);
+}
+
static int dm_request(struct request_queue *q, struct bio *bio)
{
struct mapped_device *md = q->queuedata;
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
+ if (dm_request_based(md))
return dm_make_request(q, bio);
return _dm_request(q, bio);
void dm_dispatch_request(struct request *rq)
{
+ int r;
+
rq->start_time = jiffies;
- blk_submit_request(rq->q, rq);
+ r = blk_insert_cloned_request(rq->q, rq);
+ if (r)
+ dm_kill_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);
-static void copy_request_info(struct request *clone, struct request *orig)
-{
- INIT_LIST_HEAD(&clone->queuelist);
- INIT_LIST_HEAD(&clone->donelist);
- clone->q = NULL;
- clone->cmd_flags = (rq_data_dir(orig) | REQ_NOMERGE);
- clone->cmd_type = orig->cmd_type;
- clone->sector = orig->sector;
- clone->hard_sector = orig->hard_sector;
- clone->nr_sectors = orig->nr_sectors;
- clone->hard_nr_sectors = orig->hard_nr_sectors;
- clone->current_nr_sectors = orig->current_nr_sectors;
- clone->hard_cur_sectors = orig->hard_cur_sectors;
- INIT_HLIST_NODE(&clone->hash);
- clone->completion_data = NULL;
- clone->elevator_private = NULL;
- clone->elevator_private2 = NULL;
- clone->rq_disk = NULL;
- clone->start_time = jiffies;
- clone->nr_phys_segments = orig->nr_phys_segments;
- clone->nr_hw_segments = orig->nr_hw_segments;
- clone->ioprio = orig->ioprio;
- clone->special = NULL;
- clone->buffer = orig->buffer;
- clone->tag = -1;
- clone->errors = 0;
- clone->ref_count = 1;
- clone->cmd_len = orig->cmd_len;
- WARN_ON(orig->cmd != orig->__cmd);
- clone->cmd = clone->__cmd;
- if (orig->cmd_len) {
- memcpy(clone->cmd, orig->cmd, sizeof(orig->cmd));
- }
- clone->data_len = orig->data_len;
- clone->sense_len = orig->sense_len;
- clone->data = orig->data;
- clone->sense = orig->sense;
- clone->timeout = 0;
- clone->retries = 0;
- clone->end_io = end_clone_request;
- clone->next_rq = NULL;
-}
-
-static int clone_request_bios(struct request *clone, struct request *orig)
-{
- struct dm_rq_target_io *tio = clone->end_io_data;
- struct mapped_device *md = tio->md;
- struct bio *bio, *orig_bio;
+static void copy_request_info(struct request *clone, struct request *rq)
+{
+ clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE);
+ clone->cmd_type = rq->cmd_type;
+ clone->sector = rq->sector;
+ clone->hard_sector = rq->hard_sector;
+ clone->nr_sectors = rq->nr_sectors;
+ clone->hard_nr_sectors = rq->hard_nr_sectors;
+ clone->current_nr_sectors = rq->current_nr_sectors;
+ clone->hard_cur_sectors = rq->hard_cur_sectors;
+ clone->nr_phys_segments = rq->nr_phys_segments;
+ clone->ioprio = rq->ioprio;
+ clone->buffer = rq->buffer;
+ clone->cmd_len = rq->cmd_len;
+ if (rq->cmd_len)
+ clone->cmd = rq->cmd;
+ clone->data_len = rq->data_len;
+ clone->extra_len = rq->extra_len;
+ clone->sense_len = rq->sense_len;
+ clone->data = rq->data;
+ clone->sense = rq->sense;
+}
+
+static int clone_request_bios(struct request *clone, struct request *rq,
+ struct mapped_device *md)
+{
+ struct bio *bio, *clone_bio;
struct dm_clone_bio_info *info;
- for (orig_bio = orig->bio; orig_bio; orig_bio = orig_bio->bi_next) {
+ for (bio = rq->bio; bio; bio = bio->bi_next) {
info = alloc_bio_info(md);
if (!info)
goto free_and_out;
- bio = bio_alloc_bioset(GFP_ATOMIC, orig_bio->bi_max_vecs,
- md->bs);
- if (!bio) {
+ clone_bio = bio_alloc_bioset(GFP_ATOMIC, bio->bi_max_vecs,
+ md->bs);
+ if (!clone_bio) {
free_bio_info(md, info);
goto free_and_out;
}
- __bio_clone(bio, orig_bio);
- bio->bi_destructor = dm_bio_destructor;
- bio->bi_end_io = end_clone_bio;
+ __bio_clone(clone_bio, bio);
+ clone_bio->bi_destructor = dm_bio_destructor;
+ clone_bio->bi_end_io = end_clone_bio;
info->rq = clone;
- info->orig = orig_bio;
- bio->bi_private = info;
+ info->orig = bio;
+ clone_bio->bi_private = info;
if (clone->bio) {
- clone->biotail->bi_next = bio;
- clone->biotail = bio;
+ clone->biotail->bi_next = clone_bio;
+ clone->biotail = clone_bio;
} else
- clone->bio = clone->biotail = bio;
+ clone->bio = clone->biotail = clone_bio;
}
return 0;
return -ENOMEM;
}
-static int setup_clone(struct request *clone, struct request *orig)
+static int setup_clone(struct request *clone, struct request *rq,
+ struct dm_rq_target_io *tio)
{
int r;
- r = clone_request_bios(clone, orig);
+ blk_rq_init(NULL, clone);
+
+ r = clone_request_bios(clone, rq, tio->md);
if (r)
return r;
- copy_request_info(clone, orig);
+ copy_request_info(clone, rq);
+ clone->start_time = jiffies;
+ clone->end_io = end_clone_request;
+ clone->end_io_data = tio;
return 0;
}
-static int clone_and_map_request(struct dm_target *ti, struct request *rq,
- struct mapped_device *md)
+static inline int dm_flush_suspending(struct mapped_device *md)
{
- int r;
- struct request *clone;
+ return !md->suspend_rq.data;
+}
+
+/*
+ * Called with the queue lock held.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+ struct mapped_device *md = (struct mapped_device *)q->queuedata;
struct dm_rq_target_io *tio;
+ struct request *clone;
+
+ if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
+ if (dm_flush_suspending(md)) {
+ if (q->in_flight)
+ return BLKPREP_DEFER;
+ else {
+ /* This device should be quiet now */
+ __stop_queue(q);
+ smp_mb();
+ BUG_ON(atomic_read(&md->pending));
+ wake_up(&md->wait);
+ return BLKPREP_KILL;
+ }
+ } else
+ /*
+ * The suspend process was interrupted.
+ * So no need to suspend now.
+ */
+ return BLKPREP_KILL;
+ }
+
+ if (unlikely(rq->special)) {
+ DMWARN("Already has something in rq->special.");
+ return BLKPREP_KILL;
+ }
+
+ if (unlikely(!dm_request_based(md))) {
+ DMWARN("Request was queued into bio-based device");
+ return BLKPREP_KILL;
+ }
tio = alloc_rq_tio(md); /* Only one for each original request */
if (!tio)
/* -ENOMEM */
- goto requeue;
+ return BLKPREP_DEFER;
+
tio->md = md;
+ tio->ti = NULL;
tio->orig = rq;
tio->error = 0;
- tio->ti = ti;
memset(&tio->info, 0, sizeof(tio->info));
clone = &tio->clone;
- clone->end_io_data = tio;
- clone->bio = clone->biotail = NULL;
- if (setup_clone(clone, rq))
+ if (setup_clone(clone, rq, tio)) {
/* -ENOMEM */
- goto free_rq_tio_and_requeue;
+ free_rq_tio(md, tio);
+ return BLKPREP_DEFER;
+ }
+ rq->special = clone;
+ rq->cmd_flags |= REQ_DONTPREP;
+
+ return BLKPREP_OK;
+}
+
+static void map_request(struct dm_target *ti, struct request *rq,
+ struct mapped_device *md)
+{
+ int r;
+ struct request *clone = rq->special;
+ struct dm_rq_target_io *tio = clone->end_io_data;
+
+ tio->ti = ti;
atomic_inc(&md->pending);
+
+ /*
+	 * Although requests submitted to md->queue are checked against
+	 * the table/queue limits at submission time, those limits may be
+	 * changed by a table swap while already-checked requests are
+	 * still sitting in md->queue.
+	 * If the limits have shrunk in the meantime, we may be about to
+	 * dispatch requests here that violate the current limits.
+	 * Since the block layer and device drivers trust struct request,
+	 * dispatching such requests is dangerous (e.g. it can easily
+	 * cause a kernel panic), so avoid dispatching them in
+	 * request-based dm.
+	 *
+	 * Since dm_kill_request() decrements md->pending, this has to
+	 * be done after incrementing md->pending.
+ */
+ r = blk_rq_check_limits(rq->q, rq);
+ if (unlikely(r)) {
+ DMWARN("violating the queue limitation. the limitation may be"
+ " shrunk while there are some requests in the queue.");
+ dm_kill_request(clone, r);
+ return;
+ }
+
r = ti->type->map_rq(ti, clone, &tio->info);
switch (r) {
case DM_MAPIO_SUBMITTED:
- /* The target has taken the request to submit by itself */
+ /* The target has taken the I/O to submit by itself later */
break;
case DM_MAPIO_REMAPPED:
- /* The clone has been remapped so dispatch it */
+ /* The target has remapped the I/O so dispatch it */
dm_dispatch_request(clone);
break;
case DM_MAPIO_REQUEUE:
- /* The target wants to requeue the original request */
- goto free_bio_clone_and_requeue;
+ /* The target wants to requeue the I/O */
+ dm_requeue_request(clone);
+ break;
default:
if (r > 0) {
DMWARN("unimplemented target map return value: %d", r);
BUG();
}
- /*
- * The target wants to complete the original request.
- * Avoid printing "I/O error" message, since we didn't I/O.
- */
- rq->cmd_flags |= REQ_QUIET;
- dm_end_request(clone, r);
+ /* The target wants to complete the I/O */
+ dm_kill_request(clone, r);
break;
}
-
- return 0;
-
-free_bio_clone_and_requeue:
- free_bio_clone(clone);
- dec_rq_pending(tio);
-
-free_rq_tio_and_requeue:
- free_rq_tio(md, tio);
-
-requeue:
- /*
- * Actual requeue is done in dm_request_fn() after queue lock is held
- * so that we can avoid to get extra queue lock for the requeue
- */
- return 1;
}
/*
* q->request_fn for request-based dm.
- * Called with the queue lock held
+ * Called with the queue lock held.
*/
static void dm_request_fn(struct request_queue *q)
{
- int r;
struct mapped_device *md = (struct mapped_device *)q->queuedata;
struct dm_table *map = dm_get_table(md);
struct dm_target *ti;
- dm_congested_fn congested;
struct request *rq;
/*
while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
rq = elv_next_request(q);
if (!rq)
- break;
+ goto plug_and_out;
ti = dm_table_find_target(map, rq->sector);
- congested = ti->type->congested;
- if (congested && congested(ti))
- break;
+ if (ti->type->busy && ti->type->busy(ti))
+ goto plug_and_out;
blkdev_dequeue_request(rq);
spin_unlock(q->queue_lock);
- r = clone_and_map_request(ti, rq, md);
+ map_request(ti, rq, md);
spin_lock_irq(q->queue_lock);
-
- if (r)
- __requeue_request(q, rq);
}
+ goto out;
+
+plug_and_out:
+ if (!elv_queue_empty(q))
+ /* Some requests still remain, retry later */
+ blk_plug_device(q);
+
+out:
dm_table_put(map);
return;
}
-int dm_underlying_device_congested(struct request_queue *q)
+int dm_underlying_device_busy(struct request_queue *q)
{
return blk_lld_busy(q);
}
-EXPORT_SYMBOL_GPL(dm_underlying_device_congested);
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+
+static int dm_lld_busy(struct request_queue *q)
+{
+ int r;
+ struct mapped_device *md = q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+
+ if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
+ r = 1;
+ else
+ r = dm_table_any_busy_target(map);
+
+ dm_table_put(map);
+ return r;
+}
static void dm_unplug_all(struct request_queue *q)
{
struct dm_table *map = dm_get_table(md);
if (map) {
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
+ if (dm_request_based(md))
generic_unplug_device(q);
dm_table_unplug_all(map);
if (!map || test_bit(DMF_BLOCK_IO, &md->flags))
r = bdi_bits;
- else if (test_bit(DMF_REQUEST_BASED, &md->flags))
- /* Request-based dm cares about only own queue */
+ else if (dm_request_based(md))
+ /*
+		 * Request-based dm cares only about its own request_queue
+		 * when reporting congestion status
+ */
r = md->queue->backing_dev_info.state & bdi_bits;
else
r = dm_table_any_congested(map, bdi_bits);
return r;
}
-static void init_queue(struct request_queue *q, struct mapped_device *md)
-{
- q->queuedata = md;
- q->backing_dev_info.congested_fn = dm_any_congested;
- q->backing_dev_info.congested_data = md;
- blk_queue_make_request(q, dm_request);
- blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
- q->unplug_fn = dm_unplug_all;
- blk_queue_merge_bvec(q, dm_merge_bvec);
-}
-
-int dm_set_md_request_based(struct mapped_device *md)
-{
- int r = 0;
-
- if (test_bit(DMF_INITIALIZED, &md->flags))
- /* Initialization is already done */
- return 0;
-
- md->io_pool = mempool_create_slab_pool(MIN_IOS, _bio_info_cache);
- if (!md->io_pool)
- return -ENOMEM;
-
- md->tio_pool = mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
- if (!md->tio_pool) {
- r = -ENOMEM;
- goto out_free_io_pool;
- }
-
- md->bs = bioset_create(MIN_IOS, MIN_IOS);
- if (!md->bs) {
- r = -ENOMEM;
- goto out_free_tio_pool;
- }
-
- md->queue = blk_init_queue(dm_request_fn, NULL);
- if (!md->queue) {
- DMERR("request queue initialization for request-based failed");
- r = -ENOMEM;
- goto out_free_bs;
- }
-
- DMINFO("%s: activating request-based I/O", md->disk->disk_name);
- md->saved_make_request_fn = md->queue->make_request_fn;
- init_queue(md->queue, md);
- blk_queue_softirq_done(md->queue, dm_softirq_done);
- md->disk->queue = md->queue;
- add_disk(md->disk);
-
- return 0;
-
-out_cleanup_queue:
- blk_cleanup_queue(md->queue);
- md->disk->queue = md->queue = NULL;
- md->saved_make_request_fn = NULL;
-
-out_free_bs:
- bioset_free(md->bs);
- md->bs = NULL;
-
-out_free_tio_pool:
- mempool_destroy(md->tio_pool);
- md->tio_pool = NULL;
-
-out_free_io_pool:
- mempool_destroy(md->io_pool);
- md->io_pool = NULL;
-
- return r;
-}
-
-int dm_set_md_bio_based(struct mapped_device *md)
-{
- if (test_bit(DMF_INITIALIZED, &md->flags)) {
- /* Initialization is already done */
- return 0;
- }
-
- md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
- if (!md->io_pool)
- goto out;
-
- md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
- if (!md->tio_pool)
- goto out_free_io_pool;
-
- md->bs = bioset_create(16, 16);
- if (!md->bs)
- goto out_free_tio_pool;
-
- md->queue = blk_alloc_queue(GFP_KERNEL);
- if (!md->queue) {
- DMERR("request queue initialization for bio-based failed");
- goto out_free_bs;
- }
-
- init_queue(md->queue, md);
- md->disk->queue = md->queue;
- add_disk(md->disk);
-
- return 0;
-
-out_free_bs:
- bioset_free(md->bs);
- md->bs = NULL;
-out_free_tio_pool:
- mempool_destroy(md->tio_pool);
- md->tio_pool = NULL;
-out_free_io_pool:
- mempool_destroy(md->io_pool);
- md->io_pool = NULL;
-out:
- return -ENOMEM;
-}
-
static struct block_device_operations dm_blk_dops;
/*
atomic_set(&md->uevent_seq, 0);
INIT_LIST_HEAD(&md->uevent_list);
spin_lock_init(&md->uevent_lock);
- /* Defaults to BIO based */
- set_bit(DMF_BIO_BASED, &md->flags);
- /* md's queue and mempools will be allocated after the 1st table load */
+ md->queue = blk_init_queue(dm_request_fn, NULL);
+ if (!md->queue)
+ goto bad_queue;
+
+ /*
+ * Request-based dm devices cannot be stacked on top of bio-based dm
+ * devices. The type of this dm device has not been decided yet,
+ * although we initialized the queue using blk_init_queue().
+ * The type is decided at the first table loading time.
+ * To prevent problematic device stacking, clear the queue flag
+ * for request stacking support until then.
+ *
+ * This queue is new, so no concurrency on the queue_flags.
+ */
+ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+ md->saved_make_request_fn = md->queue->make_request_fn;
+ md->queue->queuedata = md;
+ md->queue->backing_dev_info.congested_fn = dm_any_congested;
+ md->queue->backing_dev_info.congested_data = md;
+ blk_queue_make_request(md->queue, dm_request);
+ blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
+ md->queue->unplug_fn = dm_unplug_all;
+ blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+ blk_queue_softirq_done(md->queue, dm_softirq_done);
+ blk_queue_prep_rq(md->queue, dm_prep_fn);
+ blk_queue_lld_busy(md->queue, dm_lld_busy);
md->disk = alloc_disk(1);
if (!md->disk)
md->disk->major = _major;
md->disk->first_minor = minor;
md->disk->fops = &dm_blk_dops;
+ md->disk->queue = md->queue;
md->disk->private_data = md;
sprintf(md->disk->disk_name, "dm-%d", minor);
+ add_disk(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
md->wq = create_singlethread_workqueue("kdmflush");
bad_thread:
put_disk(md->disk);
bad_disk:
+ blk_cleanup_queue(md->queue);
+bad_queue:
free_minor(minor);
bad_minor:
module_put(THIS_MODULE);
return NULL;
}
-int dm_init_md(struct mapped_device *md)
-{
- int r = 0;
-
- if (test_bit(DMF_INITIALIZED, &md->flags))
- return 0;
-
- if(test_bit(DMF_REQUEST_BASED, &md->flags))
- r = dm_set_md_request_based(md);
- else if (test_bit(DMF_BIO_BASED, &md->flags))
- r = dm_set_md_bio_based(md);
- else
- r = -EINVAL;
-
- if (!r)
- set_bit(DMF_INITIALIZED, &md->flags);
-
- return r;
-}
-
static void unlock_fs(struct mapped_device *md);
static void free_dev(struct mapped_device *md)
mempool_destroy(md->io_pool);
if (md->bs)
bioset_free(md->bs);
- if (test_bit(DMF_INITIALIZED, &md->flags))
- del_gendisk(md->disk);
+ del_gendisk(md->disk);
free_minor(minor);
spin_lock(&_minor_lock);
spin_unlock(&_minor_lock);
put_disk(md->disk);
- if (md->queue)
- blk_cleanup_queue(md->queue);
+ blk_cleanup_queue(md->queue);
module_put(THIS_MODULE);
kfree(md);
}
dm_table_get(t);
dm_table_event_callback(t, event_callback, md);
+ /*
+	 * If the old table wasn't request-based, the queue hasn't been
+	 * stopped during suspension yet. Stop it now to prevent I/O
+	 * mapping before resume.
+	 * This must be done before setting the queue restrictions, because
+	 * request-based dm may start running right after they are set.
+ */
+ if (dm_table_request_based(t) && !blk_queue_stopped(q))
+ stop_queue(q);
+
write_lock(&md->map_lock);
md->map = t;
dm_table_set_restrictions(t, q);
set_current_state(TASK_INTERRUPTIBLE);
smp_mb();
- if (!atomic_read(&md->pending))
+ if (dm_request_based(md)) {
+ if (!atomic_read(&md->pending) &&
+ blk_queue_stopped(md->queue))
+ break;
+ } else if (!atomic_read(&md->pending))
break;
if (signal_pending(current)) {
struct bio *c;
while ((c = bio_list_pop(&md->deferred))) {
- if (__split_bio(md, c))
+ /*
+ * Some bios might have been queued here during suspension
+		 * before the device was switched to request-based dm during resume
+ */
+ if (dm_request_based(md))
+ generic_make_request(c);
+ else if (__split_bio(md, c))
bio_io_error(c);
}
down_write(&md->io_lock);
switch (req->type) {
- case DM_WQ_FLUSH_ALL:
- __merge_pushback_list(md);
- /* pass through */
case DM_WQ_FLUSH_DEFERRED:
__flush_deferred_io(md);
break;
return r;
}
-static void stop_queue(struct request_queue *q)
+static inline void dm_invalidate_flush_suspend(struct mapped_device *md)
{
+ md->suspend_rq.data = (void *)0x1;
+}
+
+static void dm_abort_suspend(struct mapped_device *md, int noflush)
+{
+ struct request_queue *q = md->queue;
unsigned long flags;
+ /*
+ * For flush suspend, invalidation and queue restart must be protected
+ * by a single queue lock to prevent a race with dm_prep_fn().
+ */
spin_lock_irqsave(q->queue_lock, flags);
- blk_stop_queue(q);
+ if (!noflush)
+ dm_invalidate_flush_suspend(md);
+ __start_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void start_queue(struct request_queue *q)
+/*
+ * Additional suspend work for request-based dm.
+ *
+ * In request-based dm, stopping request_queue prevents mapping.
+ * Even after stopping the request_queue, requests submitted by upper layers
+ * can still be inserted into it. So the original (unmapped) requests are
+ * kept in the request_queue during suspension.
+ */
+static void dm_start_suspend(struct mapped_device *md, int noflush)
{
+ struct request *rq = &md->suspend_rq;
+ struct request_queue *q = md->queue;
unsigned long flags;
+ if (noflush) {
+ stop_queue(q);
+ return;
+ }
+
+ /*
+	 * For flush suspend, we need a marker to indicate the boundary
+	 * between the I/Os that need flushing and the deferred I/Os, since
+	 * all I/Os are queued in the request_queue during suspension.
+	 *
+	 * This marker must be inserted after setting DMF_BLOCK_IO,
+	 * because dm_prep_fn() treats the absence of DMF_BLOCK_IO as
+	 * a suspend interruption.
+ */
spin_lock_irqsave(q->queue_lock, flags);
- if (blk_queue_stopped(q))
- blk_start_queue(q);
+ if (unlikely(rq->ref_count)) {
+ /*
+		 * This can happen when the previous suspend was interrupted,
+		 * and the suspend_rq inserted for that suspend is still in
+		 * the queue when this suspend is invoked.
+		 *
+		 * We could re-insert the suspend_rq by forcibly deleting it
+		 * from the queue with list_del_init(&rq->queuelist), but that
+		 * could easily break the block layer.
+		 * So we don't re-insert the suspend_rq in such a case.
+		 * The suspend_rq should already have been invalidated during
+		 * the previous suspend interruption, so just wait for it
+		 * to be completed.
+ *
+ * This suspend will never complete, so warn the user to
+ * interrupt this suspend and retry later.
+ */
+ BUG_ON(!rq->data);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ DMWARN("Invalidating the previous suspend is still in"
+		       " progress. This suspend will never complete."
+ " Please interrupt this suspend and retry later.");
+ return;
+ }
spin_unlock_irqrestore(q->queue_lock, flags);
+
+ /* Now no user of the suspend_rq */
+ blk_rq_init(q, rq);
+ blk_insert_request(q, rq, 0, NULL);
}
/*
add_wait_queue(&md->wait, &wait);
up_write(&md->io_lock);
- /*
- * In request-based dm, stopping request_queue prevents mapping.
- * Even after stopping the request_queue, submitted requests from
- * upper-layer can be inserted to the request_queue.
- * So original (unmapped) requests are kept in the request_queue
- * during suspension.
- */
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
- stop_queue(md->queue);
+ if (dm_request_based(md))
+ dm_start_suspend(md, noflush);
/* unplug */
if (map)
remove_wait_queue(&md->wait, &wait);
if (noflush) {
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
- /* Request-based dm uses md->queue for noflush */
+ if (dm_request_based(md))
+ /* All requeued requests are already in md->queue */
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
else
__merge_pushback_list(md);
if (r < 0) {
dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
- /* Request-based dm uses md->queue for deferred I/Os */
- start_queue(md->queue);
+ if (dm_request_based(md))
+ dm_abort_suspend(md, noflush);
unlock_fs(md);
goto out; /* pushback list is already flushed, so skip flush */
if (r)
goto out;
+ dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
+
/*
* Flushing deferred I/Os must be done after targets are resumed
* so that mapping of targets can work correctly.
- *
- * Resuming request_queue earlier than clear_bit(DMF_BLOCK_IO) means
- * starting to flush requests before upper-layer starts to submit bios.
- * It may be better because llds should be empty and no need to wait
- * for bio merging so strictly at this time.
+	 * Request-based dm queues its deferred I/Os in its own request_queue.
*/
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
+ if (dm_request_based(md))
start_queue(md->queue);
- dm_queue_flush(md, DM_WQ_FLUSH_DEFERRED, NULL);
-
unlock_fs(md);
if (md->suspended_bdev) {
return test_bit(DMF_SUSPENDED, &md->flags);
}
-int dm_request_based(struct mapped_device *md)
+int dm_noflush_suspending(struct dm_target *ti)
{
- return test_bit(DMF_REQUEST_BASED, &md->flags);
-}
+ struct mapped_device *md = dm_table_get_md(ti->table);
+ int r = __noflush_suspending(md);
-int dm_bio_based(struct mapped_device *md)
-{
- return test_bit(DMF_BIO_BASED, &md->flags);
+ dm_put(md);
+
+ return r;
}
+EXPORT_SYMBOL_GPL(dm_noflush_suspending);
-void dm_set_request_based(struct mapped_device *md)
+int dm_init_md_mempool(struct mapped_device *md, int type)
{
- if (test_bit(DMF_REQUEST_BASED, &md->flags))
- return;
+ if (unlikely(type == DM_TYPE_NONE)) {
+ DMWARN("no type is specified, can't initialize mempool");
+ return -EINVAL;
+ }
- if (test_bit(DMF_INITIALIZED, &md->flags)) {
- DMERR("Cannot change to request based, already initialized");
- return;
+ if (md->mempool_type == type)
+ return 0;
+
+ if (md->map) {
+		/* The md is already in use; the mempool type can't be changed */
+ DMWARN("can't change mempool type after a table is bound");
+ return -EINVAL;
}
- set_bit(DMF_REQUEST_BASED, &md->flags);
- clear_bit(DMF_BIO_BASED, &md->flags);
-}
+	/* The md is not in use yet, so we can still change the mempool type */
+ if (md->mempool_type != DM_TYPE_NONE) {
+ mempool_destroy(md->io_pool);
+ md->io_pool = NULL;
+ mempool_destroy(md->tio_pool);
+ md->tio_pool = NULL;
+ bioset_free(md->bs);
+ md->bs = NULL;
+ md->mempool_type = DM_TYPE_NONE;
+ }
-int dm_noflush_suspending(struct dm_target *ti)
-{
- struct mapped_device *md = dm_table_get_md(ti->table);
- int r = __noflush_suspending(md);
+ md->io_pool = (type == DM_TYPE_BIO_BASED) ?
+ mempool_create_slab_pool(MIN_IOS, _io_cache) :
+ mempool_create_slab_pool(MIN_IOS, _bio_info_cache);
+ if (!md->io_pool)
+ return -ENOMEM;
- dm_put(md);
+ md->tio_pool = (type == DM_TYPE_BIO_BASED) ?
+ mempool_create_slab_pool(MIN_IOS, _tio_cache) :
+ mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+ if (!md->tio_pool)
+ goto free_io_pool_and_out;
- return r;
+ md->bs = (type == DM_TYPE_BIO_BASED) ?
+ bioset_create(16, 16) : bioset_create(MIN_IOS, MIN_IOS);
+ if (!md->bs)
+ goto free_tio_pool_and_out;
+
+ md->mempool_type = type;
+
+ return 0;
+
+free_tio_pool_and_out:
+ mempool_destroy(md->tio_pool);
+ md->tio_pool = NULL;
+
+free_io_pool_and_out:
+ mempool_destroy(md->io_pool);
+ md->io_pool = NULL;
+
+ return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(dm_noflush_suspending);
static struct block_device_operations dm_blk_dops = {
.open = dm_blk_open,
#define DM_SUSPEND_NOFLUSH_FLAG (1 << 1)
/*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE 0
+#define DM_TYPE_BIO_BASED 1
+#define DM_TYPE_REQUEST_BASED 2
+
+/*
* List of devices that a metadevice uses and should open/close.
*/
struct dm_dev {
void dm_table_postsuspend_targets(struct dm_table *t);
int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-void dm_table_unplug_all(struct dm_table *t);
-void dm_table_set_request_based(struct dm_table *t);
+int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+int dm_table_get_type(struct dm_table *t);
int dm_table_request_based(struct dm_table *t);
+void dm_table_unplug_all(struct dm_table *t);
/*
* To check the return value from dm_table_find_target().
void *param), void *param);
/*-----------------------------------------------------------------
- * Helper for block layer and dm core operations
- *---------------------------------------------------------------*/
-void dm_dispatch_request(struct request *rq);
-void dm_end_request(struct request *rq, int error);
-int dm_underlying_device_congested(struct request_queue *q);
-
-/*-----------------------------------------------------------------
* Useful inlines.
*---------------------------------------------------------------*/
static inline int array_too_big(unsigned long fixed, unsigned long obj,
void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
union map_info *dm_get_mapinfo(struct bio *bio);
-union map_info *dm_get_rq_mapinfo(struct request *rq);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
void dm_kobject_uevent(struct mapped_device *md);
void dm_kcopyd_exit(void);
/*
- * Initializer for request-based/bio-based device
+ * Mempool initializer for a mapped_device
*/
-int dm_set_md_request_based(struct mapped_device *md);
-int dm_set_md_bio_based(struct mapped_device *md);
-void dm_set_request_based(struct mapped_device *md);
-int dm_init_md(struct mapped_device *md);
+int dm_init_md_mempool(struct mapped_device *md, int type);
#endif
sbio->bi_size = r1_bio->sectors << 9;
sbio->bi_idx = 0;
sbio->bi_phys_segments = 0;
- sbio->bi_hw_segments = 0;
- sbio->bi_hw_front_size = 0;
- sbio->bi_hw_back_size = 0;
sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
sbio->bi_flags |= 1 << BIO_UPTODATE;
sbio->bi_next = NULL;
bio->bi_vcnt = 0;
bio->bi_idx = 0;
bio->bi_phys_segments = 0;
- bio->bi_hw_segments = 0;
bio->bi_size = 0;
bio->bi_end_io = NULL;
bio->bi_private = NULL;
tbio->bi_size = r10_bio->sectors << 9;
tbio->bi_idx = 0;
tbio->bi_phys_segments = 0;
- tbio->bi_hw_segments = 0;
- tbio->bi_hw_front_size = 0;
- tbio->bi_hw_back_size = 0;
tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
tbio->bi_flags |= 1 << BIO_UPTODATE;
tbio->bi_next = NULL;
bio->bi_vcnt = 0;
bio->bi_idx = 0;
bio->bi_phys_segments = 0;
- bio->bi_hw_segments = 0;
bio->bi_size = 0;
}
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
#endif
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_phys_segments(struct bio *bio)
+{
+ return bio->bi_phys_segments & 0xffff;
+}
+
+static inline int raid5_bi_hw_segments(struct bio *bio)
+{
+ return (bio->bi_phys_segments >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_phys_segments(struct bio *bio)
+{
+ --bio->bi_phys_segments;
+ return raid5_bi_phys_segments(bio);
+}
+
+static inline int raid5_dec_bi_hw_segments(struct bio *bio)
+{
+ unsigned short val = raid5_bi_hw_segments(bio);
+
+ --val;
+ bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
+ return val;
+}
+
+static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
+{
+	bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
+}
+
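A minimal user-space sketch of the 16/16-bit split the raid5_bi_*_segments() helpers above implement (illustration only, not part of the patch): the low half of bi_phys_segments carries the biased active-stripe count, the high half the processed-stripe count.

	#include <assert.h>

	static unsigned int pack_counts(unsigned short active, unsigned short processed)
	{
		return ((unsigned int)processed << 16) | active;
	}

	int main(void)
	{
		unsigned int v = pack_counts(1, 0);	/* one active stripe, none processed */

		assert((v & 0xffff) == 1);		/* low 16 bits: biased active count */
		assert(((v >> 16) & 0xffff) == 0);	/* high 16 bits: processed count */
		return 0;
	}

Packing both counters into the single remaining field is what allows bi_hw_segments to be dropped from struct bio later in this series.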
static inline int raid6_next_disk(int disk, int raid_disks)
{
disk++;
while (rbi && rbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
rbi2 = r5_next_bio(rbi, dev->sector);
- if (--rbi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(rbi)) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
if (*bip)
bi->bi_next = *bip;
*bip = bi;
- bi->bi_phys_segments ++;
+ bi->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
spin_unlock(&sh->lock);
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (--bi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (--bi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(bi)) {
md_write_end(conf->mddev);
bi->bi_next = *return_bi;
*return_bi = bi;
struct bio *nextbi =
r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
- if (--bi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(bi)) {
bi->bi_next = *return_bi;
*return_bi = bi;
}
while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
- if (--wbi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(wbi)) {
md_write_end(conf->mddev);
wbi->bi_next = *return_bi;
*return_bi = wbi;
copy_data(0, rbi, dev->page, dev->sector);
rbi2 = r5_next_bio(rbi, dev->sector);
spin_lock_irq(&conf->device_lock);
- if (--rbi->bi_phys_segments == 0) {
+ if (!raid5_dec_bi_phys_segments(rbi)) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
if(bi) {
conf->retry_read_aligned_list = bi->bi_next;
bi->bi_next = NULL;
+ /*
+	 * this sets the active stripe count to 1 and the processed
+	 * stripe count (upper 16 bits) to zero
+ */
bi->bi_phys_segments = 1; /* biased count of active stripes */
- bi->bi_hw_segments = 0; /* count of processed stripes */
}
return bi;
if ((bi->bi_size>>9) > q->max_sectors)
return 0;
blk_recount_segments(q, bi);
- if (bi->bi_phys_segments > q->max_phys_segments ||
- bi->bi_hw_segments > q->max_hw_segments)
+ if (bi->bi_phys_segments > q->max_phys_segments)
return 0;
if (q->merge_bvec_fn)
}
spin_lock_irq(&conf->device_lock);
- remaining = --bi->bi_phys_segments;
+ remaining = raid5_dec_bi_phys_segments(bi);
spin_unlock_irq(&conf->device_lock);
if (remaining == 0) {
sector += STRIPE_SECTORS,
scnt++) {
- if (scnt < raid_bio->bi_hw_segments)
+ if (scnt < raid5_bi_hw_segments(raid_bio))
/* already done this stripe */
continue;
if (!sh) {
/* failed to get a stripe - must wait */
- raid_bio->bi_hw_segments = scnt;
+ raid5_set_bi_hw_segments(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
}
set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
release_stripe(sh);
- raid_bio->bi_hw_segments = scnt;
+ raid5_set_bi_hw_segments(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
}
handled++;
}
spin_lock_irq(&conf->device_lock);
- remaining = --raid_bio->bi_phys_segments;
+ remaining = raid5_dec_bi_phys_segments(raid_bio);
spin_unlock_irq(&conf->device_lock);
if (remaining == 0)
bio_endio(raid_bio, 0);
return ftl_write((void *)dev, buf, block, 1);
}
+static int ftl_discardsect(struct mtd_blktrans_dev *dev,
+ unsigned long sector, unsigned nr_sects)
+{
+ partition_t *part = (void *)dev;
+ uint32_t bsize = 1 << part->header.EraseUnitSize;
+
+ DEBUG(1, "FTL erase sector %ld for %d sectors\n",
+ sector, nr_sects);
+
+ while (nr_sects) {
+ uint32_t old_addr = part->VirtualBlockMap[sector];
+ if (old_addr != 0xffffffff) {
+ part->VirtualBlockMap[sector] = 0xffffffff;
+ part->EUNInfo[old_addr/bsize].Deleted++;
+ if (set_bam_entry(part, old_addr, 0))
+ return -EIO;
+ }
+ nr_sects--;
+ sector++;
+ }
+
+ return 0;
+}
/*====================================================================*/
static void ftl_freepart(partition_t *part)
.blksize = SECTOR_SIZE,
.readsect = ftl_readsect,
.writesect = ftl_writesect,
+ .discard = ftl_discardsect,
.getgeo = ftl_getgeo,
.add_mtd = ftl_add_mtd,
.remove_dev = ftl_remove_dev,
spinlock_t queue_lock;
};
+static int blktrans_discard_request(struct request_queue *q,
+ struct request *req)
+{
+ req->cmd_type = REQ_TYPE_LINUX_BLOCK;
+ req->cmd[0] = REQ_LB_OP_DISCARD;
+ return 0;
+}
+
static int do_blktrans_request(struct mtd_blktrans_ops *tr,
struct mtd_blktrans_dev *dev,
struct request *req)
buf = req->buffer;
+ if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
+ req->cmd[0] == REQ_LB_OP_DISCARD)
+ return !tr->discard(dev, block, nsect);
+
if (!blk_fs_request(req))
return 0;
tr->blkcore_priv->rq->queuedata = tr;
blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+ if (tr->discard)
+ blk_queue_set_discard(tr->blkcore_priv->rq,
+ blktrans_discard_request);
+
tr->blkshift = ffs(tr->blksize) - 1;
tr->blkcore_priv->thread = kthread_run(mtd_blktrans_thread, tr,
spin_unlock_irqrestore(shost->host_lock, flags);
}
+static inline int scsi_device_is_busy(struct scsi_device *sdev)
+{
+ if (sdev->device_busy >= sdev->queue_depth || sdev->device_blocked)
+ return 1;
+
+ return 0;
+}
+
static inline int scsi_target_is_busy(struct scsi_target *starget)
{
return ((starget->can_queue > 0 &&
starget->target_blocked);
}
+static inline int scsi_host_is_busy(struct Scsi_Host *shost)
+{
+ if ((shost->can_queue > 0 && shost->host_busy >= shost->can_queue) ||
+ shost->host_blocked || shost->host_self_blocked)
+ return 1;
+
+ return 0;
+}
+
/*
* Function: scsi_run_queue()
*
scsi_single_lun_run(sdev);
spin_lock_irqsave(shost->host_lock, flags);
- while (!list_empty(&shost->starved_list) &&
- !shost->host_blocked && !shost->host_self_blocked &&
- !((shost->can_queue > 0) &&
- (shost->host_busy >= shost->can_queue))) {
-
+ while (!list_empty(&shost->starved_list) && !scsi_host_is_busy(shost)) {
int flagset;
/*
static inline int scsi_dev_queue_ready(struct request_queue *q,
struct scsi_device *sdev)
{
- if (sdev->device_busy >= sdev->queue_depth)
- return 0;
if (sdev->device_busy == 0 && sdev->device_blocked) {
/*
* unblock after device_blocked iterates to zero
return 0;
}
}
- if (sdev->device_blocked)
+ if (scsi_device_is_busy(sdev))
return 0;
return 1;
return 0;
}
}
- if ((shost->can_queue > 0 && shost->host_busy >= shost->can_queue) ||
- shost->host_blocked || shost->host_self_blocked) {
+ if (scsi_host_is_busy(shost)) {
if (list_empty(&sdev->starved_entry))
list_add_tail(&sdev->starved_entry, &shost->starved_list);
return 0;
}
/*
+ * Busy state exporting function for request stacking drivers.
+ *
+ * For efficiency, no lock is taken to check the busy state of
+ * shost/starget/sdev, since the returned value is not guaranteed anyway:
+ * it may change right after request stacking drivers call the function,
+ * whether or not a lock is held.
+ *
+ * When scsi can't dispatch I/Os anymore and needs to kill I/Os
+ * (e.g. !sdev), scsi needs to return 'not busy'.
+ * Otherwise, request stacking drivers may hold requests forever.
+ */
+static int scsi_lld_busy(struct request_queue *q)
+{
+ struct scsi_device *sdev = q->queuedata;
+ struct Scsi_Host *shost;
+ struct scsi_target *starget;
+
+ if (!sdev)
+ return 0;
+
+ shost = sdev->host;
+ starget = scsi_target(sdev);
+
+ if (scsi_host_in_recovery(shost) || scsi_host_is_busy(shost) ||
+ scsi_target_is_busy(starget) || scsi_device_is_busy(sdev))
+ return 1;
+
+ return 0;
+}
+
+/*
* Kill a request for a dead device
*/
static void scsi_kill_request(struct request *req, struct request_queue *q)
* accept it.
*/
req = elv_next_request(q);
- if (!req)
+ if (!req || !scsi_dev_queue_ready(q, sdev))
break;
- if (!scsi_dev_queue_ready(q, sdev)) {
- blk_set_lld_busy(q);
- break;
- }
-
if (unlikely(!scsi_device_online(sdev))) {
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to offline device\n");
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
if(rtn) {
- blk_set_lld_busy(q);
-
/* we're refusing the command; because of
* the way locks get dropped, we need to
* check here if plugging is required */
break;
}
- blk_clear_lld_busy(q);
}
goto out;
* later time.
*/
spin_lock_irq(q->queue_lock);
- blk_set_lld_busy(q);
blk_requeue_request(q, req);
sdev->device_busy--;
if(sdev->device_busy == 0)
blk_queue_prep_rq(q, scsi_prep_fn);
blk_queue_softirq_done(q, scsi_softirq_done);
blk_queue_rq_timed_out(q, scsi_times_out);
+ blk_queue_lld_busy(q, scsi_lld_busy);
return q;
}
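A hedged sketch of how a low-level driver could export its busy state through the new blk_queue_lld_busy() hook, mirroring scsi_lld_busy() above. The mydrv_* structure and fields are hypothetical, not part of the patch.

	#include <linux/blkdev.h>

	struct mydrv_device {			/* hypothetical per-device state */
		unsigned int inflight;
		unsigned int queue_depth;
	};

	static int mydrv_lld_busy(struct request_queue *q)
	{
		struct mydrv_device *dev = q->queuedata;

		/* a dying queue must report "not busy" so stacked requests aren't held forever */
		if (!dev)
			return 0;

		return dev->inflight >= dev->queue_depth;
	}

	/* registered once at queue setup: blk_queue_lld_busy(q, mydrv_lld_busy); */

Request stacking drivers then query this state via blk_lld_busy() on the underlying queue (declared in the blkdev.h hunk further down) before dispatching a cloned request to it.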
extern int scsi_maybe_unblock_host(struct scsi_device *sdev);
extern void scsi_device_unbusy(struct scsi_device *sdev);
extern int scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
-extern void scsi_queue_retry(struct scsi_cmnd *cmd, int reason);
extern void scsi_next_command(struct scsi_cmnd *cmd);
extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
extern void scsi_run_host_queues(struct Scsi_Host *shost);
{
memset(bio, 0, sizeof(*bio));
bio->bi_flags = 1 << BIO_UPTODATE;
+ bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1);
}
return bio->bi_phys_segments;
}
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_hw_segments;
-}
-
/**
* __bio_clone - clone a bio
* @bio: destination bio
*/
while (bio->bi_phys_segments >= q->max_phys_segments
- || bio->bi_hw_segments >= q->max_hw_segments
- || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+ || bio->bi_phys_segments >= q->max_hw_segments) {
if (retried_segments)
return 0;
}
/* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
- BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+ if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio->bi_flags &= ~(1 << BIO_SEG_VALID);
bio->bi_vcnt++;
bio->bi_phys_segments++;
- bio->bi_hw_segments++;
done:
bio->bi_size += len;
return len;
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
struct fatent_operations {
void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
struct fat_entry fatent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, err, nr_bhs;
+ int first_cl = cluster;
nr_bhs = 0;
fatent_init(&fatent);
goto error;
}
+ /*
+ * Issue discard for the sectors we no longer care about,
+ * batching contiguous clusters into one request
+ */
+ if (cluster != fatent.entry + 1) {
+ int nr_clus = fatent.entry - first_cl + 1;
+
+ sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+ nr_clus * sbi->sec_per_clus);
+ first_cl = cluster;
+ }
+
ops->ent_put(&fatent, FAT_ENT_FREE);
if (sbi->free_clusters != -1) {
sbi->free_clusters++;
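The batching above merges runs of contiguous freed clusters into single discard requests. A small user-space sketch of the same run-merging rule, using an invented cluster chain (illustration only, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		int chain[] = { 8, 9, 10, 20, 21 };	/* invented cluster chain */
		int n = sizeof(chain) / sizeof(chain[0]);
		int first = chain[0], prev = chain[0];
		int i;

		for (i = 1; i < n; i++) {
			if (chain[i] != prev + 1) {	/* run broken: flush what we have */
				printf("discard clusters %d-%d\n", first, prev);
				first = chain[i];
			}
			prev = chain[i];
		}
		printf("discard clusters %d-%d\n", first, prev);	/* final run */
		return 0;
	}

For this chain the sketch emits one discard for clusters 8-10 and one for 20-21 instead of five single-cluster requests.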
#ifdef CONFIG_BLOCK
-/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
#include <asm/io.h>
-#if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY)
-#define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1))
-#define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE)
-#else
-#define BIOVEC_VIRT_START_SIZE(x) 0
-#define BIOVEC_VIRT_OVERSIZE(x) 0
-#endif
-
-#ifndef BIO_VMERGE_BOUNDARY
-#define BIO_VMERGE_BOUNDARY 0
-#endif
-
#define BIO_DEBUG
#ifdef BIO_DEBUG
/* Number of segments in this BIO after
* physical address coalescing is performed.
*/
- unsigned short bi_phys_segments;
-
- /* Number of segments after physical and DMA remapping
- * hardware coalescing is performed.
- */
- unsigned short bi_hw_segments;
+ unsigned int bi_phys_segments;
unsigned int bi_size; /* residual I/O count */
- /*
- * To keep track of the max hw size, we account for the
- * sizes of the first and last virtually mergeable segments
- * in this bio
- */
- unsigned int bi_hw_front_size;
- unsigned int bi_hw_back_size;
-
unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
+ unsigned int bi_comp_cpu; /* completion CPU */
+
struct bio_vec *bi_io_vec; /* the actual vec list */
bio_end_io_t *bi_end_io;
#define BIO_UPTODATE 0 /* ok after I/O completion */
#define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
#define BIO_EOF 2 /* out-of-bounds error */
-#define BIO_SEG_VALID 3 /* nr_hw_seg valid */
+#define BIO_SEG_VALID 3 /* bi_phys_segments valid */
#define BIO_CLONED 4 /* doesn't own data */
#define BIO_BOUNCED 5 /* bio is a bounce bio */
#define BIO_USER_MAPPED 6 /* contains user pages */
#define BIO_EOPNOTSUPP 7 /* not supported */
+#define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */
#define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
/*
/*
* bio bi_rw flags
*
- * bit 0 -- read (not set) or write (set)
+ * bit 0 -- data direction
+ * If not set, bio is a read from device. If set, it's a write to device.
* bit 1 -- rw-ahead when set
* bit 2 -- barrier
+ * Insert a serialization point in the IO queue, forcing previously
+ * submitted IO to be completed before this one is issued.
* bit 3 -- synchronous I/O hint: the block layer will unplug immediately
+ * Note that this does NOT indicate that the IO itself is sync, just
+ * that the block layer will not postpone issue of this IO by plugging.
* bit 4 -- metadata request
+ * Used for tracing to differentiate metadata and data IO. May also
+ * get some preferential treatment in the IO scheduler.
* bit 5 -- discard sectors
+ * Informs the lower level device that this range of sectors is no longer
+ * used by the file system and may thus be freed by the device. Used
+ * for flash-based storage.
* bit 6 -- fail fast device errors
* bit 7 -- fail fast transport errors
* bit 8 -- fail fast driver errors
+ * Don't want driver retries for any fast-fail case, whatever the reason.
*/
-#define BIO_RW 0
-#define BIO_RW_AHEAD 1
+#define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */
+#define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */
#define BIO_RW_BARRIER 2
#define BIO_RW_SYNC 3
#define BIO_RW_META 4
#define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER))
#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
-#define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size)
+#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD))
+#define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
static inline unsigned int bio_cur_sectors(struct bio *bio)
{
if (bio->bi_vcnt)
return bio_iovec(bio)->bv_len >> 9;
-
- return 0;
+ else /* dataless requests such as discard */
+ return bio->bi_size >> 9;
}
static inline void *bio_data(struct bio *bio)
((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
#endif
-#define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \
- ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
#define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
#define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
extern void bio_endio(struct bio *, int);
struct request_queue;
extern int bio_phys_segments(struct request_queue *, struct bio *);
-extern int bio_hw_segments(struct request_queue *, struct bio *);
extern void __bio_clone(struct bio *, struct bio *);
extern struct bio *bio_clone(struct bio *, gfp_t);
extern unsigned int bvec_nr_vecs(unsigned short idx);
/*
+ * Allow queuer to specify a completion CPU for this bio
+ */
+static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu)
+{
+ bio->bi_comp_cpu = cpu;
+}
+
+/*
* bio_set is used to allow other portions of the IO system to
* allocate their own private memory pools for bio and iovec structures.
* These memory pools in turn all allocate from the bio_slab
__bio_kmap_irq((bio), (bio)->bi_idx, (flags))
#define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
+/*
+ * Check whether this bio carries any data or not. A NULL bio is allowed.
+ */
+static inline int bio_has_data(struct bio *bio)
+{
+ return bio && bio->bi_io_vec != NULL;
+}
+
#if defined(CONFIG_BLK_DEV_INTEGRITY)
#define bip_vec_idx(bip, idx) (&(bip->bip_vec[(idx)]))
#include <linux/module.h>
#include <linux/stringify.h>
#include <linux/bsg.h>
+#include <linux/smp.h>
#include <asm/scatterlist.h>
REQ_TYPE_PM_SUSPEND, /* suspend request */
REQ_TYPE_PM_RESUME, /* resume request */
REQ_TYPE_PM_SHUTDOWN, /* shutdown request */
- REQ_TYPE_FLUSH, /* flush request */
REQ_TYPE_SPECIAL, /* driver defined type */
REQ_TYPE_LINUX_BLOCK, /* generic block layer message */
/*
*
*/
enum {
- /*
- * just examples for now
- */
REQ_LB_OP_EJECT = 0x40, /* eject request */
- REQ_LB_OP_FLUSH = 0x41, /* flush device */
+ REQ_LB_OP_FLUSH = 0x41, /* flush request */
+ REQ_LB_OP_DISCARD = 0x42, /* discard sectors */
};
/*
- * request type modified bits. first three bits match BIO_RW* bits, important
+ * request type modified bits. first two bits match BIO_RW* bits, important
*/
enum rq_flag_bits {
__REQ_RW, /* not set, read. set, write */
__REQ_FAILFAST_DEV, /* no driver retries of device errors */
__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
__REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */
+ __REQ_DISCARD, /* request to discard sectors */
__REQ_SORTED, /* elevator knows about this request */
__REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
__REQ_HARDBARRIER, /* may not be passed by drive either */
#define REQ_FAILFAST_DEV (1 << __REQ_FAILFAST_DEV)
#define REQ_FAILFAST_TRANSPORT (1 << __REQ_FAILFAST_TRANSPORT)
#define REQ_FAILFAST_DRIVER (1 << __REQ_FAILFAST_DRIVER)
+#define REQ_DISCARD (1 << __REQ_DISCARD)
#define REQ_SORTED (1 << __REQ_SORTED)
#define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
#define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER)
*/
struct request {
struct list_head queuelist;
- struct list_head donelist;
+ struct call_single_data csd;
+ int cpu;
struct request_queue *q;
*/
unsigned short nr_phys_segments;
- /* Number of scatter-gather addr+len pairs after
- * physical and DMA remapping hardware coalescing is performed.
- * This is the number of scatter-gather entries the driver
- * will actually have to deal with after DMA mapping is done.
- */
- unsigned short nr_hw_segments;
-
unsigned short ioprio;
void *special;
struct request *next_rq;
};
+static inline unsigned short req_get_ioprio(struct request *req)
+{
+ return req->ioprio;
+}
+
/*
* State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
* requests. Some step values could eventually be made generic.
typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
typedef int (prep_rq_fn) (struct request_queue *, struct request *);
typedef void (unplug_fn) (struct request_queue *);
+typedef int (prepare_discard_fn) (struct request_queue *, struct request *);
struct bio_vec;
struct bvec_merge_data {
typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
typedef void (softirq_done_fn)(struct request *);
typedef int (dma_drain_needed_fn)(struct request *);
+typedef int (lld_busy_fn) (struct request_queue *q);
enum blk_eh_timer_return {
BLK_EH_NOT_HANDLED,
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
unplug_fn *unplug_fn;
+ prepare_discard_fn *prepare_discard_fn;
merge_bvec_fn *merge_bvec_fn;
prepare_flush_fn *prepare_flush_fn;
softirq_done_fn *softirq_done_fn;
rq_timed_out_fn *rq_timed_out_fn;
dma_drain_needed_fn *dma_drain_needed;
+ lld_busy_fn *lld_busy_fn;
/*
* Dispatch queue sorting
#define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
#define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
#define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */
-#define QUEUE_FLAG_BUSY 11 /* device/host under queue is busy */
+#define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */
+#define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */
static inline int queue_is_locked(struct request_queue *q)
{
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_flushing(q) ((q)->ordseq)
-#define blk_lld_busy(q) test_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
-#define blk_set_lld_busy(q) set_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
-#define blk_clear_lld_busy(q) clear_bit(QUEUE_FLAG_BUSY, &(q)->queue_flags)
+#define blk_queue_stackable(q) \
+ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
#define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
#define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
blk_failfast_driver(rq))
#define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED)
-#define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
+#define blk_account_rq(rq) (blk_rq_started(rq) && (blk_fs_request(rq) || blk_discard_rq(rq)))
#define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
#define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME)
#define blk_pm_request(rq) \
(blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
+#define blk_rq_cpu_valid(rq) ((rq)->cpu != -1)
#define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
#define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
#define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
+#define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD)
#define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
/* rq->queuelist of dequeued request must be list_empty() */
#define RQ_NOMERGE_FLAGS \
(REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
#define rq_mergeable(rq) \
- (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
+ (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
+ (blk_discard_rq(rq) || blk_fs_request((rq))))
/*
* q->prep_rq_fn return values
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
extern void blk_requeue_request(struct request_queue *, struct request *);
-extern void blk_submit_request(struct request_queue *q, struct request *rq);
+extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
+extern int blk_lld_busy(struct request_queue *q);
+extern int blk_insert_cloned_request(struct request_queue *q,
+ struct request *rq);
extern void blk_plug_device(struct request_queue *);
extern void blk_plug_device_unlocked(struct request_queue *);
extern int blk_remove_plug(struct request_queue *);
extern int blk_queue_dma_drain(struct request_queue *q,
dma_drain_needed_fn *dma_drain_needed,
void *buf, unsigned int size);
+extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
extern void blk_queue_dma_alignment(struct request_queue *, int);
extern void blk_queue_update_dma_alignment(struct request_queue *, int);
extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
+extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *);
extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
}
extern int blkdev_issue_flush(struct block_device *, sector_t *);
+extern int blkdev_issue_discard(struct block_device *, sector_t sector,
+ unsigned nr_sects);
+
+static inline int sb_issue_discard(struct super_block *sb,
+ sector_t block, unsigned nr_blocks)
+{
+ block <<= (sb->s_blocksize_bits - 9);
+ nr_blocks <<= (sb->s_blocksize_bits - 9);
+ return blkdev_issue_discard(sb->s_bdev, block, nr_blocks);
+}
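sb_issue_discard() converts filesystem blocks into 512-byte sectors by shifting with (s_blocksize_bits - 9). A quick worked example, assuming 4 KiB blocks, i.e. s_blocksize_bits == 12 (illustration only):

	#include <stdio.h>

	int main(void)
	{
		unsigned int blocksize_bits = 12;	/* 4 KiB filesystem blocks (assumption) */

		/* the same shift sb_issue_discard() applies to block and nr_blocks */
		printf("sector %llu nr_sects %u\n",
		       1000ULL << (blocksize_bits - 9),	/* 8000 */
		       16U << (blocksize_bits - 9));	/* 128 */
		return 0;
	}

So a 16-block run starting at block 1000 becomes a discard of 128 sectors starting at sector 8000.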
/*
* command filter functions
}
struct work_struct;
-int kblockd_schedule_work(struct work_struct *work);
+int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
void kblockd_flush_work(struct work_struct *work);
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
BLK_TC_NOTIFY = 1 << 10, /* special message */
BLK_TC_AHEAD = 1 << 11, /* readahead */
BLK_TC_META = 1 << 12, /* metadata */
- BLK_TC_DRV_DATA = 1 << 13, /* binary per-driver data */
+ BLK_TC_DISCARD = 1 << 13, /* discard requests */
+ BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */
BLK_TC_END = 1 << 15, /* only 16-bits, reminder */
};
#define BLK_TA_BOUNCE (__BLK_TA_BOUNCE)
#define BLK_TA_REMAP (__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
#define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE))
-#define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
+#define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA))
#define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY))
#define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY))
if (likely(!bt))
return;
+ if (blk_discard_rq(rq))
+ rw |= (1 << BIO_RW_DISCARD);
+
if (blk_pc_request(rq)) {
what |= BLK_TC_ACT(BLK_TC_PC);
__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
#include <linux/bio.h>
#include <linux/blkdev.h>
-struct request;
struct dm_target;
struct dm_table;
struct dm_dev;
*/
typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
union map_info *map_context);
-
typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
union map_info *map_context);
typedef int (*dm_endio_fn) (struct dm_target *ti,
struct bio *bio, int error,
union map_info *map_context);
-
typedef int (*dm_request_endio_fn) (struct dm_target *ti,
struct request *clone, int error,
union map_info *map_context);
typedef int (*dm_ioctl_fn) (struct dm_target *ti, struct inode *inode,
struct file *filp, unsigned int cmd,
unsigned long arg);
-typedef int (*dm_congested_fn) (struct dm_target *ti);
typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
struct bio_vec *biovec, int max_size);
+/*
+ * Returns:
+ * 0: The target can handle the next I/O immediately.
+ * 1: The target can't handle the next I/O immediately.
+ */
+typedef int (*dm_busy_fn) (struct dm_target *ti);
+
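For reference, a minimal hedged sketch (not from the patch) of a target busy callback following the return convention documented above; a target that can always accept the next request simply returns 0:

	static int example_target_busy(struct dm_target *ti)
	{
		/* 0: ready for the next I/O now; 1 would ask dm core to hold back */
		return 0;
	}

The callback is wired into the target type through the new busy member shown in the hunk just below.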
void dm_error(const char *message);
/*
dm_message_fn message;
dm_ioctl_fn ioctl;
dm_merge_fn merge;
- dm_congested_fn congested;
+ dm_busy_fn busy;
};
struct io_restrictions {
unsigned short max_hw_segments;
unsigned short max_phys_segments;
unsigned char no_cluster; /* inverted so that 0 is default */
- unsigned char no_request_stacking; /* inverted so that 0 is default */
+ unsigned char no_request_stacking;
};
struct dm_target {
int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
struct gendisk *dm_disk(struct mapped_device *md);
int dm_suspended(struct mapped_device *md);
-int dm_request_based(struct mapped_device *md);
int dm_noflush_suspending(struct dm_target *ti);
/*
return (n << SECTOR_SHIFT);
}
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_request(struct request *rq);
+void dm_kill_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
+
#endif /* _LINUX_DEVICE_MAPPER_H */
#define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
/*
- * Hack to reuse the donelist list_head as the fifo time holder while
+ * Hack to reuse the csd.list list_head as the fifo time holder while
* the request is in the io scheduler. Saves an unsigned long in rq.
*/
-#define rq_fifo_time(rq) ((unsigned long) (rq)->donelist.next)
-#define rq_set_fifo_time(rq,exp) ((rq)->donelist.next = (void *) (exp))
+#define rq_fifo_time(rq) ((unsigned long) (rq)->csd.list.next)
+#define rq_set_fifo_time(rq,exp) ((rq)->csd.list.next = (void *) (exp))
#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
#define rq_fifo_clear(rq) do { \
list_del_init(&(rq)->queuelist); \
- INIT_LIST_HEAD(&(rq)->donelist); \
+ INIT_LIST_HEAD(&(rq)->csd.list); \
} while (0)
/*
#define READ_META (READ | (1 << BIO_RW_META))
#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC))
-#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
+#define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER))
+#define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
+#define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
#define SEL_IN 1
#define SEL_OUT 2
#define BLKTRACESTART _IO(0x12,116)
#define BLKTRACESTOP _IO(0x12,117)
#define BLKTRACETEARDOWN _IO(0x12,118)
+#define BLKDISCARD _IO(0x12,119)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
unsigned long block, char *buffer);
int (*writesect)(struct mtd_blktrans_dev *dev,
unsigned long block, char *buffer);
+ int (*discard)(struct mtd_blktrans_dev *dev,
+ unsigned long block, unsigned nr_blocks);
/* Block layer ioctls */
int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);
rq.errors,
(unsigned long long)rq.sector, rq.nr_sectors);
- kdb_printf(" hsect %llu hnrsect %lu nrseg %u nrhwseg %u currnrsect %u\n",
+ kdb_printf(" hsect %llu hnrsect %lu nrseg %u currnrsect %u\n",
(unsigned long long)rq.hard_sector, rq.hard_nr_sectors,
- rq.nr_phys_segments, rq.nr_hw_segments,
- rq.current_nr_sectors);
+ rq.nr_phys_segments, rq.current_nr_sectors);
return (unsigned long) rq.queuelist.next;
}
/*
* Data-less bio, nothing to bounce
*/
- if (bio_empty_barrier(*bio_orig))
+ if (!bio_has_data(*bio_orig))
return;
/*