UBUNTU: Ubuntu-2.6.38-12.51
[linux-flexiantxendom0-natty.git] / block / blk-core.c
index d316662..518dd42 100644 (file)
@@ -33,7 +33,7 @@
 
 #include "blk.h"
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
@@ -64,13 +64,27 @@ static void drive_stat_acct(struct request *rq, int new_io)
                return;
 
        cpu = part_stat_lock();
-       part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
-       if (!new_io)
+       if (!new_io) {
+               part = rq->part;
                part_stat_inc(cpu, part, merges[rw]);
-       else {
+       } else {
+               part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+               if (!hd_struct_try_get(part)) {
+                       /*
+                        * The partition is already being removed,
+                        * the request will be accounted on the disk only
+                        *
+                        * We take a reference on disk->part0 although that
+                        * partition will never be deleted, so we can treat
+                        * it as any other partition.
+                        */
+                       part = &rq->rq_disk->part0;
+                       hd_struct_get(part);
+               }
                part_round_stats(cpu, part);
                part_inc_in_flight(part, rw);
+               rq->part = part;
        }
 
        part_stat_unlock();
@@ -128,6 +142,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->ref_count = 1;
        rq->start_time = jiffies;
        set_start_time_ns(rq);
+       rq->part = NULL;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -136,7 +151,7 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
 {
        struct request_queue *q = rq->q;
 
-       if (&q->bar_rq != rq) {
+       if (&q->flush_rq != rq) {
                if (error)
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
@@ -160,13 +175,12 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
                if (bio->bi_size == 0)
                        bio_endio(bio, error);
        } else {
-
                /*
-                * Okay, this is the barrier request in progress, just
-                * record the error;
+                * Okay, this is the sequenced flush request in
+                * progress, just record the error;
                 */
-               if (error && !q->orderr)
-                       q->orderr = error;
+               if (error && !q->flush_err)
+                       q->flush_err = error;
        }
 }
 
@@ -338,7 +352,7 @@ void blk_start_queue(struct request_queue *q)
        WARN_ON(!irqs_disabled());
 
        queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-       __blk_run_queue(q);
+       __blk_run_queue(q, false);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -382,19 +396,21 @@ void blk_sync_queue(struct request_queue *q)
        del_timer_sync(&q->unplug_timer);
        del_timer_sync(&q->timeout);
        cancel_work_sync(&q->unplug_work);
+       throtl_shutdown_timer_wq(q);
 }
 EXPORT_SYMBOL(blk_sync_queue);
 
 /**
  * __blk_run_queue - run a single device queue
  * @q: The queue to run
+ * @force_kblockd: Don't run @q->request_fn directly.  Use kblockd.
  *
  * Description:
  *    See @blk_run_queue. This variant must be called with the queue lock
  *    held and interrupts disabled.
  *
  */
-void __blk_run_queue(struct request_queue *q)
+void __blk_run_queue(struct request_queue *q, bool force_kblockd)
 {
        blk_remove_plug(q);
 
@@ -408,7 +424,7 @@ void __blk_run_queue(struct request_queue *q)
         * Only recurse once to avoid overrunning the stack, let the unplug
         * handling reinvoke the handler shortly if we already got there.
         */
-       if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+       if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
                q->request_fn(q);
                queue_flag_clear(QUEUE_FLAG_REENTER, q);
        } else {
@@ -431,7 +447,7 @@ void blk_run_queue(struct request_queue *q)
        unsigned long flags;
 
        spin_lock_irqsave(q->queue_lock, flags);
-       __blk_run_queue(q);
+       __blk_run_queue(q, false);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(blk_run_queue);
@@ -515,12 +531,17 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
                return NULL;
        }
 
+       if (blk_throtl_init(q)) {
+               kmem_cache_free(blk_requestq_cachep, q);
+               return NULL;
+       }
+
        setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
                    laptop_mode_timer_fn, (unsigned long) q);
        init_timer(&q->unplug_timer);
        setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
        INIT_LIST_HEAD(&q->timeout_list);
-       INIT_LIST_HEAD(&q->pending_barriers);
+       INIT_LIST_HEAD(&q->pending_flushes);
        INIT_WORK(&q->unplug_work, blk_unplug_work);
 
        kobject_init(&q->kobj, &blk_queue_ktype);
@@ -1033,7 +1054,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 
        drive_stat_acct(rq, 1);
        __elv_add_request(q, rq, where, 0);
-       __blk_run_queue(q);
+       __blk_run_queue(q, false);
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(blk_insert_request);
@@ -1183,19 +1204,12 @@ static int __make_request(struct request_queue *q, struct bio *bio)
        int el_ret;
        unsigned int bytes = bio->bi_size;
        const unsigned short prio = bio_prio(bio);
-       const bool sync = (bio->bi_rw & REQ_SYNC);
-       const bool unplug = (bio->bi_rw & REQ_UNPLUG);
-       const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
+       const bool sync = !!(bio->bi_rw & REQ_SYNC);
+       const bool unplug = !!(bio->bi_rw & REQ_UNPLUG);
+       const unsigned long ff = bio->bi_rw & REQ_FAILFAST_MASK;
        int where = ELEVATOR_INSERT_SORT;
        int rw_flags;
 
-       /* REQ_HARDBARRIER is no more */
-       if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
-               "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
-               bio_endio(bio, -EOPNOTSUPP);
-               return 0;
-       }
-
        /*
         * low level driver can indicate that it wants pages above a
         * certain limit bounced to low memory (ie for highmem, or even
@@ -1205,7 +1219,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
        spin_lock_irq(q->queue_lock);
 
-       if (bio->bi_rw & REQ_HARDBARRIER) {
+       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
                where = ELEVATOR_INSERT_FRONT;
                goto get_rq;
        }
@@ -1331,9 +1345,9 @@ static inline void blk_partition_remap(struct bio *bio)
                bio->bi_sector += p->start_sect;
                bio->bi_bdev = bdev->bd_contains;
 
-               trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev,
-                                   bio->bi_sector - p->start_sect);
+               trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+                                     bdev->bd_dev,
+                                     bio->bi_sector - p->start_sect);
        }
 }
 
@@ -1346,7 +1360,7 @@ static void handle_bad_sector(struct bio *bio)
                        bdevname(bio->bi_bdev, b),
                        bio->bi_rw,
                        (unsigned long long)bio->bi_sector + bio_sectors(bio),
-                       (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
+                       (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
 
        set_bit(BIO_EOF, &bio->bi_flags);
 }
@@ -1399,7 +1413,7 @@ static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
                return 0;
 
        /* Test device or partition size, when known. */
-       maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
+       maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
        if (maxsector) {
                sector_t sector = bio->bi_sector;
 
@@ -1502,7 +1516,7 @@ static inline void __generic_make_request(struct bio *bio)
                        goto end_io;
 
                if (old_sector != -1)
-                       trace_block_remap(q, bio, old_dev, old_sector);
+                       trace_block_bio_remap(q, bio, old_dev, old_sector);
 
                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;
@@ -1510,6 +1524,19 @@ static inline void __generic_make_request(struct bio *bio)
                if (bio_check_eod(bio, nr_sectors))
                        goto end_io;
 
+               /*
+                * Filter flush bio's early so that make_request based
+                * drivers without flush support don't have to worry
+                * about them.
+                */
+               if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
+                       bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
+                       if (!nr_sectors) {
+                               err = 0;
+                               goto end_io;
+                       }
+               }
+
                if ((bio->bi_rw & REQ_DISCARD) &&
                    (!blk_queue_discard(q) ||
                     ((bio->bi_rw & REQ_SECURE) &&
@@ -1518,6 +1545,15 @@ static inline void __generic_make_request(struct bio *bio)
                        goto end_io;
                }
 
+               blk_throtl_bio(q, &bio);
+
+               /*
+                * If bio = NULL, bio has been throttled and will be submitted
+                * later.
+                */
+               if (!bio)
+                       break;
+
                trace_block_bio_queue(q, bio);
 
                ret = q->make_request_fn(q, bio);
@@ -1608,11 +1644,12 @@ void submit_bio(int rw, struct bio *bio)
 
                if (unlikely(block_dump)) {
                        char b[BDEVNAME_SIZE];
-                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
+                       printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
                        current->comm, task_pid_nr(current),
                                (rw & WRITE) ? "WRITE" : "READ",
                                (unsigned long long)bio->bi_sector,
-                               bdevname(bio->bi_bdev, b));
+                               bdevname(bio->bi_bdev, b),
+                               count);
                }
        }
 
@@ -1633,7 +1670,7 @@ EXPORT_SYMBOL(submit_bio);
  *    the insertion using this generic function.
  *
  *    This function should also be useful for request stacking drivers
- *    in some cases below, so export this fuction.
+ *    in some cases below, so export this function.
  *    Request stacking drivers like request-based dm may change the queue
  *    limits while requests are in the queue (e.g. dm's table swapping).
  *    Such request stacking drivers should check those requests agaist
@@ -1755,7 +1792,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
                int cpu;
 
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
                part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                part_stat_unlock();
        }
@@ -1764,24 +1801,25 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 static void blk_account_io_done(struct request *req)
 {
        /*
-        * Account IO completion.  bar_rq isn't accounted as a normal
-        * IO on queueing nor completion.  Accounting the containing
-        * request is enough.
+        * Account IO completion.  flush_rq isn't accounted as a
+        * normal IO on queueing nor completion.  Accounting the
+        * containing request is enough.
         */
-       if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
+       if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
                unsigned long duration = jiffies - req->start_time;
                const int rw = rq_data_dir(req);
                struct hd_struct *part;
                int cpu;
 
                cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
 
                part_stat_inc(cpu, part, ios[rw]);
                part_stat_add(cpu, part, ticks[rw], duration);
                part_round_stats(cpu, part);
                part_dec_in_flight(part, rw);
 
+               hd_struct_put(part);
                part_stat_unlock();
        }
 }
@@ -2493,9 +2531,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
        dst->cpu = src->cpu;
-       dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
-       if (src->cmd_flags & REQ_DISCARD)
-               dst->cmd_flags |= REQ_DISCARD;
+       dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
        dst->cmd_type = src->cmd_type;
        dst->__sector = blk_rq_pos(src);
        dst->__data_len = blk_rq_bytes(src);
@@ -2580,7 +2616,9 @@ int __init blk_dev_init(void)
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
                        sizeof(((struct request *)0)->cmd_flags));
 
-       kblockd_workqueue = create_workqueue("kblockd");
+       /* used for unplugging and affects IO latency/throughput - HIGHPRI */
+       kblockd_workqueue = alloc_workqueue("kblockd",
+                                           WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");