/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>

static const char *_name = DM_NAME;

#define MAX_DEVICES 1024

static unsigned int major = 0;
static unsigned int _major = 0;

/*
 * One of these is allocated per original bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	struct bio *bio;
	atomic_t io_count;
};

/*
 * One of these is allocated per bio deferred while suspended.
 */
struct deferred_io {
	struct bio *bio;
	struct deferred_io *next;
};

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1

struct mapped_device {
	struct rw_semaphore lock;
	atomic_t holders;

	unsigned long flags;

	request_queue_t *queue;
	struct gendisk *disk;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct deferred_io *deferred;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;

	/*
	 * Event handling.
	 */
	uint32_t event_nr;
	wait_queue_head_t eventq;
};

#define MIN_IOS 256
static kmem_cache_t *_io_cache;

static __init int local_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_io_cache = kmem_cache_create("dm_io",
				      sizeof(struct dm_io), 0, 0, NULL, NULL);
	if (!_io_cache)
		return -ENOMEM;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0) {
		kmem_cache_destroy(_io_cache);
		return r;
	}

	/* a major of 0 asks the block layer to pick one for us */
	if (!_major)
		_major = r;

	return 0;
}

static void local_exit(void)
{
	kmem_cache_destroy(_io_cache);

	if (unregister_blkdev(_major, _name) < 0)
		DMERR("unregister_blkdev failed");

	_major = 0;

	DMINFO("cleaned up");
}

/*
 * We have a lot of init/exit functions, so it seems easier to
 * store them in an array.  The disposable macro 'xx'
 * expands a prefix into a pair of function names.
 */
static struct {
	int (*init) (void);
	void (*exit) (void);
} _inits[] = {
#define xx(n) {n ## _init, n ## _exit},
	xx(local)
	xx(dm_target)
	xx(dm_linear)
	xx(dm_stripe)
	xx(dm_interface)
#undef xx
};
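
/*
 * For illustration: token pasting means an entry such as 'xx(local)'
 * above expands to the initialiser
 *
 *	{ local_init, local_exit },
 *
 * so dm_init() below can walk the array forwards on load, and dm_exit()
 * can tear the subsystems down in reverse order.
 */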

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i].init();
		if (r)
			goto bad;
	}

	return 0;

      bad:
	/* unwind whatever was initialised so far */
	while (i--)
		_inits[i].exit();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_inits);

	while (i--)
		_inits[i].exit();
}

/*
 * Block device functions.
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_get(md);
	return 0;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_put(md);
	return 0;
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static inline struct deferred_io *alloc_deferred(void)
{
	return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
}

static inline void free_deferred(struct deferred_io *di)
{
	kfree(di);
}

/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
	struct deferred_io *di;

	di = alloc_deferred();
	if (!di)
		return -ENOMEM;

	down_write(&md->lock);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		/* the device was resumed under us, submit normally */
		up_write(&md->lock);
		free_deferred(di);
		return 1;
	}

	di->bio = bio;
	di->next = md->deferred;
	md->deferred = di;

	up_write(&md->lock);
	return 0;		/* deferred successfully */
}
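
/*
 * A sketch of the data structure rather than extra behaviour:
 * md->deferred is a simple LIFO stack.  After deferring bios A then B
 * the list looks like
 *
 *	md->deferred -> B -> A -> NULL
 *
 * and flush_deferred_io() (below) resubmits from the head, i.e. in
 * reverse order of arrival.
 */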

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant solution is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static inline sector_t to_sector(unsigned int bytes)
{
	return bytes >> SECTOR_SHIFT;
}

static inline unsigned int to_bytes(sector_t sector)
{
	return sector << SECTOR_SHIFT;
}
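
/*
 * Worked example, assuming the usual SECTOR_SHIFT of 9 (512-byte
 * sectors): to_sector(4096) == 4096 >> 9 == 8, and to_bytes(8) ==
 * 8 << 9 == 4096.  Byte counts passed to to_sector() are expected to
 * be sector-aligned, since the shift silently discards the low bits.
 */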

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static inline void dec_pending(struct dm_io *io, int error)
{
	static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;

	/* record the error for the original bio */
	if (error) {
		spin_lock_irqsave(&_uptodate_lock, flags);
		io->error = error;
		spin_unlock_irqrestore(&_uptodate_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (atomic_dec_and_test(&io->md->pending))
			/* nudge anyone waiting on suspend queue */
			wake_up(&io->md->wait);

		bio_endio(io->bio, io->bio->bi_size, io->error);
		free_io(io->md, io);
	}
}

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
	struct dm_io *io = bio->bi_private;

	/* wait until the clone has fully completed */
	if (bio->bi_size)
		return 1;

	dec_pending(io, error);
	bio_put(bio);
	return 0;
}
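
/*
 * A rough walk-through of the io_count accounting (a sketch of the
 * mechanism, not extra behaviour), for a bio that __split_bio() below
 * splits into two clones:
 *
 *	alloc_io() + atomic_set:   io_count = 1   (extra reference)
 *	__map_bio() twice:         io_count = 3
 *	clone_endio() twice:       io_count = 1
 *	final dec_pending(io, 0):  io_count = 0 -> bio_endio() on the
 *	                           original bio
 *
 * The extra reference keeps the original bio from completing while
 * clones are still being created and mapped.
 */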

static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = dm_round_up(offset + 1, ti->split_io) - offset;

		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io)
{
	int r;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = io;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&io->io_count);
	r = ti->type->map(ti, clone);
	if (r > 0)
		/* the bio has been remapped so dispatch it */
		generic_make_request(clone);

	else if (r < 0) {
		/* error the io and bail out */
		dec_pending(io, -EIO);
		bio_put(clone);
	}
}

struct clone_info {
	struct mapped_device *md;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc(GFP_NOIO, 1);
	memcpy(clone->bi_io_vec, bv, sizeof(*bv));

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;

	return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len)
{
	struct bio *clone;

	clone = bio_clone(bio, GFP_NOIO);
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);

	return clone;
}

static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		__map_bio(ti, clone, ci->io);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		__map_bio(ti, clone, ci->io);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Create two copy bios to deal with io that has
		 * been split across a target.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;

		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset, max);
		__map_bio(ti, clone, ci->io);

		ci->sector += max;
		ci->sector_count -= max;
		ti = dm_table_find_target(ci->md->map, ci->sector);

		len = to_sector(bv->bv_len) - max;
		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset + to_bytes(max), len);
		__map_bio(ti, clone, ci->io);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx++;
	}
}
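
/*
 * Worked example with hypothetical numbers: take a bio of three 4k
 * bvecs, i.e. 8 sectors each, 24 sectors in total.
 *
 *   - max >= 24: the first case, a single clone_bio() covers the
 *     whole remainder.
 *   - max == 16: the second case, bvecs 0 and 1 (16 sectors) are
 *     cloned together and the next iteration starts at bvec 2.
 *   - max == 2 while an 8-sector bvec sits at ci->idx: the third
 *     case, split_bvec() sends the first 2 sectors to this target
 *     and the remaining 6 to the next one.
 */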

/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	atomic_inc(&md->pending);
	while (ci.sector_count)
		__clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, 0);
}

/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
	int r;
	struct mapped_device *md = q->queuedata;

	down_read(&md->lock);

	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_read(&md->lock);

		if (bio_rw(bio) == READA) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		}

		r = queue_io(md, bio);
		if (r < 0) {
			bio_io_error(bio, bio->bi_size);
			return 0;

		} else if (r == 0)
			return 0;	/* deferred successfully */

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
		 */
		down_read(&md->lock);
	}

	if (!md->map) {
		up_read(&md->lock);
		bio_io_error(bio, bio->bi_size);
		return 0;
	}

	__split_bio(md, bio);
	up_read(&md->lock);
	return 0;
}

/*-----------------------------------------------------------------
 * A bitset is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static spinlock_t _minor_lock = SPIN_LOCK_UNLOCKED;
static unsigned long _minor_bits[MAX_DEVICES / BITS_PER_LONG];

static void free_minor(unsigned int minor)
{
	spin_lock(&_minor_lock);
	clear_bit(minor, _minor_bits);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(unsigned int minor)
{
	int r = -EBUSY;

	if (minor >= MAX_DEVICES) {
		DMWARN("request for a mapped_device beyond MAX_DEVICES (%d)",
		       MAX_DEVICES);
		return -EINVAL;
	}

	spin_lock(&_minor_lock);
	if (!test_and_set_bit(minor, _minor_bits))
		r = 0;
	spin_unlock(&_minor_lock);

	return r;
}

static int next_free_minor(unsigned int *minor)
{
	int r = -EBUSY;
	unsigned int m;

	spin_lock(&_minor_lock);
	m = find_first_zero_bit(_minor_bits, MAX_DEVICES);
	if (m != MAX_DEVICES) {
		set_bit(m, _minor_bits);
		*minor = m;
		r = 0;
	}
	spin_unlock(&_minor_lock);

	return r;
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
{
	int r;
	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	/* get a minor number for the dev */
	r = persistent ? specific_minor(minor) : next_free_minor(&minor);
	if (r < 0) {
		kfree(md);
		return NULL;
	}

	memset(md, 0, sizeof(*md));
	init_rwsem(&md->lock);
	atomic_set(&md->holders, 1);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue) {
		free_minor(minor);
		kfree(md);
		return NULL;
	}

	md->queue->queuedata = md;
	blk_queue_make_request(md->queue, dm_request);

	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
				     mempool_free_slab, _io_cache);
	if (!md->io_pool) {
		free_minor(minor);
		blk_put_queue(md->queue);
		kfree(md);
		return NULL;
	}

	md->disk = alloc_disk(1);
	if (!md->disk) {
		mempool_destroy(md->io_pool);
		free_minor(minor);
		blk_put_queue(md->queue);
		kfree(md);
		return NULL;
	}

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	init_waitqueue_head(&md->eventq);

	return md;
}

static void free_dev(struct mapped_device *md)
{
	free_minor(md->disk->first_minor);
	mempool_destroy(md->io_pool);
	del_gendisk(md->disk);
	put_disk(md->disk);
	blk_put_queue(md->queue);
	kfree(md);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	struct mapped_device *md = (struct mapped_device *) context;

	down_write(&md->lock);
	md->event_nr++;
	wake_up_interruptible(&md->eventq);
	up_write(&md->lock);
}

static void __set_size(struct gendisk *disk, sector_t size)
{
	struct block_device *bdev;

	set_capacity(disk, size);
	bdev = bdget_disk(disk, 0);
	if (bdev) {
		down(&bdev->bd_inode->i_sem);
		i_size_write(bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
		up(&bdev->bd_inode->i_sem);
		bdput(bdev);
	}
}

static int __bind(struct mapped_device *md, struct dm_table *t)
{
	request_queue_t *q = md->queue;
	sector_t size;

	md->map = t;
	size = dm_table_get_size(t);
	__set_size(md->disk, size);
	if (size == 0)
		return 0;

	dm_table_event_callback(md->map, event_callback, md);

	dm_table_get(t);
	dm_table_set_restrictions(t, q);
	return 0;
}

static void __unbind(struct mapped_device *md)
{
	if (!md->map)
		return;

	dm_table_event_callback(md->map, NULL, NULL);
	dm_table_put(md->map);
	md->map = NULL;
}

/*
 * Constructor for a new device.
 */
static int create_aux(unsigned int minor, int persistent,
		      struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor, persistent);
	if (!md)
		return -ENXIO;

	*result = md;
	return 0;
}

int dm_create(struct mapped_device **result)
{
	return create_aux(0, 0, result);
}

int dm_create_with_minor(unsigned int minor, struct mapped_device **result)
{
	return create_aux(minor, 1, result);
}
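
/*
 * A minimal sketch of how a caller (in-tree, the dm-ioctl interface)
 * might drive the constructors together with the suspend/swap/resume
 * functions below.  'example_create_and_load' and its error handling
 * are hypothetical, and 'table' is assumed to be a fully built
 * struct dm_table:
 */
#if 0
static int example_create_and_load(struct dm_table *table,
				   struct mapped_device **result)
{
	struct mapped_device *md;
	int r;

	r = dm_create(&md);		/* grabs the first free minor */
	if (r)
		return r;

	r = dm_suspend(md);		/* block io, drain what's in flight */
	if (!r)
		r = dm_swap_table(md, table);	/* bind the new mapping */
	if (!r)
		r = dm_resume(md);	/* resubmit any deferred bios */

	if (r) {
		dm_put(md);		/* drops the last ref, frees md */
		return r;
	}

	*result = md;			/* caller keeps the reference */
	return 0;
}
#endif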

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

void dm_put(struct mapped_device *md)
{
	if (atomic_dec_and_test(&md->holders)) {
		if (!test_bit(DMF_SUSPENDED, &md->flags) && md->map)
			dm_table_suspend_targets(md->map);
		__unbind(md);
		free_dev(md);
	}
}

/*
 * Requeue the deferred bios by calling generic_make_request.
 */
static void flush_deferred_io(struct deferred_io *c)
{
	struct deferred_io *n;

	while (c) {
		n = c->next;
		generic_make_request(c->bio);
		free_deferred(c);
		c = n;
	}
}

/*
 * Swap in a new table (destroying the old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r;

	down_write(&md->lock);

	/* device must be suspended */
	if (!test_bit(DMF_SUSPENDED, &md->flags)) {
		up_write(&md->lock);
		return -EPERM;
	}

	__unbind(md);
	r = __bind(md, table);

	up_write(&md->lock);
	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md)
{
	DECLARE_WAITQUEUE(wait, current);

	down_write(&md->lock);

	/*
	 * First we set the BLOCK_IO flag so no more ios will be
	 * mapped.
	 */
	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
		/* already blocked: a suspend is in progress */
		up_write(&md->lock);
		return -EINVAL;
	}

	set_bit(DMF_BLOCK_IO, &md->flags);
	add_wait_queue(&md->wait, &wait);
	up_write(&md->lock);

	/*
	 * Then we wait for the already mapped ios to
	 * complete.
	 */
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (!atomic_read(&md->pending))
			break;

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	down_write(&md->lock);
	remove_wait_queue(&md->wait, &wait);
	set_bit(DMF_SUSPENDED, &md->flags);
	if (md->map)
		dm_table_suspend_targets(md->map);
	up_write(&md->lock);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	struct deferred_io *def;

	down_write(&md->lock);
	if (!md->map ||
	    !test_bit(DMF_SUSPENDED, &md->flags) ||
	    !dm_table_get_size(md->map)) {
		up_write(&md->lock);
		return -EINVAL;
	}

	dm_table_resume_targets(md->map);
	clear_bit(DMF_SUSPENDED, &md->flags);
	clear_bit(DMF_BLOCK_IO, &md->flags);

	def = md->deferred;
	md->deferred = NULL;
	up_write(&md->lock);

	flush_deferred_io(def);
	blk_run_queues();

	return 0;
}
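
/*
 * Putting the pieces together, the intended table-swap protocol (a
 * sketch of the sequence, not extra API) looks like:
 *
 *	dm_suspend(md)           BLOCK_IO set, in-flight io drained,
 *	                         SUSPENDED set
 *	dm_swap_table(md, t)     old table unbound, new one bound
 *	dm_resume(md)            SUSPENDED and BLOCK_IO cleared,
 *	                         deferred bios resubmitted
 *
 * Any bio that arrives between suspend and resume waits on the
 * md->deferred list.
 */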

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
uint32_t dm_get_event_nr(struct mapped_device *md)
{
	uint32_t r;

	down_read(&md->lock);
	r = md->event_nr;
	up_read(&md->lock);

	return r;
}

int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
		      uint32_t event_nr)
{
	down_write(&md->lock);

	/* the event counter has already moved on */
	if (event_nr != md->event_nr) {
		up_write(&md->lock);
		return 1;
	}

	add_wait_queue(&md->eventq, wq);
	up_write(&md->lock);

	return 0;
}
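
/*
 * A minimal sketch (simplified: no signal handling, and ignoring the
 * narrow wakeup race after the queue is armed) of how a caller might
 * wait for the event counter to move past a previously sampled value.
 * 'example_wait_for_event' is hypothetical, not part of this file:
 */
#if 0
static int example_wait_for_event(struct mapped_device *md)
{
	DECLARE_WAITQUEUE(wq, current);
	uint32_t ev = dm_get_event_nr(md);

	if (dm_add_wait_queue(md, &wq, ev))
		return 0;	/* an event has already occurred */

	/* still at 'ev': sleep until event_callback() wakes us */
	set_current_state(TASK_INTERRUPTIBLE);
	schedule();
	set_current_state(TASK_RUNNING);
	dm_remove_wait_queue(md, &wq);

	return 0;
}
#endif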

void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
{
	down_write(&md->lock);
	remove_wait_queue(&md->eventq, wq);
	up_write(&md->lock);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	down_read(&md->lock);
	t = md->map;
	if (t)
		dm_table_get(t);
	up_read(&md->lock);

	return t;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.owner = THIS_MODULE
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
MODULE_LICENSE("GPL");