dm thin: fix table output when pool target disables discard passdown internally
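
The diff below brings in thin-pool discard support, governed by three
pool features (zero_new_blocks, discard_enabled, discard_passdown),
together with the table-output fix named above: bind_control_target()
may clear pool->pf.discard_passdown internally when the data device
does not support discards, so pool_status() must report
no_discard_passdown from the requested configuration (pt->pf) rather
than the in-core pool->pf; otherwise the emitted table line would not
match what the user loaded.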
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 2d9e755..eb3d138 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -279,8 +279,10 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
 
        hlist_del(&cell->list);
 
-       bio_list_add(inmates, cell->holder);
-       bio_list_merge(inmates, &cell->bios);
+       if (inmates) {
+               bio_list_add(inmates, cell->holder);
+               bio_list_merge(inmates, &cell->bios);
+       }
 
        mempool_free(cell, prison->cell_pool);
 }
@@ -303,9 +305,10 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
  */
 static void __cell_release_singleton(struct cell *cell, struct bio *bio)
 {
-       hlist_del(&cell->list);
        BUG_ON(cell->holder != bio);
        BUG_ON(!bio_list_empty(&cell->bios));
+
+       __cell_release(cell, NULL);
 }
 
 static void cell_release_singleton(struct cell *cell, struct bio *bio)
@@ -489,6 +492,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
  * devices.
  */
 struct new_mapping;
+
+struct pool_features {
+       unsigned zero_new_blocks:1;
+       unsigned discard_enabled:1;
+       unsigned discard_passdown:1;
+};
+
 struct pool {
        struct list_head list;
        struct dm_target *ti;   /* Only set if a pool target is bound */
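
The three pool_features bits map one-to-one onto the optional feature
words accepted on the pool table line (see pool_features_init() and
parse_pool_features() further down).  A minimal userspace model of the
defaults and the parsing, for illustration only (the helper names here
are made up):

  #include <strings.h>

  struct pool_features {
  	unsigned zero_new_blocks:1;
  	unsigned discard_enabled:1;
  	unsigned discard_passdown:1;
  };

  /* Defaults: everything enabled, mirroring pool_features_init(). */
  static void features_init(struct pool_features *pf)
  {
  	pf->zero_new_blocks = 1;
  	pf->discard_enabled = 1;
  	pf->discard_passdown = 1;
  }

  /* Each feature word switches one bit off; unknown words are errors. */
  static int parse_feature(struct pool_features *pf, const char *arg)
  {
  	if (!strcasecmp(arg, "skip_block_zeroing"))
  		pf->zero_new_blocks = 0;
  	else if (!strcasecmp(arg, "ignore_discard"))
  		pf->discard_enabled = 0;
  	else if (!strcasecmp(arg, "no_discard_passdown"))
  		pf->discard_passdown = 0;
  	else
  		return -1;
  	return 0;
  }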
@@ -502,7 +512,7 @@ struct pool {
        dm_block_t offset_mask;
        dm_block_t low_water_blocks;
 
-       unsigned zero_new_blocks:1;
+       struct pool_features pf;
        unsigned low_water_triggered:1; /* A dm event has been sent */
        unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */
 
@@ -520,10 +530,12 @@ struct pool {
        struct bio_list deferred_bios;
        struct bio_list deferred_flush_bios;
        struct list_head prepared_mappings;
+       struct list_head prepared_discards;
 
        struct bio_list retry_on_resume_list;
 
-       struct deferred_set ds; /* FIXME: move to thin_c */
+       struct deferred_set shared_read_ds;
+       struct deferred_set all_io_ds;
 
        struct new_mapping *next_mapping;
        mempool_t *mapping_pool;
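
The old single deferred set is split because discards need a different
quiescing domain from shared reads.  As used in the hunks below:

  /*
   * shared_read_ds: reads of shared blocks join via ds_inc() in
   *	process_shared_bio() and leave via ds_dec() in thin_endio();
   *	schedule_copy() queues its new_mapping with ds_add_work() so
   *	the copy only commits once those reads drain (m->quiesced).
   *
   * all_io_ds: every non-discard bio joins via ds_inc() in
   *	thin_hook_bio(); process_discard() queues its new_mapping
   *	with ds_add_work() so a mapping is only removed once all
   *	in-flight IO to the block has completed.
   */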
@@ -541,7 +553,7 @@ struct pool_c {
        struct dm_target_callbacks callbacks;
 
        dm_block_t low_water_blocks;
-       unsigned zero_new_blocks:1;
+       struct pool_features pf;
 };
 
 /*
@@ -618,6 +630,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
 
 /*----------------------------------------------------------------*/
 
+struct endio_hook {
+       struct thin_c *tc;
+       struct deferred_entry *shared_read_entry;
+       struct deferred_entry *all_io_entry;
+       struct new_mapping *overwrite_mapping;
+};
+
 static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 {
        struct bio *bio;
@@ -628,7 +647,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
        bio_list_init(master);
 
        while ((bio = bio_list_pop(&bios))) {
-               if (dm_get_mapinfo(bio)->ptr == tc)
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               if (h->tc == tc)
                        bio_endio(bio, DM_ENDIO_REQUEUE);
                else
                        bio_list_add(master, bio);
@@ -716,21 +736,17 @@ static void wake_worker(struct pool *pool)
 /*
  * Bio endio functions.
  */
-struct endio_hook {
-       struct thin_c *tc;
-       bio_end_io_t *saved_bi_end_io;
-       struct deferred_entry *entry;
-};
-
 struct new_mapping {
        struct list_head list;
 
-       int prepared;
+       unsigned quiesced:1;
+       unsigned prepared:1;
+       unsigned pass_discard:1;
 
        struct thin_c *tc;
        dm_block_t virt_block;
        dm_block_t data_block;
-       struct cell *cell;
+       struct cell *cell, *cell2;
        int err;
 
        /*
@@ -747,7 +763,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
 {
        struct pool *pool = m->tc->pool;
 
-       if (list_empty(&m->list) && m->prepared) {
+       if (m->quiesced && m->prepared) {
                list_add(&m->list, &pool->prepared_mappings);
                wake_worker(pool);
        }
@@ -770,7 +786,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
 static void overwrite_endio(struct bio *bio, int err)
 {
        unsigned long flags;
-       struct new_mapping *m = dm_get_mapinfo(bio)->ptr;
+       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+       struct new_mapping *m = h->overwrite_mapping;
        struct pool *pool = m->tc->pool;
 
        m->err = err;
@@ -781,31 +798,6 @@ static void overwrite_endio(struct bio *bio, int err)
        spin_unlock_irqrestore(&pool->lock, flags);
 }
 
-static void shared_read_endio(struct bio *bio, int err)
-{
-       struct list_head mappings;
-       struct new_mapping *m, *tmp;
-       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
-       unsigned long flags;
-       struct pool *pool = h->tc->pool;
-
-       bio->bi_end_io = h->saved_bi_end_io;
-       bio_endio(bio, err);
-
-       INIT_LIST_HEAD(&mappings);
-       ds_dec(h->entry, &mappings);
-
-       spin_lock_irqsave(&pool->lock, flags);
-       list_for_each_entry_safe(m, tmp, &mappings, list) {
-               list_del(&m->list);
-               INIT_LIST_HEAD(&m->list);
-               __maybe_add_mapping(m);
-       }
-       spin_unlock_irqrestore(&pool->lock, flags);
-
-       mempool_free(h, pool->endio_hook_pool);
-}
-
 /*----------------------------------------------------------------*/
 
 /*
@@ -894,7 +886,30 @@ static void process_prepared_mapping(struct new_mapping *m)
        mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_mappings(struct pool *pool)
+static void process_prepared_discard(struct new_mapping *m)
+{
+       int r;
+       struct thin_c *tc = m->tc;
+
+       r = dm_thin_remove_block(tc->td, m->virt_block);
+       if (r)
+               DMERR("dm_thin_remove_block() failed");
+
+       /*
+        * Pass the discard down to the underlying device?
+        */
+       if (m->pass_discard)
+               remap_and_issue(tc, m->bio, m->data_block);
+       else
+               bio_endio(m->bio, 0);
+
+       cell_defer_except(tc, m->cell);
+       cell_defer_except(tc, m->cell2);
+       mempool_free(m, tc->pool->mapping_pool);
+}
+
+static void process_prepared(struct pool *pool, struct list_head *head,
+                            void (*fn)(struct new_mapping *))
 {
        unsigned long flags;
        struct list_head maps;
@@ -902,21 +917,27 @@ static void process_prepared_mappings(struct pool *pool)
 
        INIT_LIST_HEAD(&maps);
        spin_lock_irqsave(&pool->lock, flags);
-       list_splice_init(&pool->prepared_mappings, &maps);
+       list_splice_init(head, &maps);
        spin_unlock_irqrestore(&pool->lock, flags);
 
        list_for_each_entry_safe(m, tmp, &maps, list)
-               process_prepared_mapping(m);
+               fn(m);
 }
 
 /*
  * Deferred bio jobs.
  */
-static int io_overwrites_block(struct pool *pool, struct bio *bio)
+static int io_overlaps_block(struct pool *pool, struct bio *bio)
 {
-       return ((bio_data_dir(bio) == WRITE) &&
-               !(bio->bi_sector & pool->offset_mask)) &&
+       return !(bio->bi_sector & pool->offset_mask) &&
                (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
+}
+
+
+static int io_overwrites_block(struct pool *pool, struct bio *bio)
+{
+       return (bio_data_dir(bio) == WRITE) &&
+               io_overlaps_block(pool, bio);
 }
 
 static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
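
Factoring io_overlaps_block() out of io_overwrites_block() lets the
discard path reuse the alignment test without the WRITE check.  A
standalone sketch of the arithmetic, assuming 128-sector (64KiB)
blocks:

  #include <assert.h>
  #include <stdint.h>

  #define SECTOR_SHIFT 9

  int main(void)
  {
  	uint64_t sectors_per_block = 128;
  	uint64_t offset_mask = sectors_per_block - 1;

  	/* A bio at sector 256 covering one whole block... */
  	uint64_t bi_sector = 256;
  	uint64_t bi_size = sectors_per_block << SECTOR_SHIFT;

  	/* ...passes both halves of the io_overlaps_block() test. */
  	assert(!(bi_sector & offset_mask));
  	assert(bi_size == sectors_per_block << SECTOR_SHIFT);

  	/* A bio at sector 300 is not block-aligned. */
  	assert(300 & offset_mask);
  	return 0;
  }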
@@ -957,6 +978,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        struct new_mapping *m = get_next_mapping(pool);
 
        INIT_LIST_HEAD(&m->list);
+       m->quiesced = 0;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
@@ -965,7 +987,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
        m->err = 0;
        m->bio = NULL;
 
-       ds_add_work(&pool->ds, &m->list);
+       if (!ds_add_work(&pool->shared_read_ds, &m->list))
+               m->quiesced = 1;
 
        /*
         * IO to pool_dev remaps to the pool target's data_dev.
@@ -974,9 +997,10 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
         * bio immediately. Otherwise we use kcopyd to clone the data first.
         */
        if (io_overwrites_block(pool, bio)) {
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-               dm_get_mapinfo(bio)->ptr = m;
                remap_and_issue(tc, bio, data_dest);
        } else {
                struct dm_io_region from, to;
@@ -1023,6 +1047,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
        struct new_mapping *m = get_next_mapping(pool);
 
        INIT_LIST_HEAD(&m->list);
+       m->quiesced = 1;
        m->prepared = 0;
        m->tc = tc;
        m->virt_block = virt_block;
@@ -1036,13 +1061,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
         * zeroing pre-existing data, we can issue the bio immediately.
         * Otherwise we use kcopyd to zero the data first.
         */
-       if (!pool->zero_new_blocks)
+       if (!pool->pf.zero_new_blocks)
                process_prepared_mapping(m);
 
        else if (io_overwrites_block(pool, bio)) {
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               h->overwrite_mapping = m;
                m->bio = bio;
                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
-               dm_get_mapinfo(bio)->ptr = m;
                remap_and_issue(tc, bio, data_block);
 
        } else {
@@ -1129,7 +1155,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
  */
 static void retry_on_resume(struct bio *bio)
 {
-       struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+       struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+       struct thin_c *tc = h->tc;
        struct pool *pool = tc->pool;
        unsigned long flags;
 
@@ -1150,6 +1177,89 @@ static void no_space(struct cell *cell)
                retry_on_resume(bio);
 }
 
+static void process_discard(struct thin_c *tc, struct bio *bio)
+{
+       int r;
+       unsigned long flags;
+       struct pool *pool = tc->pool;
+       struct cell *cell, *cell2;
+       struct cell_key key, key2;
+       dm_block_t block = get_bio_block(tc, bio);
+       struct dm_thin_lookup_result lookup_result;
+       struct new_mapping *m;
+
+       build_virtual_key(tc->td, block, &key);
+       if (bio_detain(tc->pool->prison, &key, bio, &cell))
+               return;
+
+       r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
+       switch (r) {
+       case 0:
+               /*
+                * Check nobody is fiddling with this pool block.  This can
+                * happen if someone's in the process of breaking sharing
+                * on this block.
+                */
+               build_data_key(tc->td, lookup_result.block, &key2);
+               if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
+                       cell_release_singleton(cell, bio);
+                       break;
+               }
+
+               if (io_overlaps_block(pool, bio)) {
+                       /*
+                        * IO may still be going to the destination block.  We must
+                        * quiesce before we can do the removal.
+                        */
+                       m = get_next_mapping(pool);
+                       m->tc = tc;
+                       m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
+                       m->virt_block = block;
+                       m->data_block = lookup_result.block;
+                       m->cell = cell;
+                       m->cell2 = cell2;
+                       m->err = 0;
+                       m->bio = bio;
+
+                       if (!ds_add_work(&pool->all_io_ds, &m->list)) {
+                               spin_lock_irqsave(&pool->lock, flags);
+                               list_add(&m->list, &pool->prepared_discards);
+                               spin_unlock_irqrestore(&pool->lock, flags);
+                               wake_worker(pool);
+                       }
+               } else {
+                       /*
+                        * This path is hit if people are ignoring
+                        * limits->discard_granularity.  It ignores any
+                        * part of the discard that is in a subsequent
+                        * block.
+                        */
+                       sector_t offset = bio->bi_sector - (block << pool->block_shift);
+                       unsigned remaining = (pool->sectors_per_block - offset) << 9;
+                       bio->bi_size = min(bio->bi_size, remaining);
+
+                       cell_release_singleton(cell, bio);
+                       cell_release_singleton(cell2, bio);
+                       remap_and_issue(tc, bio, lookup_result.block);
+               }
+               break;
+
+       case -ENODATA:
+               /*
+                * It isn't provisioned, just forget it.
+                */
+               cell_release_singleton(cell, bio);
+               bio_endio(bio, 0);
+               break;
+
+       default:
+               DMERR("discard: find block unexpectedly returned %d", r);
+               cell_release_singleton(cell, bio);
+               bio_io_error(bio);
+               break;
+       }
+}
+
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
                          struct cell_key *key,
                          struct dm_thin_lookup_result *lookup_result,
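
The else-branch in process_discard() above trims a misaligned discard
to the end of the block it starts in.  A worked example of that
arithmetic, assuming 128-sector blocks (block_shift = 7):

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
  	unsigned block_shift = 7;
  	uint64_t sectors_per_block = 1ull << block_shift;

  	uint64_t bi_sector = 300;	/* starts mid-block */
  	uint64_t bi_size = 1 << 20;	/* 1MiB discard */

  	uint64_t block = bi_sector >> block_shift;		/* 2 */
  	uint64_t offset = bi_sector - (block << block_shift);	/* 44 */
  	uint64_t remaining = (sectors_per_block - offset) << 9;

  	if (bi_size > remaining)
  		bi_size = remaining;	/* 84 sectors = 43008 bytes */

  	printf("trimmed: block %llu, %llu bytes\n",
  	       (unsigned long long)block, (unsigned long long)bi_size);
  	return 0;
  }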
@@ -1195,13 +1305,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
        if (bio_data_dir(bio) == WRITE)
                break_sharing(tc, bio, block, &key, lookup_result, cell);
        else {
-               struct endio_hook *h;
-               h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
 
-               h->tc = tc;
-               h->entry = ds_inc(&pool->ds);
-               save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
-               dm_get_mapinfo(bio)->ptr = h;
+               h->shared_read_entry = ds_inc(&pool->shared_read_ds);
 
                cell_release_singleton(cell, bio);
                remap_and_issue(tc, bio, lookup_result->block);
@@ -1299,6 +1405,7 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
 
        default:
                DMERR("dm_thin_find_block() failed, error = %d", r);
+               cell_release_singleton(cell, bio);
                bio_io_error(bio);
                break;
        }
@@ -1325,7 +1432,9 @@ static void process_deferred_bios(struct pool *pool)
        spin_unlock_irqrestore(&pool->lock, flags);
 
        while ((bio = bio_list_pop(&bios))) {
-               struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+               struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
+               struct thin_c *tc = h->tc;
+
                /*
                 * If we've got no free new_mapping structs, and processing
                 * this bio might require one, we pause until there are some
@@ -1338,7 +1447,11 @@ static void process_deferred_bios(struct pool *pool)
 
                        break;
                }
-               process_bio(tc, bio);
+
+               if (bio->bi_rw & REQ_DISCARD)
+                       process_discard(tc, bio);
+               else
+                       process_bio(tc, bio);
        }
 
        /*
@@ -1372,7 +1485,8 @@ static void do_worker(struct work_struct *ws)
 {
        struct pool *pool = container_of(ws, struct pool, worker);
 
-       process_prepared_mappings(pool);
+       process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
+       process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
        process_deferred_bios(pool);
 }
 
@@ -1408,6 +1522,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
        wake_worker(pool);
 }
 
+static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
+{
+       struct pool *pool = tc->pool;
+       struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
+
+       h->tc = tc;
+       h->shared_read_entry = NULL;
+       h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
+       h->overwrite_mapping = NULL;
+
+       return h;
+}
+
 /*
  * Non-blocking function called from the thin target's map function.
  */
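
With thin_hook_bio() every bio now carries an endio_hook for its whole
lifetime, replacing the old per-read save_and_set_endio() trick.
Roughly (the hook is freed in thin_endio(), added further down):

  /*
   * thin_map()   -> map_context->ptr = thin_hook_bio(tc, bio)
   * ...bio is remapped, possibly deferred, then completes...
   * thin_endio() -> ds_dec() any shared_read/all_io entries, then
   *		    mempool_free(h, pool->endio_hook_pool)
   */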
@@ -1420,12 +1547,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
        struct dm_thin_device *td = tc->td;
        struct dm_thin_lookup_result result;
 
-       /*
-        * Save the thin context for easy access from the deferred bio later.
-        */
-       map_context->ptr = tc;
-
-       if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+       map_context->ptr = thin_hook_bio(tc, bio);
+       if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
                thin_defer_bio(tc, bio);
                return DM_MAPIO_SUBMITTED;
        }
@@ -1507,7 +1630,22 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
 
        pool->ti = ti;
        pool->low_water_blocks = pt->low_water_blocks;
-       pool->zero_new_blocks = pt->zero_new_blocks;
+       pool->pf = pt->pf;
+
+       /*
+        * If discard_passdown was enabled verify that the data device
+        * supports discards.  Disable discard_passdown if not; otherwise
+        * -EOPNOTSUPP will be returned.
+        */
+       if (pt->pf.discard_passdown) {
+               struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
+               if (!q || !blk_queue_discard(q)) {
+                       char buf[BDEVNAME_SIZE];
+                       DMWARN("Discard unsupported by data device (%s): Disabling discard passdown.",
+                              bdevname(pt->data_dev->bdev, buf));
+                       pool->pf.discard_passdown = 0;
+               }
+       }
 
        return 0;
 }
@@ -1521,6 +1659,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
 /*----------------------------------------------------------------
  * Pool creation
  *--------------------------------------------------------------*/
+/* Initialize pool features. */
+static void pool_features_init(struct pool_features *pf)
+{
+       pf->zero_new_blocks = 1;
+       pf->discard_enabled = 1;
+       pf->discard_passdown = 1;
+}
+
 static void __pool_destroy(struct pool *pool)
 {
        __pool_table_remove(pool);
@@ -1568,7 +1714,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
        pool->block_shift = ffs(block_size) - 1;
        pool->offset_mask = block_size - 1;
        pool->low_water_blocks = 0;
-       pool->zero_new_blocks = 1;
+       pool_features_init(&pool->pf);
        pool->prison = prison_create(PRISON_CELLS);
        if (!pool->prison) {
                *error = "Error creating pool's bio prison";
@@ -1601,10 +1747,12 @@ static struct pool *pool_create(struct mapped_device *pool_md,
        bio_list_init(&pool->deferred_bios);
        bio_list_init(&pool->deferred_flush_bios);
        INIT_LIST_HEAD(&pool->prepared_mappings);
+       INIT_LIST_HEAD(&pool->prepared_discards);
        pool->low_water_triggered = 0;
        pool->no_free_space = 0;
        bio_list_init(&pool->retry_on_resume_list);
-       ds_init(&pool->ds);
+       ds_init(&pool->shared_read_ds);
+       ds_init(&pool->all_io_ds);
 
        pool->next_mapping = NULL;
        pool->mapping_pool =
@@ -1663,7 +1811,8 @@ static void __pool_dec(struct pool *pool)
 
 static struct pool *__pool_find(struct mapped_device *pool_md,
                                struct block_device *metadata_dev,
-                               unsigned long block_size, char **error)
+                               unsigned long block_size, char **error,
+                               int *created)
 {
        struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
 
@@ -1679,8 +1828,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
                                return ERR_PTR(-EINVAL);
                        __pool_inc(pool);
 
-               } else
+               } else {
                        pool = pool_create(pool_md, metadata_dev, block_size, error);
+                       *created = 1;
+               }
        }
 
        return pool;
@@ -1704,10 +1855,6 @@ static void pool_dtr(struct dm_target *ti)
        mutex_unlock(&dm_thin_pool_table.mutex);
 }
 
-struct pool_features {
-       unsigned zero_new_blocks:1;
-};
-
 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
                               struct dm_target *ti)
 {
@@ -1716,7 +1863,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
        const char *arg_name;
 
        static struct dm_arg _args[] = {
-               {0, 1, "Invalid number of pool feature arguments"},
+               {0, 3, "Invalid number of pool feature arguments"},
        };
 
        /*
@@ -1736,6 +1883,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
                if (!strcasecmp(arg_name, "skip_block_zeroing")) {
                        pf->zero_new_blocks = 0;
                        continue;
+               } else if (!strcasecmp(arg_name, "ignore_discard")) {
+                       pf->discard_enabled = 0;
+                       continue;
+               } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
+                       pf->discard_passdown = 0;
+                       continue;
                }
 
                ti->error = "Unrecognised pool feature requested";
@@ -1753,10 +1906,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
  *
  * Optional feature arguments are:
  *          skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ *          ignore_discard: disable discard
+ *          no_discard_passdown: don't pass discards down to the data device
  */
 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
-       int r;
+       int r, pool_created = 0;
        struct pool_c *pt;
        struct pool *pool;
        struct pool_features pf;
@@ -1816,8 +1971,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        /*
         * Set default pool features.
         */
-       memset(&pf, 0, sizeof(pf));
-       pf.zero_new_blocks = 1;
+       pool_features_init(&pf);
 
        dm_consume_args(&as, 4);
        r = parse_pool_features(&as, &pf, ti);
@@ -1831,20 +1985,45 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
        }
 
        pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
-                          block_size, &ti->error);
+                          block_size, &ti->error, &pool_created);
        if (IS_ERR(pool)) {
                r = PTR_ERR(pool);
                goto out_free_pt;
        }
 
+       /*
+        * 'pool_created' reflects whether this is the first table load.
+        * Top level discard support is not allowed to be changed after
+        * initial load.  This would require a pool reload to trigger thin
+        * device changes.
+        */
+       if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
+               ti->error = "Discard support cannot be disabled once enabled";
+               r = -EINVAL;
+               goto out_flags_changed;
+       }
+
        pt->pool = pool;
        pt->ti = ti;
        pt->metadata_dev = metadata_dev;
        pt->data_dev = data_dev;
        pt->low_water_blocks = low_water_blocks;
-       pt->zero_new_blocks = pf.zero_new_blocks;
+       pt->pf = pf;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
+       /*
+        * Only need to enable discards if the pool should pass
+        * them down to the data device.  The thin device's discard
+        * processing will cause mappings to be removed from the btree.
+        */
+       if (pf.discard_enabled && pf.discard_passdown) {
+               ti->num_discard_requests = 1;
+               /*
+                * Setting 'discards_supported' circumvents the normal
+                * stacking of discard limits (this keeps the pool and
+                * thin devices' discard limits consistent).
+                */
+               ti->discards_supported = 1;
+       }
        ti->private = pt;
 
        pt->callbacks.congested_fn = pool_is_congested;
@@ -1854,6 +2033,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
        return 0;
 
+out_flags_changed:
+       __pool_dec(pool);
 out_free_pt:
        kfree(pt);
 out:
@@ -2142,7 +2323,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 static int pool_status(struct dm_target *ti, status_type_t type,
                       char *result, unsigned maxlen)
 {
-       int r;
+       int r, count;
        unsigned sz = 0;
        uint64_t transaction_id;
        dm_block_t nr_free_blocks_data;
@@ -2205,10 +2386,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
                       (unsigned long)pool->sectors_per_block,
                       (unsigned long long)pt->low_water_blocks);
 
-               DMEMIT("%u ", !pool->zero_new_blocks);
+               count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
+                       !pt->pf.discard_passdown;
+               DMEMIT("%u ", count);
 
-               if (!pool->zero_new_blocks)
+               if (!pool->pf.zero_new_blocks)
                        DMEMIT("skip_block_zeroing ");
+
+               if (!pool->pf.discard_enabled)
+                       DMEMIT("ignore_discard ");
+
+               if (!pt->pf.discard_passdown)
+                       DMEMIT("no_discard_passdown ");
+
                break;
        }
 
@@ -2237,6 +2427,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
+{
+       /*
+        * FIXME: these limits may be incompatible with the pool's data device
+        */
+       limits->max_discard_sectors = pool->sectors_per_block;
+
+       /*
+        * This is just a hint, and not enforced.  We have to cope with
+        * bios that overlap 2 blocks.
+        */
+       limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+       limits->discard_zeroes_data = pool->pf.zero_new_blocks;
+}
+
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct pool_c *pt = ti->private;
@@ -2244,13 +2449,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
        blk_limits_io_min(limits, 0);
        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       if (pool->pf.discard_enabled)
+               set_discard_limits(pool, limits);
 }
 
 static struct target_type pool_target = {
        .name = "thin-pool",
        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
                    DM_TARGET_IMMUTABLE,
-       .version = {1, 0, 0},
+       .version = {1, 1, 0},
        .module = THIS_MODULE,
        .ctr = pool_ctr,
        .dtr = pool_dtr,
@@ -2292,6 +2499,9 @@ static void thin_dtr(struct dm_target *ti)
  * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
  * dev_id: the internal device identifier
  * origin_dev: a device external to the pool that should act as the origin
+ *
+ * If the pool device has discards disabled, they get disabled for the thin
+ * device as well.
  */
 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -2360,8 +2570,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
        ti->split_io = tc->pool->sectors_per_block;
        ti->num_flush_requests = 1;
-       ti->num_discard_requests = 0;
-       ti->discards_supported = 0;
+
+       /* In case the pool supports discards, pass them on. */
+       if (tc->pool->pf.discard_enabled) {
+               ti->discards_supported = 1;
+               ti->num_discard_requests = 1;
+       }
 
        dm_put(pool_md);
 
@@ -2389,11 +2603,48 @@ out_unlock:
 static int thin_map(struct dm_target *ti, struct bio *bio,
                    union map_info *map_context)
 {
-       bio->bi_sector -= ti->begin;
+       bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
 
        return thin_bio_map(ti, bio, map_context);
 }
 
+static int thin_endio(struct dm_target *ti,
+                     struct bio *bio, int err,
+                     union map_info *map_context)
+{
+       unsigned long flags;
+       struct endio_hook *h = map_context->ptr;
+       struct list_head work;
+       struct new_mapping *m, *tmp;
+       struct pool *pool = h->tc->pool;
+
+       if (h->shared_read_entry) {
+               INIT_LIST_HEAD(&work);
+               ds_dec(h->shared_read_entry, &work);
+
+               spin_lock_irqsave(&pool->lock, flags);
+               list_for_each_entry_safe(m, tmp, &work, list) {
+                       list_del(&m->list);
+                       m->quiesced = 1;
+                       __maybe_add_mapping(m);
+               }
+               spin_unlock_irqrestore(&pool->lock, flags);
+       }
+
+       if (h->all_io_entry) {
+               INIT_LIST_HEAD(&work);
+               ds_dec(h->all_io_entry, &work);
+               spin_lock_irqsave(&pool->lock, flags);
+               list_for_each_entry_safe(m, tmp, &work, list)
+                       list_add(&m->list, &pool->prepared_discards);
+               spin_unlock_irqrestore(&pool->lock, flags);
+       }
+
+       mempool_free(h, pool->endio_hook_pool);
+
+       return 0;
+}
+
 static void thin_postsuspend(struct dm_target *ti)
 {
        if (dm_noflush_suspending(ti))
@@ -2469,9 +2720,11 @@ static int thin_iterate_devices(struct dm_target *ti,
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct thin_c *tc = ti->private;
+       struct pool *pool = tc->pool;
 
        blk_limits_io_min(limits, 0);
-       blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT);
+       blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
+       set_discard_limits(pool, limits);
 }
 
 static struct target_type thin_target = {
@@ -2481,6 +2734,7 @@ static struct target_type thin_target = {
        .ctr = thin_ctr,
        .dtr = thin_dtr,
        .map = thin_map,
+       .end_io = thin_endio,
        .postsuspend = thin_postsuspend,
        .status = thin_status,
        .iterate_devices = thin_iterate_devices,
@@ -2515,6 +2769,6 @@ static void dm_thin_exit(void)
 module_init(dm_thin_init);
 module_exit(dm_thin_exit);
 
-MODULE_DESCRIPTION(DM_NAME "device-mapper thin provisioning target");
+MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");