/*
 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
 *
 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
 *
 * This file is released under the GPL.
 *
 *
 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
 *
 * Supports:
 *	o RAID4 with dedicated and selectable parity device
 *	o RAID5 with rotating parity (left+right, symmetric+asymmetric)
 *	o recovery of out of sync device for initial
 *	  RAID set creation or after dead drive replacement
 *	o run time optimization of xor algorithm used to calculate parity
 *
 *
 * Thanks to MD for:
 *	o the raid address calculation algorithm
 *	o the base of the biovec <-> page list copier.
 *
 *
 * Uses region hash to keep track of how many writes are in flight to
 * regions in order to use dirty log to keep state of regions to recover:
 *
 *	o clean regions (those which are synchronized
 *	  and don't have write io in flight)
 *	o dirty regions (those with write io in flight)
 *
 *
 * On startup, any dirty regions are migrated to the
 * 'nosync' state and are subject to recovery by the daemon.
 *
 * See raid_ctr() for table definition.
 *
 * FIXME: recovery bandwidth
 */

static const char *version = "v0.2594b";
#include "dm.h"
#include "dm-memcache.h"
#include "dm-message.h"
#include "dm-raid45.h"

#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/raid/xor.h>

#include <linux/bio.h>
#include <linux/dm-io.h>
#include <linux/dm-dirty-log.h>
#include "dm-region-hash.h"

#include <linux/slab.h>
/*
 * Configurable parameters
 */

/* Minimum/maximum and default # of selectable stripes. */
#define	STRIPES_MIN		8
#define	STRIPES_MAX		16384
#define	STRIPES_DEFAULT		80

/* Minimum/maximum and default chunk size in sectors if not set in constructor. */
#define	CHUNK_SIZE_MIN		8
#define	CHUNK_SIZE_MAX		16384
#define	CHUNK_SIZE_DEFAULT	64

/* Minimum/default io size in sectors if not set in constructor. */
#define	IO_SIZE_MIN		CHUNK_SIZE_MIN
#define	IO_SIZE_DEFAULT		IO_SIZE_MIN

/* Minimum/default recovery io size in sectors. */
#define	RECOVER_IO_SIZE_MIN	64
#define	RECOVER_IO_SIZE_DEFAULT	256

/* Default, minimum and maximum percentage of recovery io bandwidth. */
#define	BANDWIDTH_DEFAULT	10
#define	BANDWIDTH_MIN		1
#define	BANDWIDTH_MAX		100

/* # of parallel recovered regions */
#define	RECOVERY_STRIPES_MIN	1
#define	RECOVERY_STRIPES_MAX	64
#define	RECOVERY_STRIPES_DEFAULT	RECOVERY_STRIPES_MIN

/*
 * END Configurable parameters
 */
#define	TARGET		"dm-raid45"
#define	DAEMON		"kraid45d"
#define	DM_MSG_PREFIX	TARGET

#define	SECTORS_PER_PAGE	(PAGE_SIZE >> SECTOR_SHIFT)

/* Amount/size for __xor(). */
#define	XOR_SIZE	PAGE_SIZE

/* Check value in range. */
#define	range_ok(i, min, max)	((i) >= (min) && (i) <= (max))

/* Check argument is power of 2. */
#define	POWER_OF_2(a)	(!((a) & ((a) - 1)))
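/*
 * Example (values illustrative only):
 *
 *	range_ok(64, CHUNK_SIZE_MIN, CHUNK_SIZE_MAX)	-> 1
 *	POWER_OF_2(64)	-> 1	(64 == 2^6)
 *	POWER_OF_2(80)	-> 0	(80 & 79 == 64, i.e. != 0)
 *
 * Note that POWER_OF_2(0) also evaluates to 1, so a zero value has to
 * be rejected separately (e.g. via range_ok()).
 */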
/* Structure access macros. */
/* Derive raid_set from stripe_cache pointer. */
#define	RS(x)	container_of(x, struct raid_set, sc)

/* Page reference. */
#define	PAGE(stripe, p)	((stripe)->obj[p].pl->page)

/* Stripe chunk reference. */
#define	CHUNK(stripe, p) ((stripe)->chunk + (p))

/* Bio list reference. */
#define	BL(stripe, p, rw)	((stripe)->chunk[p].bl + (rw))
#define	BL_CHUNK(chunk, rw)	((chunk)->bl + (rw))

/* Page list reference. */
#define	PL(stripe, p)	((stripe)->obj[p].pl)
/* END: structure access macros. */
/* Factor out to dm-bio-list.h */
static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = bl->head;
	bl->head = bio;

	if (!bl->tail)
		bl->tail = bio;
}

/* Factor out to dm.h */
#define TI_ERR_RET(str, ret) \
	do { ti->error = str; return ret; } while (0)
#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
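/*
 * Usage sketch for the TI_ERR* macros (hypothetical call site; needs a
 * "struct dm_target *ti" in scope, error strings are made up):
 *
 *	if (argc < 2)
 *		TI_ERR("Too few arguments");	(returns -EINVAL)
 *	if (chunk_size > CHUNK_SIZE_MAX)
 *		TI_ERR_RET("Chunk size too large", -ERANGE);
 */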
/* Macro to define inline functions to access and modify IO flags. */
#define BITOPS(name, what, var, flag) \
static inline int TestClear ## name ## what(struct var *v) \
{ return test_and_clear_bit(flag, &v->io.flags); } \
static inline int TestSet ## name ## what(struct var *v) \
{ return test_and_set_bit(flag, &v->io.flags); } \
static inline void Clear ## name ## what(struct var *v) \
{ clear_bit(flag, &v->io.flags); } \
static inline void Set ## name ## what(struct var *v) \
{ set_bit(flag, &v->io.flags); } \
static inline int name ## what(struct var *v) \
{ return test_bit(flag, &v->io.flags); }
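/*
 * For instance, BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) below
 * generates the following accessors (sketch of the expansion), all
 * operating atomically on bit CHUNK_DIRTY of v->io.flags:
 *
 *	int TestClearChunkDirty(struct stripe_chunk *v);
 *	int TestSetChunkDirty(struct stripe_chunk *v);
 *	void ClearChunkDirty(struct stripe_chunk *v);
 *	void SetChunkDirty(struct stripe_chunk *v);
 *	int ChunkDirty(struct stripe_chunk *v);
 */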
/*-----------------------------------------------------------------
 * Stripe cache
 *
 * Cache for all reads and writes to raid sets (operational or degraded)
 *
 * We need to run all data to and from a RAID set through this cache,
 * because parity chunks need to get calculated from data chunks
 * or, in the degraded/resynchronization case, missing chunks need
 * to be reconstructed using the other chunks of the stripe.
 *---------------------------------------------------------------*/

/* A chunk within a stripe (holds bios hanging off). */
/* IO status flags for chunks of a stripe. */
enum chunk_flags {
	CHUNK_DIRTY,	/* Pages of chunk dirty; need writing. */
	CHUNK_ERROR,	/* IO error on any chunk page. */
	CHUNK_IO,	/* Allow/prohibit IO on chunk pages. */
	CHUNK_LOCKED,	/* Chunk pages locked during IO. */
	CHUNK_MUST_IO,	/* Chunk must io. */
	CHUNK_UNLOCK,	/* Enforce chunk unlock. */
	CHUNK_UPTODATE,	/* Chunk pages are uptodate. */
};

#if READ != 0 || WRITE != 1
#error dm-raid45: READ/WRITE != 0/1 used as index!!!
#endif

enum bl_type {
	WRITE_QUEUED = WRITE + 1,
	WRITE_MERGED,
	NR_BL_TYPES,	/* Must be last one! */
};
struct stripe_chunk {
	atomic_t cnt;		/* Reference count. */
	struct stripe *stripe;	/* Backpointer to stripe for endio(). */
	/* Bio lists for reads, writes, and writes merged. */
	struct bio_list bl[NR_BL_TYPES];
	struct {
		unsigned long flags;	/* IO status flags. */
	} io;
};

/* Define chunk bit operations. */
BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
/*
 * Stripe linked list indexes. Keep order, because the stripe
 * and the stripe cache rely on the first 3!
 */
enum list_types {
	LIST_FLUSH,	/* Stripes to flush for io. */
	LIST_ENDIO,	/* Stripes to endio. */
	LIST_LRU,	/* Least recently used stripes. */
	SC_NR_LISTS,	/* # of lists in stripe cache. */
	LIST_HASH = SC_NR_LISTS,	/* Hashed stripes. */
	LIST_RECOVER = LIST_HASH,	/* For recovery type stripes only. */
	STRIPE_NR_LISTS,		/* To size array in struct stripe. */
};

/* Addressing region recovery. */
struct recover_addr {
	struct dm_region *reg;	/* Actual region to recover. */
	sector_t pos;		/* Position within region to recover. */
	sector_t end;		/* End of region to recover. */
};
/* A stripe: the io object to handle all reads and writes to a RAID set. */
struct stripe {
	atomic_t cnt;			/* Reference count. */
	struct stripe_cache *sc;	/* Backpointer to stripe cache. */

	/*
	 * 4 linked lists:
	 *	o io list to flush io
	 *	o endio list
	 *	o LRU list to put stripes w/o reference count on
	 *	o stripe cache hash
	 */
	struct list_head lists[STRIPE_NR_LISTS];

	sector_t key;		/* Hash key. */
	region_t region;	/* Region stripe is mapped to. */

	struct {
		unsigned long flags;	/* Stripe state flags (see below). */

		/*
		 * Pending ios in flight:
		 *
		 * used to control move of stripe to endio list
		 */
		atomic_t pending;

		/* Sectors to read and write for multi page stripe sets. */
		unsigned size;
	} io;

	/* Address region recovery. */
	struct recover_addr *recover;

	/* Lock on stripe (Future: for clustering). */
	void *lock;

	struct {
		unsigned short parity;	/* Parity chunk index. */
		short recover;		/* Recovery chunk index. */
	} idx;

	/*
	 * This stripe's memory cache object (dm-mem-cache);
	 * i.e. the io chunk pages.
	 */
	struct dm_mem_cache_object *obj;

	/* Array of stripe sets (dynamically allocated). */
	struct stripe_chunk chunk[0];
};
/* States stripes can be in (flags field). */
enum stripe_states {
	STRIPE_ERROR,		/* io error on stripe. */
	STRIPE_MERGED,		/* Writes got merged to be written. */
	STRIPE_RBW,		/* Read-before-write stripe. */
	STRIPE_RECONSTRUCT,	/* Reconstruct of a missing chunk required. */
	STRIPE_RECONSTRUCTED,	/* Missing chunk got reconstructed. */
	STRIPE_RECOVER,		/* Stripe used for RAID set recovery. */
};

/* Define stripe bit operations. */
BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)

/* A stripe hash. */
struct stripe_hash {
	struct list_head *hash;
	unsigned buckets;
	unsigned mask;
	unsigned prime;
	unsigned shift;
};

/* Locks protecting lists of the stripe cache. */
enum sc_lock_types {
	LOCK_ENDIO,	/* Protect endio list. */
	LOCK_LRU,	/* Protect LRU list. */
	NR_LOCKS,	/* To size array in struct stripe_cache. */
};
/* A stripe cache. */
struct stripe_cache {
	/* Stripe hash. */
	struct stripe_hash hash;

	spinlock_t locks[NR_LOCKS];	/* Locks to protect lists. */

	/* Stripes with io to flush, stripes to endio and LRU lists. */
	struct list_head lists[SC_NR_LISTS];

	/* Slab cache to allocate stripes from. */
	struct {
		struct kmem_cache *cache;	/* Cache itself. */
		char name[32];			/* Unique name. */
	} kc;

	struct dm_io_client *dm_io_client; /* dm-io client resource context. */

	/* dm-mem-cache client resource context. */
	struct dm_mem_cache_client *mem_cache_client;

	int stripes_parm;	    /* # stripes parameter from constructor. */
	atomic_t stripes;	    /* actual # of stripes in cache. */
	atomic_t stripes_to_set;    /* # of stripes to resize cache to. */
	atomic_t stripes_last;	    /* last # of stripes in cache. */
	atomic_t active_stripes;    /* actual # of active stripes in cache. */

	/* REMOVEME: */
	atomic_t active_stripes_max; /* max # of active stripes in cache. */
};
/* Flag specs for raid_dev */
enum raid_dev_flags {
	DEV_FAILED,	/* Device failed. */
	DEV_IO_QUEUED,	/* Io got queued to device. */
};

/* The raid device in a set. */
struct raid_dev {
	struct dm_dev *dev;
	sector_t start;		/* Offset to map to. */
	struct {	/* Using struct to be able to BITOPS(). */
		unsigned long flags;	/* raid_dev_flags. */
	} io;
};

BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)

/* Flags spec for raid_set. */
enum raid_set_flags {
	RS_CHECK_OVERWRITE,	/* Check for chunk overwrites. */
	RS_DEAD,		/* RAID set inoperative. */
	RS_DEGRADED,		/* Io errors on RAID device. */
	RS_DEVEL_STATS,		/* REMOVEME: display status information. */
	RS_RECOVER,		/* Do recovery. */
	RS_RECOVERY_BANDWIDTH,	/* Allow recovery bandwidth (delayed bios). */
	RS_SC_BUSY,		/* Stripe cache busy -> send an event. */
	RS_SUSPEND,		/* Suspend RAID set. */
};
/* REMOVEME: devel stats counters. */
enum stats_types {
	S_BIOS_READ,
	S_BIOS_ADDED_READ,
	S_BIOS_ENDIO_READ,
	S_BIOS_WRITE,
	S_BIOS_ADDED_WRITE,
	S_BIOS_ENDIO_WRITE,
	S_DM_IO_READ,
	S_DM_IO_WRITE,
	S_BANDWIDTH,
	S_NO_BANDWIDTH,
	S_BARRIER,
	S_BIO_COPY_PL_NEXT,
	S_CAN_MERGE,
	S_CANT_MERGE,
	S_CHUNK_LOCKED,
	S_CONGESTED,
	S_NOT_CONGESTED,
	S_DEGRADED,
	S_DELAYED_BIOS,
	S_SUM_DELAYED_BIOS,
	S_FLUSHS,
	S_HITS_1ST,
	S_IOS_POST,
	S_INSCACHE,
	S_MAX_LOOKUP,
	S_NO_RW,
	S_NOSYNC,
	S_OVERWRITE,
	S_PROHIBITCHUNKIO,
	S_RECONSTRUCT_EI,
	S_RECONSTRUCT_DEV,
	S_RECONSTRUCT_SET,
	S_RECONSTRUCTED,
	S_REQUEUE,
	S_STRIPE_ERROR,
	S_XORS,
	S_NR_STATS,	/* # of stats counters. Must be last! */
};

/* Status type -> string mappings. */
struct stats_map {
	const enum stats_types type;
	const char *str;
};

static struct stats_map stats_map[] = {
	{ S_BIOS_READ, "r=" },
	{ S_BIOS_ADDED_READ, "/" },
	{ S_BIOS_ENDIO_READ, "/" },
	{ S_BIOS_WRITE, " w=" },
	{ S_BIOS_ADDED_WRITE, "/" },
	{ S_BIOS_ENDIO_WRITE, "/" },
	{ S_DM_IO_READ, " rc=" },
	{ S_DM_IO_WRITE, " wc=" },
	{ S_BANDWIDTH, "\nbw=" },
	{ S_NO_BANDWIDTH, " no_bw=" },
	{ S_BARRIER, "\nbarrier=" },
	{ S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
	{ S_CAN_MERGE, "\nmerge=" },
	{ S_CANT_MERGE, "/no_merge=" },
	{ S_CHUNK_LOCKED, "\nchunk_locked=" },
	{ S_CONGESTED, "\ncgst=" },
	{ S_NOT_CONGESTED, "/not_cgst=" },
	{ S_DEGRADED, "\ndegraded=" },
	{ S_DELAYED_BIOS, "\ndel_bios=" },
	{ S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
	{ S_FLUSHS, "\nflushs=" },
	{ S_HITS_1ST, "\nhits_1st=" },
	{ S_IOS_POST, " ios_post=" },
	{ S_INSCACHE, " inscache=" },
	{ S_MAX_LOOKUP, " maxlookup=" },
	{ S_NO_RW, "\nno_rw=" },
	{ S_NOSYNC, " nosync=" },
	{ S_OVERWRITE, " ovr=" },
	{ S_PROHIBITCHUNKIO, " prhbt_io=" },
	{ S_RECONSTRUCT_EI, "\nrec_ei=" },
	{ S_RECONSTRUCT_DEV, " rec_dev=" },
	{ S_RECONSTRUCT_SET, " rec_set=" },
	{ S_RECONSTRUCTED, " rec=" },
	{ S_REQUEUE, " requeue=" },
	{ S_STRIPE_ERROR, " stripe_err=" },
	{ S_XORS, " xors=" },
};

/*
 * A RAID set.
 */
#define	dm_rh_client	dm_region_hash
enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
typedef void (*xor_function_t)(unsigned count, unsigned long **data);
struct raid_set {
	struct dm_target *ti;	/* Target pointer. */

	struct {
		unsigned long flags;	/* State flags. */
		struct mutex in_lock;	/* Protects central input list below. */
		struct bio_list in;	/* Pending ios (central input list). */
		struct bio_list work;	/* ios work set. */
		wait_queue_head_t suspendq;	/* suspend synchronization. */
		atomic_t in_process;	/* counter of queued bios (suspendq). */
		atomic_t in_process_max;/* counter of queued bios max. */

		/* io work. */
		struct workqueue_struct *wq;
		struct delayed_work dws_do_raid;	/* For main worker. */
		struct work_struct ws_do_table_event;	/* For event worker. */
	} io;

	/* Stripe locking abstraction. */
	struct dm_raid45_locking_type *locking;

	struct stripe_cache sc;	/* Stripe cache for this set. */

	/* Xor optimization. */
	struct {
		struct xor_func *f;
		unsigned chunks;
		unsigned speed;
	} xor;

	/* Recovery parameters. */
	struct recover {
		struct dm_dirty_log *dl;	/* Dirty log. */
		struct dm_rh_client *rh;	/* Region hash. */

		struct dm_io_client *dm_io_client; /* recovery dm-io client. */
		/* dm-mem-cache client resource context for recovery stripes. */
		struct dm_mem_cache_client *mem_cache_client;

		struct list_head stripes;	/* List of recovery stripes. */

		region_t nr_regions;
		region_t nr_regions_to_recover;
		region_t nr_regions_recovered;
		unsigned long start_jiffies;
		unsigned long end_jiffies;

		unsigned bandwidth;	 /* Recovery bandwidth [%]. */
		unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
		unsigned bandwidth_parm; /*  " constructor parm. */
		unsigned io_size;	 /* recovery io size <= region size. */
		unsigned io_size_parm;	 /* recovery io size ctr parameter. */
		unsigned recovery;	 /* Recovery allowed/prohibited. */
		unsigned recovery_stripes; /* # of parallel recovery stripes. */

		/* recovery io throttling. */
		atomic_t io_count[IO_NR_COUNT];	/* counter recover/regular io.*/
		unsigned long last_jiffies;
	} recover;

	/* RAID set parameters. */
	struct {
		struct raid_type *raid_type;	/* RAID type (eg, RAID4). */
		unsigned raid_parms;	/* # variable raid parameters. */

		unsigned chunk_size;	/* Sectors per chunk. */
		unsigned chunk_size_parm;
		unsigned chunk_shift;	/* rsector chunk size shift. */

		unsigned io_size;	/* Sectors per io. */
		unsigned io_size_parm;
		unsigned io_mask;	/* Mask for bio_copy_page_list(). */
		unsigned io_inv_mask;	/* Mask for raid_address(). */

		sector_t sectors_per_dev;	/* Sectors per device. */

		atomic_t failed_devs;	/* Amount of devices failed. */

		/* Index of device to initialize. */
		int dev_to_init;
		int dev_to_init_parm;

		/* Raid devices dynamically allocated. */
		unsigned raid_devs;	/* # of RAID devices below. */
		unsigned data_devs;	/* # of RAID data devices. */

		int ei;		/* index of failed RAID device. */

		/* Index of dedicated parity device (i.e. RAID4). */
		int pi;
		int pi_parm;	/* constructor parm for status output. */
	} set;

	/* REMOVEME: devel stats counters. */
	atomic_t stats[S_NR_STATS];

	/* Dynamically allocated temporary pointers for xor(). */
	unsigned long **data;

	/* Dynamically allocated RAID devices. Alignment? */
	struct raid_dev dev[0];
};
/* Define RAID set bit operations. */
BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
BITOPS(RS, Dead, raid_set, RS_DEAD)
BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
BITOPS(RS, Recover, raid_set, RS_RECOVER)
BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
#undef BITOPS

/*-----------------------------------------------------------------
 * Raid-4/5 set structures.
 *---------------------------------------------------------------*/
/* RAID level definitions. */
enum raid_level {
	raid4,
	raid5,
};

/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
enum raid_algorithm {
	none,
	left_asym,
	right_asym,
	left_sym,
	right_sym,
};

struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const enum raid_level level;	/* RAID level. */
	const enum raid_algorithm algorithm;	/* RAID algorithm. */
};
/* Supported raid types and properties. */
static struct raid_type raid_types[] = {
	{"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
	{"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
	{"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
	{"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
	{"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
};

/* Address as calculated by raid_address(). */
struct raid_address {
	sector_t key;		/* Hash key (address of stripe % chunk_size). */
	unsigned di, pi;	/* Data and parity disks index. */
};
/* REMOVEME: reset statistics counters. */
static void stats_reset(struct raid_set *rs)
{
	unsigned s = S_NR_STATS;

	while (s--)
		atomic_set(rs->stats + s, 0);
}

/*----------------------------------------------------------------
 * RAID set management routines.
 *--------------------------------------------------------------*/

/*
 * Begin small helper functions.
 */

/* No need to be called from region hash indirectly at dm_rh_dec(). */
static void wake_dummy(void *context) {}
/* Return # of io references. */
static int io_ref(struct raid_set *rs)
{
	return atomic_read(&rs->io.in_process);
}

/* Get an io reference. */
static void io_get(struct raid_set *rs)
{
	int p = atomic_inc_return(&rs->io.in_process);

	if (p > atomic_read(&rs->io.in_process_max))
		atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
}

/* Put the io reference and conditionally wake io waiters. */
static void io_put(struct raid_set *rs)
{
	/* Intel: rebuild data corrupter? */
	if (atomic_dec_and_test(&rs->io.in_process))
		wake_up(&rs->io.suspendq);

	BUG_ON(io_ref(rs) < 0);
}

/* Wait until all io has been processed. */
static void wait_ios(struct raid_set *rs)
{
	wait_event(rs->io.suspendq, !io_ref(rs));
}

/* Queue (optionally delayed) io work. */
static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
{
	queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
}

/* Queue io work immediately (called from region hash too). */
static void wake_do_raid(void *context)
{
	struct raid_set *rs = context;

	queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
}
/* Calculate device sector offset. */
static sector_t _sector(struct raid_set *rs, struct bio *bio)
{
	sector_t sector = bio->bi_sector;

	sector_div(sector, rs->set.data_devs);
	return sector;
}
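/*
 * Illustrative example: with rs->set.data_devs = 4 and a bio starting
 * at set sector 1000, sector_div() leaves sector = 1000 / 4 = 250,
 * i.e. the approximate per-device offset of that set address.
 */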
/* Return # of active stripes in stripe cache. */
static int sc_active(struct stripe_cache *sc)
{
	return atomic_read(&sc->active_stripes);
}

/* Stripe cache busy indicator. */
static int sc_busy(struct raid_set *rs)
{
	return sc_active(&rs->sc) >
	       atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
}
/* Set chunks states. */
enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
{
	switch (type) {
	case CLEAN:
		ClearChunkDirty(chunk);
		break;
	case DIRTY:
		SetChunkDirty(chunk);
		break;
	case ERROR:
		SetChunkError(chunk);
		SetStripeError(chunk->stripe);
		return;
	default:
		BUG();
	}

	SetChunkUptodate(chunk);
	SetChunkIo(chunk);
	ClearChunkError(chunk);
}

/* Return region state for a sector. */
static int region_state(struct raid_set *rs, sector_t sector,
			enum dm_rh_region_states state)
{
	struct dm_rh_client *rh = rs->recover.rh;
	region_t region = dm_rh_sector_to_region(rh, sector);

	return !!(dm_rh_get_state(rh, region, 1) & state);
}
/*
 * Return true in case a chunk should be read/written.
 *
 * Conditions to read/write:
 *	o chunk not uptodate
 *	o chunk dirty
 *
 * Conditions to avoid io:
 *	o io already ongoing on chunk
 *	o io explicitly prohibited
 */
static int chunk_io(struct stripe_chunk *chunk)
{
	/* 2nd run optimization (flag set below on first run). */
	if (TestClearChunkMustIo(chunk))
		return 1;

	/* Avoid io if prohibited or a locked chunk. */
	if (!ChunkIo(chunk) || ChunkLocked(chunk))
		return 0;

	if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
		SetChunkMustIo(chunk); /* 2nd run optimization. */
		return 1;
	}

	return 0;
}
/* Call a function on each chunk needing io unless device failed. */
static unsigned for_each_io_dev(struct stripe *stripe,
				void (*f_io)(struct stripe *stripe, unsigned p))
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p, r = 0;

	for (p = 0; p < rs->set.raid_devs; p++) {
		if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
			f_io(stripe, p);
			r++;
		}
	}

	return r;
}

/*
 * Index of device to calculate parity on.
 *
 * Either the parity device index *or* the selected
 * device to init after a spare replacement.
 */
static int dev_for_parity(struct stripe *stripe, int *sync)
{
	struct raid_set *rs = RS(stripe->sc);
	int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);

	*sync = !r;

	/* Reconstruct a particular device ?. */
	if (r && rs->set.dev_to_init > -1)
		return rs->set.dev_to_init;
	else if (rs->set.raid_type->level == raid4)
		return rs->set.pi;
	else if (!StripeRecover(stripe))
		return stripe->idx.parity;
	else
		return -1;
}
/* RAID set congested function. */
static int rs_congested(void *congested_data, int bdi_bits)
{
	int r;
	unsigned p;
	struct raid_set *rs = congested_data;

	if (sc_busy(rs) || RSSuspend(rs))
		r = 1;
	else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
		/* If any of our component devices are overloaded. */
		struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);

		r |= bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
	return r;
}
/* RAID device degrade check. */
static void rs_check_degrade_dev(struct raid_set *rs,
				 struct stripe *stripe, unsigned p)
{
	if (TestSetDevFailed(rs->dev + p))
		return;

	/* Throw an event in case of member device errors. */
	if (atomic_inc_return(&rs->set.failed_devs) >
	    rs->set.raid_type->parity_devs &&
	    !TestSetRSDead(rs)) {
		/* Display RAID set dead message once. */
		char buf[BDEVNAME_SIZE];

		DMERR("FATAL: too many devices failed -> RAID set broken");
		for (p = 0; p < rs->set.raid_devs; p++) {
			if (DevFailed(rs->dev + p))
				DMERR("device /dev/%s failed",
				      bdevname(rs->dev[p].dev->bdev, buf));
		}
	}

	/* Only log the first member error. */
	if (!TestSetRSDegraded(rs)) {
		char buf[BDEVNAME_SIZE];

		/* Store index for recovery. */
		rs->set.ei = p;
		DMERR("CRITICAL: %sio error on device /dev/%s "
		      "in region=%llu; DEGRADING RAID set\n",
		      stripe ? "" : "FAKED ",
		      bdevname(rs->dev[p].dev->bdev, buf),
		      (unsigned long long) (stripe ? stripe->key : 0));
		DMERR("further device error messages suppressed");
	}

	schedule_work(&rs->io.ws_do_table_event);
}
/* RAID set degrade check. */
static void rs_check_degrade(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;

	while (p--) {
		if (ChunkError(CHUNK(stripe, p)))
			rs_check_degrade_dev(rs, stripe, p);
	}
}

/* Lookup a RAID device by name or by major:minor number. */
static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
{
	unsigned p;
	struct raid_dev *dev;

	/*
	 * Must be an incremental loop, because the device array
	 * can have empty slots still on calls from raid_ctr()
	 */
	for (dev = rs->dev, p = 0;
	     dev->dev && p < rs->set.raid_devs;
	     dev++, p++) {
		if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
			return p;
	}

	return -ENODEV;
}

/*
 * End small helper functions.
 */

/*
 * Stripe hash functions
 */
/* Initialize/destroy stripe hash. */
static int hash_init(struct stripe_hash *hash, unsigned stripes)
{
	unsigned buckets = 2, max_buckets = stripes >> 1;
	static unsigned hash_primes[] = {
		/* Table of primes for hash_fn/table size optimization. */
		1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
		1543, 3079, 6151, 12289, 24593, 49157, 98317,
	};

	/* Calculate number of buckets (2^^n <= stripes / 2). */
	while (buckets < max_buckets)
		buckets <<= 1;

	/* Allocate stripe hash buckets. */
	hash->hash = vmalloc(buckets * sizeof(*hash->hash));
	if (!hash->hash)
		return -ENOMEM;

	hash->buckets = buckets;
	hash->mask = buckets - 1;
	hash->shift = ffs(buckets);
	if (hash->shift > ARRAY_SIZE(hash_primes))
		hash->shift = ARRAY_SIZE(hash_primes) - 1;

	BUG_ON(hash->shift < 2);
	hash->prime = hash_primes[hash->shift];

	/* Initialize buckets. */
	while (buckets--)
		INIT_LIST_HEAD(hash->hash + buckets);

	return 0;
}

static void hash_exit(struct stripe_hash *hash)
{
	if (hash->hash) {
		vfree(hash->hash);
		hash->hash = NULL;
	}
}
static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
{
	return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
}

static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
{
	return hash->hash + hash_fn(hash, key);
}
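/*
 * Worked example (numbers illustrative): with 64 buckets, hash_init()
 * sets mask = 63, shift = ffs(64) = 7 and prime = hash_primes[7] = 97;
 * a key of 1024 then maps to bucket ((1024 * 97) >> 7) & 63 = 8.
 */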
/* Insert an entry into a hash. */
static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
{
	list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
}

/* Lookup an entry in the stripe hash. */
static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
{
	unsigned look = 0;
	struct stripe *stripe;
	struct list_head *bucket = hash_bucket(&sc->hash, key);

	list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
		look++;

		if (stripe->key == key) {
			/* REMOVEME: statistics. */
			if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
				atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);

			return stripe;
		}
	}

	return NULL;
}
/* Resize the stripe cache hash on size changes. */
static int sc_hash_resize(struct stripe_cache *sc)
{
	/* Resize indicated ? */
	if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
		int r;
		struct stripe_hash hash;

		r = hash_init(&hash, atomic_read(&sc->stripes));
		if (r)
			return r;

		if (sc->hash.hash) {
			unsigned b = sc->hash.buckets;
			struct list_head *pos, *tmp;

			/* Walk old buckets and insert into new. */
			while (b--)
				list_for_each_safe(pos, tmp, sc->hash.hash + b)
					stripe_insert(&hash,
						      list_entry(pos,
								 struct stripe,
								 lists[LIST_HASH]));
		}

		hash_exit(&sc->hash);
		memcpy(&sc->hash, &hash, sizeof(sc->hash));
		atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
	}

	return 0;
}
/* End stripe hash functions. */
/* List add, delete, push and pop functions. */
/* Delete a list entry if it is on a list. */
#define DEL_LIST(lh) \
	if (!list_empty(lh)) \
		list_del_init(lh);

/* Delete stripe from hash. */
static void stripe_hash_del(struct stripe *stripe)
{
	DEL_LIST(stripe->lists + LIST_HASH);
}

/* Return stripe reference count. */
static inline int stripe_ref(struct stripe *stripe)
{
	return atomic_read(&stripe->cnt);
}

/* Add stripe to flush list. */
static void stripe_flush_add(struct stripe *stripe)
{
	struct stripe_cache *sc = stripe->sc;
	struct list_head *lh = stripe->lists + LIST_FLUSH;

	if (!StripeReconstruct(stripe) && list_empty(lh))
		list_add_tail(lh, sc->lists + LIST_FLUSH);
}
/*
 * Add stripe to LRU (inactive) list.
 *
 * Need lock, because of concurrent access from message interface.
 */
static void stripe_lru_add(struct stripe *stripe)
{
	if (!StripeRecover(stripe)) {
		unsigned long flags;
		struct list_head *lh = stripe->lists + LIST_LRU;
		spinlock_t *lock = stripe->sc->locks + LOCK_LRU;

		spin_lock_irqsave(lock, flags);
		if (list_empty(lh))
			list_add_tail(lh, stripe->sc->lists + LIST_LRU);
		spin_unlock_irqrestore(lock, flags);
	}
}

#define POP_LIST(list) \
	do { \
		if (list_empty(sc->lists + (list))) \
			stripe = NULL; \
		else { \
			stripe = list_first_entry(sc->lists + (list), \
						  struct stripe, \
						  lists[(list)]); \
			list_del_init(stripe->lists + (list)); \
		} \
	} while (0)

/* Pop an available stripe off the LRU list. */
static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;
	spinlock_t *lock = sc->locks + LOCK_LRU;

	spin_lock_irq(lock);
	POP_LIST(LIST_LRU);
	spin_unlock_irq(lock);

	return stripe;
}

/* Pop an available stripe off the io list. */
static struct stripe *stripe_io_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;

	POP_LIST(LIST_FLUSH);
	return stripe;
}
/* Push a stripe safely onto the endio list to be handled by do_endios(). */
static void stripe_endio_push(struct stripe *stripe)
{
	unsigned long flags;
	struct stripe_cache *sc = stripe->sc;
	struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
			 *sc_list = sc->lists + LIST_ENDIO;
	spinlock_t *lock = sc->locks + LOCK_ENDIO;

	/* This runs in parallel with do_endios(). */
	spin_lock_irqsave(lock, flags);
	if (list_empty(stripe_list))
		list_add_tail(stripe_list, sc_list);
	spin_unlock_irqrestore(lock, flags);

	wake_do_raid(RS(sc)); /* Wake myself. */
}

/* Pop a stripe safely off the endio list. */
static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;
	spinlock_t *lock = sc->locks + LOCK_ENDIO;

	/* This runs in parallel with endio(). */
	spin_lock_irq(lock);
	POP_LIST(LIST_ENDIO);
	spin_unlock_irq(lock);

	return stripe;
}
/*
 * Stripe cache locking functions
 */

/* Dummy lock function for single host RAID4+5. */
static void *no_lock(sector_t key, enum dm_lock_type type)
{
	return &no_lock;
}

/* Dummy unlock function for single host RAID4+5. */
static void no_unlock(void *lock_handle)
{
}

/* No locking (for single host RAID 4+5). */
static struct dm_raid45_locking_type locking_none = {
	.lock = no_lock,
	.unlock = no_unlock,
};

/* Lock a stripe (for clustering). */
static int
stripe_lock(struct stripe *stripe, int rw, sector_t key)
{
	stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ?
						     DM_RAID45_SHARED :
						     DM_RAID45_EX);
	return stripe->lock ? 0 : -EPERM;
}

/* Unlock a stripe (for clustering). */
static void stripe_unlock(struct stripe *stripe)
{
	RS(stripe->sc)->locking->unlock(stripe->lock);
	stripe->lock = NULL;
}
/* Test io pending on stripe. */
static int stripe_io_ref(struct stripe *stripe)
{
	return atomic_read(&stripe->io.pending);
}

static void stripe_io_get(struct stripe *stripe)
{
	if (atomic_inc_return(&stripe->io.pending) == 1)
		/* REMOVEME: statistics */
		atomic_inc(&stripe->sc->active_stripes);

	BUG_ON(stripe_io_ref(stripe) < 0);
}

static void stripe_io_put(struct stripe *stripe)
{
	if (atomic_dec_and_test(&stripe->io.pending)) {
		if (unlikely(StripeRecover(stripe)))
			/* Don't put recovery stripe on endio list. */
			wake_do_raid(RS(stripe->sc));
		else
			/* Add regular stripe to endio list and wake daemon. */
			stripe_endio_push(stripe);

		/* REMOVEME: statistics */
		atomic_dec(&stripe->sc->active_stripes);
	}

	BUG_ON(stripe_io_ref(stripe) < 0);
}
/* Take stripe reference out. */
static int stripe_get(struct stripe *stripe)
{
	int r;
	struct list_head *lh = stripe->lists + LIST_LRU;
	spinlock_t *lock = stripe->sc->locks + LOCK_LRU;

	/* Delete stripe from LRU (inactive) list if on. */
	spin_lock_irq(lock);
	DEL_LIST(lh);
	spin_unlock_irq(lock);

	BUG_ON(stripe_ref(stripe) < 0);

	/* Lock stripe on first reference */
	r = (atomic_inc_return(&stripe->cnt) == 1) ?
	    stripe_lock(stripe, WRITE, stripe->key) : 0;

	return r;
}

/* Return references on a chunk. */
static int chunk_ref(struct stripe_chunk *chunk)
{
	return atomic_read(&chunk->cnt);
}

/* Take out reference on a chunk. */
static int chunk_get(struct stripe_chunk *chunk)
{
	return atomic_inc_return(&chunk->cnt);
}

/* Drop reference on a chunk. */
static void chunk_put(struct stripe_chunk *chunk)
{
	BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
}
/*
 * Drop reference on a stripe.
 *
 * Move it to list of LRU stripes if zero.
 */
static void stripe_put(struct stripe *stripe)
{
	if (atomic_dec_and_test(&stripe->cnt)) {
		BUG_ON(stripe_io_ref(stripe));
		stripe_unlock(stripe);
		stripe_lru_add(stripe);
	}

	BUG_ON(stripe_ref(stripe) < 0);
}

/* Helper needed by for_each_io_dev(). */
static void stripe_get_references(struct stripe *stripe, unsigned p)
{
	/*
	 * Another one to reference the stripe in
	 * order to protect vs. LRU list moves.
	 */
	io_get(RS(stripe->sc));	/* Global io references. */
	stripe_get(stripe);
	stripe_io_get(stripe);	/* One for each chunk io. */
}

/* Helper for endio() to put all taken references. */
static void stripe_put_references(struct stripe *stripe)
{
	stripe_io_put(stripe);	/* One for each chunk io. */
	stripe_put(stripe);
	io_put(RS(stripe->sc));
}
/*
 * Stripe cache functions.
 */

/*
 * Invalidate all chunks (i.e. their pages) of a stripe.
 *
 * I only keep state for the whole chunk.
 */
static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
{
	chunk->io.flags = 0;
}

static void
stripe_chunks_invalidate(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--)
		stripe_chunk_invalidate(CHUNK(stripe, p));
}

/* Prepare stripe for (re)use. */
static void stripe_invalidate(struct stripe *stripe)
{
	stripe->io.flags = 0;
	stripe->idx.parity = stripe->idx.recover = -1;
	stripe_chunks_invalidate(stripe);
}

/*
 * Allow io on all chunks of a stripe.
 * If not set, IO will not occur; i.e. it's prohibited.
 *
 * Actual IO submission for allowed chunks depends
 * on their !uptodate or dirty state.
 */
static void stripe_allow_io(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--)
		SetChunkIo(CHUNK(stripe, p));
}
/* Initialize a stripe. */
static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
{
	unsigned i, p = RS(sc)->set.raid_devs;

	/* Work all io chunks. */
	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);

		atomic_set(&chunk->cnt, 0);
		chunk->stripe = stripe;
		i = ARRAY_SIZE(chunk->bl);
		while (i--)
			bio_list_init(chunk->bl + i);
	}

	stripe->sc = sc;

	i = ARRAY_SIZE(stripe->lists);
	while (i--)
		INIT_LIST_HEAD(stripe->lists + i);

	stripe->io.size = RS(sc)->set.io_size;
	atomic_set(&stripe->cnt, 0);
	atomic_set(&stripe->io.pending, 0);
	stripe_invalidate(stripe);
}

/* Number of pages per chunk. */
static inline unsigned chunk_pages(unsigned sectors)
{
	return dm_div_up(sectors, SECTORS_PER_PAGE);
}

/* Number of pages per stripe. */
static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
{
	return chunk_pages(io_size) * rs->set.raid_devs;
}
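/*
 * Worked example (assuming 4 KiB pages, i.e. SECTORS_PER_PAGE == 8):
 * with the default io size of IO_SIZE_DEFAULT = 8 sectors,
 * chunk_pages(8) = 1 page per chunk, so a 5 device set needs
 * stripe_pages() = 1 * 5 = 5 pages per stripe.
 */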
/* Initialize part of page_list (recovery). */
static void stripe_zero_pl_part(struct stripe *stripe, int p,
				unsigned start, unsigned count)
{
	unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
	/* Get offset into the page_list. */
	struct page_list *pl = pl_elem(PL(stripe, p), o);

	BUG_ON(!pl);
	while (pl && pages--) {
		BUG_ON(!pl->page);
		memset(page_address(pl->page), 0, PAGE_SIZE);
		pl = pl->next;
	}
}

/* Initialize parity chunk of stripe. */
static void stripe_zero_chunk(struct stripe *stripe, int p)
{
	if (p > -1)
		stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
}

/* Return dynamic stripe structure size. */
static size_t stripe_size(struct raid_set *rs)
{
	return sizeof(struct stripe) +
	       rs->set.raid_devs * sizeof(struct stripe_chunk);
}
/* Allocate a stripe and its memory object. */
/* XXX adjust to cope with stripe cache and recovery stripe caches. */
enum grow { SC_GROW, SC_KEEP };
static struct stripe *stripe_alloc(struct stripe_cache *sc,
				   struct dm_mem_cache_client *mc,
				   enum grow grow)
{
	int r;
	struct stripe *stripe;

	stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
	if (stripe) {
		/* Grow the dm-mem-cache by one object. */
		if (grow == SC_GROW) {
			r = dm_mem_cache_grow(mc, 1);
			if (r)
				goto err_free;
		}

		stripe->obj = dm_mem_cache_alloc(mc);
		if (IS_ERR(stripe->obj))
			goto err_shrink;

		stripe_init(sc, stripe);
	}

	return stripe;

err_shrink:
	if (grow == SC_GROW)
		dm_mem_cache_shrink(mc, 1);
err_free:
	kmem_cache_free(sc->kc.cache, stripe);
	return NULL;
}

/*
 * Free a stripe's memory object, shrink the
 * memory cache and free the stripe itself.
 */
static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
{
	dm_mem_cache_free(mc, stripe->obj);
	dm_mem_cache_shrink(mc, 1);
	kmem_cache_free(stripe->sc->kc.cache, stripe);
}
/* Free the recovery stripe. */
static void stripe_recover_free(struct raid_set *rs)
{
	struct recover *rec = &rs->recover;
	struct dm_mem_cache_client *mc;

	mc = rec->mem_cache_client;
	rec->mem_cache_client = NULL;
	if (mc) {
		struct stripe *stripe;

		while (!list_empty(&rec->stripes)) {
			stripe = list_first_entry(&rec->stripes, struct stripe,
						  lists[LIST_RECOVER]);
			list_del(stripe->lists + LIST_RECOVER);
			kfree(stripe->recover);
			stripe_free(stripe, mc);
		}

		dm_mem_cache_client_destroy(mc);
		dm_io_client_destroy(rec->dm_io_client);
		rec->dm_io_client = NULL;
	}
}
/* Grow stripe cache. */
static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
{
	int r = 0;

	/* Try to allocate this many (additional) stripes. */
	while (stripes--) {
		struct stripe *stripe =
			stripe_alloc(sc, sc->mem_cache_client, grow);

		if (likely(stripe)) {
			stripe_lru_add(stripe);
			atomic_inc(&sc->stripes);
		} else {
			r = -ENOMEM;
			break;
		}
	}

	return r ? r : sc_hash_resize(sc);
}

/* Shrink stripe cache. */
static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
{
	int r = 0;

	/* Try to get unused stripe from LRU list. */
	while (stripes--) {
		struct stripe *stripe;

		stripe = stripe_lru_pop(sc);
		if (stripe) {
			/* An LRU stripe may never have ios pending! */
			BUG_ON(stripe_io_ref(stripe));
			BUG_ON(stripe_ref(stripe));
			atomic_dec(&sc->stripes);
			/* Remove from hash if on before deletion. */
			stripe_hash_del(stripe);
			stripe_free(stripe, sc->mem_cache_client);
		} else {
			r = -ENOENT;
			break;
		}
	}

	/* Check if stats are still sane. */
	if (atomic_read(&sc->active_stripes_max) >
	    atomic_read(&sc->stripes))
		atomic_set(&sc->active_stripes_max, 0);

	if (r)
		return r;

	return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
}
/* Create stripe cache and recovery. */
static int sc_init(struct raid_set *rs, unsigned stripes)
{
	unsigned i, r, rstripes;
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;
	struct recover *rec = &rs->recover;
	struct mapped_device *md;
	struct gendisk *disk;

	/* Initialize lists and locks. */
	i = ARRAY_SIZE(sc->lists);
	while (i--)
		INIT_LIST_HEAD(sc->lists + i);

	INIT_LIST_HEAD(&rec->stripes);

	/* Initialize endio and LRU list locks. */
	i = NR_LOCKS;
	while (i--)
		spin_lock_init(sc->locks + i);

	/* Initialize atomic variables. */
	atomic_set(&sc->stripes, 0);
	atomic_set(&sc->stripes_to_set, 0);
	atomic_set(&sc->active_stripes, 0);
	atomic_set(&sc->active_stripes_max, 0);	/* REMOVEME: statistics. */

	/*
	 * We need a runtime unique # to suffix the kmem cache name
	 * because we'll have one for each active RAID set.
	 */
	md = dm_table_get_md(rs->ti->table);
	disk = dm_disk(md);
	sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor);
	dm_put(md);
	sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
					 0, 0, NULL);
	if (!sc->kc.cache)
		return -ENOMEM;

	/* Create memory cache client context for RAID stripe cache. */
	sc->mem_cache_client =
		dm_mem_cache_client_create(stripes, rs->set.raid_devs,
					   chunk_pages(rs->set.io_size));
	if (IS_ERR(sc->mem_cache_client))
		return PTR_ERR(sc->mem_cache_client);

	/* Create memory cache client context for RAID recovery stripe(s). */
	rstripes = rec->recovery_stripes;
	rec->mem_cache_client =
		dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
					   chunk_pages(rec->io_size));
	if (IS_ERR(rec->mem_cache_client))
		return PTR_ERR(rec->mem_cache_client);

	/* Create dm-io client context for IO stripes. */
	sc->dm_io_client =
		dm_io_client_create((stripes > 32 ? 32 : stripes) *
				    rs->set.raid_devs *
				    chunk_pages(rs->set.io_size));
	if (IS_ERR(sc->dm_io_client))
		return PTR_ERR(sc->dm_io_client);

	/* FIXME: intermingled with stripe cache initialization. */
	/* Create dm-io client context for recovery stripes. */
	rec->dm_io_client =
		dm_io_client_create(rstripes * rs->set.raid_devs *
				    chunk_pages(rec->io_size));
	if (IS_ERR(rec->dm_io_client))
		return PTR_ERR(rec->dm_io_client);

	/* Allocate stripes for set recovery. */
	while (rstripes--) {
		stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
		if (!stripe)
			return -ENOMEM;

		stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
		if (!stripe->recover) {
			stripe_free(stripe, rec->mem_cache_client);
			return -ENOMEM;
		}

		SetStripeRecover(stripe);
		stripe->io.size = rec->io_size;
		list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
		/* Don't add recovery stripes to LRU list! */
	}

	/*
	 * Allocate the stripe objects from the
	 * cache and add them to the LRU list.
	 */
	r = sc_grow(sc, stripes, SC_KEEP);
	if (!r)
		atomic_set(&sc->stripes_last, stripes);

	return r;
}
/* Destroy the stripe cache. */
static void sc_exit(struct stripe_cache *sc)
{
	struct raid_set *rs = RS(sc);

	if (sc->kc.cache) {
		stripe_recover_free(rs);
		BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
		kmem_cache_destroy(sc->kc.cache);
		sc->kc.cache = NULL;
	}

	if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
		dm_mem_cache_client_destroy(sc->mem_cache_client);

	if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
		dm_io_client_destroy(sc->dm_io_client);

	hash_exit(&sc->hash);
}
/*
 * Calculate RAID address
 *
 * Delivers tuple with the index of the data disk holding the chunk
 * in the set, the parity disk's index and the start of the stripe
 * within the address space of the set (used as the stripe cache hash key).
 */
static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
					 struct raid_address *addr)
{
	sector_t stripe, tmp;

	/*
	 * chunk_number = sector / chunk_size
	 * stripe_number = chunk_number / data_devs
	 * di = stripe % data_devs;
	 */
	stripe = sector >> rs->set.chunk_shift;
	addr->di = sector_div(stripe, rs->set.data_devs);

	switch (rs->set.raid_type->level) {
	case raid4:
		addr->pi = rs->set.pi;
		goto check_shift_di;
	case raid5:
		tmp = stripe;
		addr->pi = sector_div(tmp, rs->set.raid_devs);

		switch (rs->set.raid_type->algorithm) {
		case left_asym:		/* Left asymmetric. */
			addr->pi = rs->set.data_devs - addr->pi;
		case right_asym:	/* Right asymmetric. */
check_shift_di:
			if (addr->di >= addr->pi)
				addr->di++;
			break;
		case left_sym:		/* Left symmetric. */
			addr->pi = rs->set.data_devs - addr->pi;
		case right_sym:		/* Right symmetric. */
			addr->di = (addr->pi + addr->di + 1) %
				   rs->set.raid_devs;
			break;
		case none: /* Ain't happen: RAID4 algorithm placeholder. */
			BUG();
		}
	}

	/*
	 * Start offset of the stripe's chunk on any single device of the RAID
	 * set, adjusted in case io size differs from chunk size.
	 */
	addr->key = (stripe << rs->set.chunk_shift) +
		    (sector & rs->set.io_inv_mask);
	return addr;
}
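/*
 * Worked example (illustrative): a "raid5_la" set with 4 devices (3
 * data + 1 parity), chunk_size = 64 sectors (chunk_shift = 6), sector
 * 1000, assuming io size equals chunk size (io_inv_mask term is 0):
 *
 *	chunk number	= 1000 >> 6 = 15
 *	stripe		= 15 / 3 = 5,	di = 15 % 3 = 0
 *	pi		= 5 % 4 = 1,	left_asym -> pi = 3 - 1 = 2
 *	di stays 0 (0 < pi),		key = 5 << 6 = 320
 */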
/*
 * Copy data across between stripe pages and bio vectors.
 *
 * Pay attention to data alignment in stripe and bio pages.
 */
static void bio_copy_page_list(int rw, struct stripe *stripe,
			       struct page_list *pl, struct bio *bio)
{
	unsigned i, page_offset;
	void *page_addr;
	struct raid_set *rs = RS(stripe->sc);
	struct bio_vec *bv;

	/* Get start page in page list for this sector. */
	i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
	pl = pl_elem(pl, i);
	BUG_ON(!pl);

	page_addr = page_address(pl->page);
	page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));

	/* Walk all segments and copy data across between bio_vecs and pages. */
	bio_for_each_segment(bv, bio, i) {
		int len = bv->bv_len, size;
		unsigned bio_offset = 0;
		void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);

		while (len) {
			size = (page_offset + len > PAGE_SIZE) ?
			       PAGE_SIZE - page_offset : len;

			if (rw == READ)
				memcpy(bio_addr + bio_offset,
				       page_addr + page_offset, size);
			else
				memcpy(page_addr + page_offset,
				       bio_addr + bio_offset, size);

			page_offset += size;
			bio_offset += size;
			len -= size;

			if (page_offset == PAGE_SIZE && len) {
				/*
				 * We reached the end of the chunk page ->
				 * need to refer to the next one to copy
				 * more data.
				 */
				/* Get next page. */
				pl = pl->next;
				BUG_ON(!pl);
				page_addr = page_address(pl->page);
				page_offset = 0;

				/* REMOVEME: statistics. */
				atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
			}
		}

		__bio_kunmap_atomic(bio_addr, KM_USER0);
	}
}
/*
 * Xor optimization macros.
 */
/* Xor data pointer declaration and initialization macros. */
#define DECLARE_2	unsigned long *d0 = data[0], *d1 = data[1]
#define DECLARE_3	DECLARE_2, *d2 = data[2]
#define DECLARE_4	DECLARE_3, *d3 = data[3]
#define DECLARE_5	DECLARE_4, *d4 = data[4]
#define DECLARE_6	DECLARE_5, *d5 = data[5]
#define DECLARE_7	DECLARE_6, *d6 = data[6]
#define DECLARE_8	DECLARE_7, *d7 = data[7]

/* Xor unroll macros. */
#define D2(n)	d0[n] = d0[n] ^ d1[n]
#define D3(n)	D2(n) ^ d2[n]
#define D4(n)	D3(n) ^ d3[n]
#define D5(n)	D4(n) ^ d4[n]
#define D6(n)	D5(n) ^ d5[n]
#define D7(n)	D6(n) ^ d6[n]
#define D8(n)	D7(n) ^ d7[n]

#define	X_2(macro, offset)	macro(offset); macro(offset + 1);
#define	X_4(macro, offset)	X_2(macro, offset); X_2(macro, offset + 2);
#define	X_8(macro, offset)	X_4(macro, offset); X_4(macro, offset + 4);
#define	X_16(macro, offset)	X_8(macro, offset); X_8(macro, offset + 8);
#define	X_32(macro, offset)	X_16(macro, offset); X_16(macro, offset + 16);
#define	X_64(macro, offset)	X_32(macro, offset); X_32(macro, offset + 32);
/* Define a _xor_#chunks_#xors_per_run() function. */
#define	_XOR(chunks, xors_per_run) \
static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
{ \
	unsigned end = XOR_SIZE / sizeof(data[0]), i; \
	DECLARE_ ## chunks; \
\
	for (i = 0; i < end; i += xors_per_run) { \
		X_ ## xors_per_run(D ## chunks, i); \
	} \
}

/* Define xor functions for 2 - 8 chunks and xors per run. */
#define	MAKE_XOR_PER_RUN(xors_per_run) \
	_XOR(2, xors_per_run); _XOR(3, xors_per_run); \
	_XOR(4, xors_per_run); _XOR(5, xors_per_run); \
	_XOR(6, xors_per_run); _XOR(7, xors_per_run); \
	_XOR(8, xors_per_run);

MAKE_XOR_PER_RUN(8)	/* Define _xor_*_8() functions. */
MAKE_XOR_PER_RUN(16)	/* Define _xor_*_16() functions. */
MAKE_XOR_PER_RUN(32)	/* Define _xor_*_32() functions. */
MAKE_XOR_PER_RUN(64)	/* Define _xor_*_64() functions. */
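/*
 * Sketch of what e.g. _XOR(2, 8) above expands to (paraphrased, not
 * literal preprocessor output):
 *
 *	static void _xor2_8(unsigned long **data)
 *	{
 *		unsigned end = XOR_SIZE / sizeof(data[0]), i;
 *		unsigned long *d0 = data[0], *d1 = data[1];
 *
 *		for (i = 0; i < end; i += 8) {
 *			d0[i] = d0[i] ^ d1[i];
 *			d0[i + 1] = d0[i + 1] ^ d1[i + 1];
 *			... six more unrolled D2() statements via X_8() ...
 *		}
 *	}
 */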
#define MAKE_XOR(xors_per_run) \
struct { \
	void (*f)(unsigned long **); \
} static xor_funcs ## xors_per_run[] = { \
	{ NULL }, /* NULL pointers to optimize indexing in xor(). */ \
	{ NULL }, \
	{ _xor2_ ## xors_per_run }, \
	{ _xor3_ ## xors_per_run }, \
	{ _xor4_ ## xors_per_run }, \
	{ _xor5_ ## xors_per_run }, \
	{ _xor6_ ## xors_per_run }, \
	{ _xor7_ ## xors_per_run }, \
	{ _xor8_ ## xors_per_run }, \
}; \
\
static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
{ \
	/* Call respective function for amount of chunks. */ \
	xor_funcs ## xors_per_run[n].f(data); \
}

/* Define xor_8() - xor_64() functions. */
MAKE_XOR(8)
MAKE_XOR(16)
MAKE_XOR(32)
MAKE_XOR(64)

/* Maximum number of chunks, which can be xor'ed in one go. */
#define	XOR_CHUNKS_MAX	(ARRAY_SIZE(xor_funcs8) - 1)

static void xor_blocks_wrapper(unsigned n, unsigned long **data)
{
	BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
	xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
}

struct xor_func {
	xor_function_t f;
	const char *name;
} static xor_funcs[] = {
	{ xor_8, "xor_8" },
	{ xor_16, "xor_16" },
	{ xor_32, "xor_32" },
	{ xor_64, "xor_64" },
	{ xor_blocks_wrapper, "xor_blocks" },
};
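/*
 * The fastest entry of this table is determined at run time (see the
 * "run time optimization of xor algorithm" note in the file header)
 * and cached in rs->xor.f, so xor() below can dispatch through
 * rs->xor.f->f() without any further selection logic per call.
 */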
/*
 * Check if chunk has to be xored in/out:
 *
 *	o if writes are queued
 *	o if writes are merged
 *	o if stripe is to be reconstructed
 *	o if recovery stripe
 */
static inline int chunk_must_xor(struct stripe_chunk *chunk)
{
	if (ChunkUptodate(chunk)) {
		BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
		       !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));

		if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
		    !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
			return 1;

		if (StripeReconstruct(chunk->stripe) ||
		    StripeRecover(chunk->stripe))
			return 1;
	}

	return 0;
}

/*
 * Xor all chunks of a stripe into the parity (or recovery) chunk.
 *
 * This indexes into the chunks of a stripe and their pages.
 *
 * All chunks will be xored into the indexed (@pi)
 * chunk in maximum groups of xor.chunks.
 */
static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned max_chunks = rs->xor.chunks, n = 1,
		 o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
		 p = rs->set.raid_devs;
	unsigned long **d = rs->data;
	xor_function_t xor_f = rs->xor.f->f;

	BUG_ON(sector > stripe->io.size);

	/* Address of parity page to xor into. */
	d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);

	while (p--) {
		/* Preset pointers to data pages. */
		if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
			d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);

		/* If max chunks -> xor. */
		if (n == max_chunks) {
			xor_f(n, d);
			n = 1;
		}
	}

	/* If chunks -> xor. */
	if (n > 1)
		xor_f(n, d);
}
1927 static void common_xor(struct stripe *stripe, sector_t count,
1928 unsigned off, unsigned pi)
1933 for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
1934 xor(stripe, pi, sector);
1936 /* Set parity page uptodate and clean. */
1937 chunk_set(CHUNK(stripe, pi), CLEAN);
1938 atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
/*
 * Calculate parity sectors on intact stripes.
 *
 * Need to calculate raid address for recover stripe, because its
 * chunk size differs and is typically larger than io chunk size.
 */
static void parity_xor(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
		 xor_size = chunk_size > io_size ? io_size : chunk_size;
	sector_t off;

	/* This can be the recover stripe with a larger io size. */
	for (off = 0; off < io_size; off += xor_size) {
		/*
		 * Recover stripe is likely bigger than regular io
		 * ones and has no precalculated parity disk index ->
		 * need to calculate RAID address.
		 */
		if (unlikely(StripeRecover(stripe))) {
			struct raid_address addr;

			raid_address(rs,
				     (stripe->key + off) * rs->set.data_devs,
				     &addr);
			stripe->idx.parity = addr.pi;
			stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
		}

		common_xor(stripe, xor_size, off, stripe->idx.parity);
		chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
	}
}
/* Reconstruct missing chunk. */
static void stripe_reconstruct(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	int p = rs->set.raid_devs, pr = stripe->idx.recover;

	BUG_ON(pr < 0);

	/* Check if all but the chunk to be reconstructed are uptodate. */
	while (p--)
		BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
						 S_RECONSTRUCT_DEV));
	/* Zero chunk to be reconstructed. */
	stripe_zero_chunk(stripe, pr);
	common_xor(stripe, stripe->io.size, 0, pr);
	stripe->idx.recover = -1;
}
/*
 * Recovery io throttling
 */
/* Conditionally reset io counters. */
static int recover_io_reset(struct raid_set *rs)
{
	unsigned long j = jiffies;

	/* Pay attention to jiffies overflows. */
	if (j > rs->recover.last_jiffies + HZ / 20 ||
	    j < rs->recover.last_jiffies) {
		atomic_set(rs->recover.io_count + IO_WORK, 0);
		atomic_set(rs->recover.io_count + IO_RECOVER, 0);
		rs->recover.last_jiffies = j;
		return 1;
	}

	return 0;
}

/* Count ios, distinguishing recovery and regular io. */
static void recover_io_count(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);

	recover_io_reset(rs);
	atomic_inc(rs->recover.io_count +
		   (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
}
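/*
 * Sketch of the resulting throttling window (the consumer of these
 * counters is outside this excerpt, so the comparison shown is an
 * assumption): the counters are zeroed at most every HZ/20 jiffies
 * (50 ms), and within such a window the daemon can compare
 *
 *	atomic_read(rs->recover.io_count + IO_RECOVER)
 *
 * against the regular IO_WORK count scaled by the bandwidth factor
 * (rs->recover.bandwidth_work) to decide whether more recovery io
 * may be submitted.
 */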
/* Try getting a stripe either from the hash or from the LRU list. */
static struct stripe *stripe_find(struct raid_set *rs,
				  struct raid_address *addr)
{
	int r;
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;

	/* Try stripe from hash. */
	stripe = stripe_lookup(sc, addr->key);
	if (stripe) {
		r = stripe_get(stripe);
		if (r)
			goto get_lock_failed;

		atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
	} else {
		/* Not in hash -> try to get an LRU stripe. */
		stripe = stripe_lru_pop(sc);
		if (stripe) {
			/*
			 * An LRU stripe may not be referenced
			 * and may never have ios pending!
			 */
			BUG_ON(stripe_ref(stripe));
			BUG_ON(stripe_io_ref(stripe));

			/* Remove from hash if on before reuse. */
			stripe_hash_del(stripe);

			/* Invalidate before reinserting with changed key. */
			stripe_invalidate(stripe);

			stripe->key = addr->key;
			stripe->region = dm_rh_sector_to_region(rs->recover.rh,
								addr->key);
			stripe->idx.parity = addr->pi;
			r = stripe_get(stripe);
			if (r)
				goto get_lock_failed;

			/* Insert stripe into the stripe hash. */
			stripe_insert(&sc->hash, stripe);
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_INSCACHE);
		}
	}

	return stripe;

get_lock_failed:
	stripe_put(stripe);
	return NULL;
}

/*
 * Process end ios.
 *
 * I need to do it here because I can't in interrupt
 * context.
 */
/* End io all bios on a bio list. */
static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
			   int p, int error)
{
	struct raid_set *rs = RS(stripe->sc);
	struct bio *bio;
	struct page_list *pl = PL(stripe, p);
	struct stripe_chunk *chunk = CHUNK(stripe, p);

	/* Update region counters. */
	while ((bio = bio_list_pop(bl))) {
		if (bio_data_dir(bio) == WRITE)
			/* Drop io pending count for any writes. */
			dm_rh_dec(rs->recover.rh, stripe->region);
		else if (!error)
			/* Copy data across. */
			bio_copy_page_list(READ, stripe, pl, bio);

		bio_endio(bio, error);

		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
			   S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));

		chunk_put(chunk);
		stripe_put(stripe);
		io_put(rs);	/* Wake any suspend waiters on last bio. */
	}
}
/*
 * End io all reads/writes on a stripe copying
 * read data across from stripe to bios and
 * decrementing region counters for writes.
 *
 * Processing of ios depending on state:
 *	o no chunk error -> endio ok
 *	o degraded:
 *	  - chunk error and read -> ignore to be requeued
 *	  - chunk error and write -> endio ok
 *	o dead (more than parity_devs failed) and chunk_error -> endio failed
 */
static void stripe_endio(int rw, struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;
	int write = (rw != READ);

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *bl;

		BUG_ON(ChunkLocked(chunk));

		bl = BL_CHUNK(chunk, rw);
		if (bio_list_empty(bl))
			continue;

		if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
			/* RAID set dead. */
			if (unlikely(RSDead(rs)))
				bio_list_endio(stripe, bl, p, -EIO);
			/* RAID set degraded. */
			else if (write)
				bio_list_endio(stripe, bl, p, 0);
		} else {
			BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
			bio_list_endio(stripe, bl, p, 0);
		}
	}
}
/* Fail all ios hanging off all bio lists of a stripe. */
static void stripe_fail_io(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		int i = ARRAY_SIZE(chunk->bl);

		/* Fail all bios on all bio lists of the stripe. */
		while (i--) {
			struct bio_list *bl = chunk->bl + i;

			if (!bio_list_empty(bl))
				bio_list_endio(stripe, bl, p, -EIO);
		}
	}

	/* Put stripe on LRU list. */
	BUG_ON(stripe_io_ref(stripe));
	BUG_ON(stripe_ref(stripe));
	stripe_lru_add(stripe);
}

/* Unlock all required chunks. */
static void stripe_chunks_unlock(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;
	struct stripe_chunk *chunk;

	while (p--) {
		chunk = CHUNK(stripe, p);

		if (TestClearChunkUnlock(chunk))
			ClearChunkLocked(chunk);
	}
}
/*
 * Queue reads and writes to a stripe by hanging
 * their bios off the stripe's read/write lists.
 */
static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
			    struct bio_list *reject)
{
	struct raid_address addr;
	struct stripe *stripe;

	stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
	if (stripe) {
		int r = 0, rw = bio_data_dir(bio);

		/* Distinguish reads and writes. */
		bio_list_add(BL(stripe, addr.di, rw), bio);

		if (rw == READ)
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_READ);
		else {
			/* Increment pending write count on region. */
			dm_rh_inc(rs->recover.rh, stripe->region);
			r = 1;

			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
		}

		/*
		 * Put on io (flush) list in case of
		 * initial bio queued to chunk.
		 */
		if (chunk_get(CHUNK(stripe, addr.di)) == 1)
			stripe_flush_add(stripe);

		return r;
	}

	/* Got no stripe from cache or failed to lock it -> reject bio. */
	bio_list_add(reject, bio);
	atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
	return 0;
}
/*
 * Handle all stripes by handing them to the daemon, because we can't
 * map their chunk pages to copy the data in interrupt context.
 *
 * We don't want to handle them here either, while interrupts are disabled.
 */

/* Read/write endio function for dm-io (interrupt context). */
static void endio(unsigned long error, void *context)
{
	struct stripe_chunk *chunk = context;

	if (unlikely(error)) {
		chunk_set(chunk, ERROR);
		/* REMOVEME: statistics. */
		atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
	} else
		chunk_set(chunk, CLEAN);

	/*
	 * For recovery stripes, I need to reset the locked flag
	 * here, because those aren't processed in do_endios().
	 */
	if (unlikely(StripeRecover(chunk->stripe)))
		ClearChunkLocked(chunk);
	else
		SetChunkUnlock(chunk);

	/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
	stripe_put_references(chunk->stripe);
}
2274 /* Read/Write a chunk asynchronously. */
2275 static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
2277 struct stripe_cache *sc = stripe->sc;
2278 struct raid_set *rs = RS(sc);
2279 struct dm_mem_cache_object *obj = stripe->obj + p;
2280 struct page_list *pl = obj->pl;
2281 struct stripe_chunk *chunk = CHUNK(stripe, p);
2282 struct raid_dev *dev = rs->dev + p;
2283 struct dm_io_region io = {
2284 .bdev = dev->dev->bdev,
2285 .sector = stripe->key,
2286 .count = stripe->io.size,
2288 struct dm_io_request control = {
2289 .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
2291 .type = DM_IO_PAGE_LIST,
2299 .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
2303 BUG_ON(ChunkLocked(chunk));
2304 BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
2305 BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
* Don't rw past the end of the device, which can happen
* because sectors_per_dev typically isn't divisible by io_size.
2311 if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2312 io.count = rs->set.sectors_per_dev - io.sector;
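/*
 * Worked example (numbers assumed): with sectors_per_dev = 1000 and
 * io.count = 64, the last stripe io starts at sector 960, would reach
 * sector 1024 and is clamped to 1000 - 960 = 40 sectors.
 */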
2315 io.sector += dev->start; /* Add <offset>. */
2317 recover_io_count(stripe); /* Recovery io accounting. */
2319 /* REMOVEME: statistics. */
2320 atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
2322 SetChunkLocked(chunk);
2323 SetDevIoQueued(dev);
2324 BUG_ON(dm_io(&control, 1, &io, NULL));
* Write dirty page lists or read non-uptodate ones for a stripe.
2330 static int stripe_chunks_rw(struct stripe *stripe)
2333 struct raid_set *rs = RS(stripe->sc);
* Increment the pending count on the stripe
* first, so that we don't race in endio().
*
* An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
*
* o not uptodate
* o dirtied by writes merged
* o dirtied by parity calculations
2345 r = for_each_io_dev(stripe, stripe_get_references);
2347 /* Io needed: chunks are either not uptodate or dirty. */
2348 int max; /* REMOVEME: */
2349 struct stripe_cache *sc = &rs->sc;
2351 /* Submit actual io. */
2352 for_each_io_dev(stripe, stripe_chunk_rw);
2354 /* REMOVEME: statistics */
2355 max = sc_active(sc);
2356 if (atomic_read(&sc->active_stripes_max) < max)
2357 atomic_set(&sc->active_stripes_max, max);
2359 atomic_inc(rs->stats + S_FLUSHS);
2360 /* END REMOVEME: statistics */
/* Merge in all writes, hence dirtying the respective chunks. */
2367 static void stripe_merge_writes(struct stripe *stripe)
2369 unsigned p = RS(stripe->sc)->set.raid_devs;
2372 struct stripe_chunk *chunk = CHUNK(stripe, p);
2373 struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
2375 if (!bio_list_empty(write)) {
2377 struct page_list *pl = stripe->obj[p].pl;
2380 * We can play with the lists without holding a lock,
2381 * because it is just us accessing them anyway.
2383 bio_list_for_each(bio, write)
2384 bio_copy_page_list(WRITE, stripe, pl, bio);
2386 bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
2387 bio_list_init(write);
2388 chunk_set(chunk, DIRTY);
2393 /* Queue all writes to get merged. */
2394 static int stripe_queue_writes(struct stripe *stripe)
2397 unsigned p = RS(stripe->sc)->set.raid_devs;
2400 struct stripe_chunk *chunk = CHUNK(stripe, p);
2401 struct bio_list *write = BL_CHUNK(chunk, WRITE);
2403 if (!bio_list_empty(write)) {
2404 bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
2405 bio_list_init(write);
/* Check if a chunk gets completely overwritten. */
2416 static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
2418 unsigned sectors = 0;
2420 struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
2422 bio_list_for_each(bio, bl)
2423 sectors += bio_sectors(bio);
2425 BUG_ON(sectors > RS(stripe->sc)->set.io_size);
2426 return sectors == RS(stripe->sc)->set.io_size;
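/*
 * Worked example (numbers assumed): with io_size = 64, two queued
 * 32-sector writes to a chunk sum up to 64 sectors, so the chunk
 * counts as completely overwritten and needs no prior read.
 */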
* Avoid io on a broken/reconstructed drive in order to
* reconstruct its data on endio.
2433 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
2434 * will trigger a reconstruct call before resetting it.
2436 static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
2438 struct stripe_chunk *chunk = CHUNK(stripe, pr);
2441 * Allow io on all chunks but the indexed one,
2442 * because we're either degraded or prohibit it
2443 * on the one for later reconstruction.
2445 /* Includes ClearChunkIo(), ClearChunkUptodate(). */
2446 stripe_chunk_invalidate(chunk);
2447 stripe->idx.recover = pr;
2448 SetStripeReconstruct(stripe);
2450 /* REMOVEME: statistics. */
2451 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2455 /* Chunk locked/uptodate and device failed tests. */
2456 static struct stripe_chunk *
2457 stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
2459 struct raid_set *rs = RS(stripe->sc);
2460 struct stripe_chunk *chunk = CHUNK(stripe, p);
2462 /* Can't access active chunks. */
2463 if (ChunkLocked(chunk)) {
2464 /* REMOVEME: statistics. */
2465 atomic_inc(rs->stats + S_CHUNK_LOCKED);
/* Can't access broken device. */
2470 if (ChunkError(chunk) || DevFailed(rs->dev + p))
2473 /* Can access uptodate chunks. */
2474 if (ChunkUptodate(chunk)) {
2475 (*chunks_uptodate)++;
* Degraded/reconstruction mode.
*
* Check stripe state to figure out which chunks don't need IO.
*
* Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
2489 static int stripe_check_reconstruct(struct stripe *stripe)
2491 struct raid_set *rs = RS(stripe->sc);
2494 ClearStripeReconstruct(stripe);
2495 ClearStripeReconstructed(stripe);
2496 stripe_allow_io(stripe);
/* Avoid setting up reconstruction again when it is already set. */
2501 if (StripeReconstruct(stripe)) {
2502 /* REMOVEME: statistics. */
2503 atomic_inc(rs->stats + S_RECONSTRUCT_SET);
2507 /* Initially allow io on all chunks. */
2508 stripe_allow_io(stripe);
2510 /* Return if stripe is already reconstructed. */
2511 if (StripeReconstructed(stripe)) {
2512 atomic_inc(rs->stats + S_RECONSTRUCTED);
2517 * Degraded/reconstruction mode (device failed) ->
2518 * avoid io on the failed device.
2520 if (unlikely(RSDegraded(rs))) {
2521 /* REMOVEME: statistics. */
2522 atomic_inc(rs->stats + S_DEGRADED);
2523 /* Allow IO on all devices but the dead one. */
2524 BUG_ON(rs->set.ei < 0);
2525 return stripe_chunk_set_io_flags(stripe, rs->set.ei);
2527 int sync, pi = dev_for_parity(stripe, &sync);
* Reconstruction mode (i.e. a particular (replaced) device or
2531 * some (rotating) parity chunk is being resynchronized) ->
2532 * o make sure all needed chunks are read in
2533 * o writes are allowed to go through
2536 /* REMOVEME: statistics. */
2537 atomic_inc(rs->stats + S_NOSYNC);
2538 /* Allow IO on all devs but the one to reconstruct. */
2539 return stripe_chunk_set_io_flags(stripe, pi);
* Check if the stripe is ready to merge writes,
* i.e. if all chunks needed are present to allow merging bios.
2550 * We prohibit io on:
2552 * o chunks without bios
2553 * o chunks which get completely written over
2555 static int stripe_merge_possible(struct stripe *stripe, int nosync)
2557 struct raid_set *rs = RS(stripe->sc);
2558 unsigned chunks_overwrite = 0, chunks_prohibited = 0,
2559 chunks_uptodate = 0, p = rs->set.raid_devs;
2561 /* Walk all chunks. */
2563 struct stripe_chunk *chunk;
2565 /* Prohibit io on broken devices. */
2566 if (DevFailed(rs->dev + p)) {
2567 chunk = CHUNK(stripe, p);
2571 /* We can't optimize any further if no chunk. */
2572 chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
2573 if (!chunk || nosync)
* We have a chunk which is not uptodate.
2579 * If this is not parity and we don't have
2580 * reads queued, we can optimize further.
2582 if (p != stripe->idx.parity &&
2583 bio_list_empty(BL_CHUNK(chunk, READ)) &&
2584 bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
2585 if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
2587 else if (RSCheckOverwrite(rs) &&
2588 stripe_check_chunk_overwrite(stripe, p))
2589 /* Completely overwritten chunk. */
2593 /* Allow io for chunks with bios and overwritten ones. */
2598 /* No io for broken devices or for chunks w/o bios. */
2599 ClearChunkIo(chunk);
2600 chunks_prohibited++;
2601 /* REMOVEME: statistics. */
2602 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2605 /* All data chunks will get written over. */
2606 if (chunks_overwrite == rs->set.data_devs)
2607 atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
2608 else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
2609 /* We don't have enough chunks to merge. */
2610 atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
2615 * If we have all chunks up to date or overwrite them, we
2616 * just zero the parity chunk and let stripe_rw() recreate it.
2618 if (chunks_uptodate == rs->set.raid_devs ||
2619 chunks_overwrite == rs->set.data_devs) {
2620 stripe_zero_chunk(stripe, stripe->idx.parity);
2621 BUG_ON(StripeReconstruct(stripe));
2622 SetStripeReconstruct(stripe); /* Enforce xor in caller. */
* With fewer chunks, we xor parity out.
2627 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
2628 * so that only chunks with queued or merged writes
2635 * We do have enough chunks to merge.
2636 * All chunks are uptodate or get written over.
2638 atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
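/*
 * For illustration only (not driver code): the parity algebra relied
 * on above and in stripe_rw(). A full-stripe write recomputes
 * P = D0 ^ D1 ^ ... ^ Dn-1 from scratch, whereas a partial update
 * xors a chunk out of valid parity and back in:
 * P' = P ^ D_old ^ D_new. The sketch assumes plain byte buffers;
 * the driver's common_xor()/parity_xor() work on page lists instead.
 */
#if 0
static void example_parity_full(u8 *p, u8 *const d[], int n, size_t len)
{
	size_t i;
	int j;

	memset(p, 0, len);		/* P starts as zero... */
	for (j = 0; j < n; j++)		/* ...and accumulates all chunks. */
		for (i = 0; i < len; i++)
			p[i] ^= d[j][i];
}

static void example_parity_rmw(u8 *p, const u8 *d_old,
			       const u8 *d_new, size_t len)
{
	size_t i;

	/* Fold the old data out of parity and the new data in. */
	for (i = 0; i < len; i++)
		p[i] ^= d_old[i] ^ d_new[i];
}
#endif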
* Avoid reading chunks when we're fully operational.
*
* We prohibit io on any chunks without bios, except the parity chunk.
2647 static void stripe_avoid_reads(struct stripe *stripe)
2649 struct raid_set *rs = RS(stripe->sc);
2650 unsigned dummy = 0, p = rs->set.raid_devs;
2652 /* Walk all chunks. */
2654 struct stripe_chunk *chunk =
2655 stripe_chunk_check(stripe, p, &dummy);
2660 /* If parity or any bios pending -> allow io. */
2661 if (chunk_ref(chunk) || p == stripe->idx.parity)
2664 ClearChunkIo(chunk);
2665 /* REMOVEME: statistics. */
2666 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2672 * Read/write a stripe.
2674 * All stripe read/write activity goes through this function
2675 * unless recovery, which has to call stripe_chunk_rw() directly.
2677 * Make sure we don't try already merged stripes in order
2678 * to avoid data corruption.
2680 * Check the state of the RAID set and if degraded (or
2681 * resynchronizing for reads), read in all other chunks but
2682 * the one on the dead/resynchronizing device in order to be
2683 * able to reconstruct the missing one in _do_endios().
2685 * Can be called on active stripes in order
2686 * to dispatch new io on inactive chunks.
2689 * o stripe to read and/or write
2690 * o stripe with error to reconstruct
2692 static void stripe_rw(struct stripe *stripe)
2695 struct raid_set *rs = RS(stripe->sc);
2698 * Check, if a chunk needs to be reconstructed
2699 * because of a degraded set or a region out of sync.
2701 nosync = stripe_check_reconstruct(stripe);
2704 return; /* Wait for stripe reconstruction to finish. */
2710 * If we don't have merged writes pending, we can schedule
2711 * queued writes to be merged next without corrupting data.
2713 if (!StripeMerged(stripe)) {
2714 r = stripe_queue_writes(stripe);
2716 /* Writes got queued -> flag RBW. */
2717 SetStripeRBW(stripe);
2721 * Merge all writes hanging off uptodate/overwritten
2722 * chunks of the stripe.
2724 if (StripeRBW(stripe)) {
2725 r = stripe_merge_possible(stripe, nosync);
2726 if (!r) { /* Merge possible. */
2727 struct stripe_chunk *chunk;
2730 * I rely on valid parity in order
2731 * to xor a fraction of chunks out
2732 * of parity and back in.
2734 stripe_merge_writes(stripe); /* Merge writes in. */
2735 parity_xor(stripe); /* Update parity. */
2736 ClearStripeReconstruct(stripe); /* Reset xor enforce. */
2737 SetStripeMerged(stripe); /* Writes merged. */
2738 ClearStripeRBW(stripe); /* Disable RBW. */
2741 * REMOVEME: sanity check on parity chunk
2742 * states after writes got merged.
2744 chunk = CHUNK(stripe, stripe->idx.parity);
2745 BUG_ON(ChunkLocked(chunk));
2746 BUG_ON(!ChunkUptodate(chunk));
2747 BUG_ON(!ChunkDirty(chunk));
2748 BUG_ON(!ChunkIo(chunk));
2750 } else if (!nosync && !StripeMerged(stripe))
2751 /* Read avoidance if not degraded/resynchronizing/merged. */
2752 stripe_avoid_reads(stripe);
2755 /* Now submit any reads/writes for non-uptodate or dirty chunks. */
2756 r = stripe_chunks_rw(stripe);
* No io was submitted because of prohibited chunk io,
* locked chunks or failed devices
* -> push to end io list for processing.
2763 stripe_endio_push(stripe);
2764 atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
2769 * Recovery functions
2771 /* Read a stripe off a raid set for recovery. */
2772 static int stripe_recover_read(struct stripe *stripe, int pi)
2774 BUG_ON(stripe_io_ref(stripe));
2776 /* Invalidate all chunks so that they get read in. */
2777 stripe_chunks_invalidate(stripe);
2778 stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
* If we are reconstructing a particular device, we can avoid
2782 * reading the respective chunk in, because we're going to
2783 * reconstruct it anyway.
2785 * We can't do that for resynchronization of rotating parity,
2786 * because the recovery stripe chunk size is typically larger
* than the set's chunk size.
2790 ClearChunkIo(CHUNK(stripe, pi));
2792 return stripe_chunks_rw(stripe);
2795 /* Write a stripe to a raid set for recovery. */
2796 static int stripe_recover_write(struct stripe *stripe, int pi)
2798 BUG_ON(stripe_io_ref(stripe));
2801 * If this is a reconstruct of a particular device, then
2802 * reconstruct the respective chunk, else create parity chunk.
2805 stripe_zero_chunk(stripe, pi);
2806 common_xor(stripe, stripe->io.size, 0, pi);
2807 chunk_set(CHUNK(stripe, pi), DIRTY);
2811 return stripe_chunks_rw(stripe);
2814 /* Read/write a recovery stripe. */
2815 static int stripe_recover_rw(struct stripe *stripe)
2817 int r = 0, sync = 0;
2819 /* Read/write flip-flop. */
2820 if (TestClearStripeRBW(stripe)) {
2821 SetStripeMerged(stripe);
2822 stripe->key = stripe->recover->pos;
2823 r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
2825 } else if (TestClearStripeMerged(stripe)) {
2826 r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
/* Recovery bandwidth available? */
2835 static int recover_bandwidth(struct raid_set *rs)
2839 /* On reset or when bios delayed -> allow recovery. */
2840 r = recover_io_reset(rs);
2841 if (r || RSBandwidth(rs))
2844 work = atomic_read(rs->recover.io_count + IO_WORK);
2846 /* Pay attention to larger recover stripe size. */
2847 int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
2848 rs->recover.io_size / rs->set.io_size;
* Don't use more than the given bandwidth
2852 * of the work io for recovery.
2854 if (recover > work / rs->recover.bandwidth_work) {
2855 /* REMOVEME: statistics. */
2856 atomic_inc(rs->stats + S_NO_BANDWIDTH);
2862 atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
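/*
 * Worked example (numbers assumed): with bandwidth = 10 (%),
 * bandwidth_work = 100 / 10 = 10, so recovery io (scaled to work io
 * units above) gets throttled as soon as it exceeds one tenth of the
 * application work io count.
 */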
2866 /* Try to get a region to recover. */
2867 static int stripe_recover_get_region(struct stripe *stripe)
2869 struct raid_set *rs = RS(stripe->sc);
2870 struct recover *rec = &rs->recover;
2871 struct recover_addr *addr = stripe->recover;
2872 struct dm_dirty_log *dl = rec->dl;
2873 struct dm_rh_client *rh = rec->rh;
/* Return that we have a region, so it gets finished first during suspension. */
2885 if (dl->type->get_sync_count(dl) >= rec->nr_regions)
2888 /* If we don't have enough bandwidth, we don't proceed recovering. */
2889 if (!recover_bandwidth(rs))
2892 /* Start quiescing a region. */
2893 dm_rh_recovery_prepare(rh);
2894 addr->reg = dm_rh_recovery_start(rh);
2898 addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
2899 addr->end = addr->pos + dm_rh_get_region_size(rh);
2902 * Take one global io reference out for the
2903 * whole region, which is going to be released
2904 * when the region is completely done with.
2910 /* Update region hash state. */
2911 enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
2912 static void recover_rh_update(struct stripe *stripe, enum recover_type success)
2914 struct recover_addr *addr = stripe->recover;
2915 struct raid_set *rs = RS(stripe->sc);
2916 struct recover *rec = &rs->recover;
DMERR("%s - called w/o region", __func__);
2923 dm_rh_recovery_end(addr->reg, success);
2925 rec->nr_regions_recovered++;
2930 * Completely done with this region ->
2931 * release the 1st io reference.
2936 /* Set start of recovery state. */
2937 static void set_start_recovery(struct raid_set *rs)
2939 /* Initialize recovery. */
2940 rs->recover.start_jiffies = jiffies;
2941 rs->recover.end_jiffies = 0;
2944 /* Set end of recovery state. */
2945 static void set_end_recovery(struct raid_set *rs)
2948 rs->set.dev_to_init = -1;
2950 /* Check for jiffies overrun. */
2951 rs->recover.end_jiffies = jiffies;
2952 if (rs->recover.end_jiffies < rs->recover.start_jiffies)
2953 rs->recover.end_jiffies = ~0;
2956 /* Handle recovery on one recovery stripe. */
2957 static int _do_recovery(struct stripe *stripe)
2960 struct raid_set *rs = RS(stripe->sc);
2961 struct recover_addr *addr = stripe->recover;
2963 /* If recovery is active -> return. */
2964 if (stripe_io_ref(stripe))
2967 /* IO error is fatal for recovery -> stop it. */
2968 if (unlikely(StripeError(stripe)))
2971 /* Recovery end required. */
2975 /* Get a region to recover. */
2976 r = stripe_recover_get_region(stripe);
2978 case 0: /* Got a new region: flag initial read before write. */
2979 SetStripeRBW(stripe);
2980 case 1: /* Have a region in the works. */
2983 /* No bandwidth/quiesced region yet, try later. */
2985 wake_do_raid_delayed(rs, HZ / 4);
2989 case -ENOENT: /* No more regions to recover. */
2990 schedule_work(&rs->io.ws_do_table_event);
2996 /* Read/write a recover stripe. */
2997 r = stripe_recover_rw(stripe);
3002 /* Read and write finished-> update recovery position within region. */
3003 addr->pos += stripe->io.size;
3005 /* If we're at end of region, update region hash. */
3006 if (addr->pos >= addr->end ||
3007 addr->pos >= rs->set.sectors_per_dev)
3008 recover_rh_update(stripe, REC_SUCCESS);
3010 /* Prepare to read next region segment. */
3011 SetStripeRBW(stripe);
3013 /* Schedule myself for another round... */
3018 /* FIXME: rather try recovering other regions on error? */
3019 rs_check_degrade(stripe);
3020 recover_rh_update(stripe, REC_FAILURE);
3022 /* Check state of partially recovered array. */
3023 if (RSDegraded(rs) && !RSDead(rs) &&
3024 rs->set.dev_to_init != -1 &&
3025 rs->set.ei != rs->set.dev_to_init)
3026 /* Broken drive != drive to recover -> FATAL. */
3029 if (StripeError(stripe)) {
3030 char buf[BDEVNAME_SIZE];
3032 DMERR("stopping recovery due to "
3033 "ERROR on /dev/%s, stripe at offset %llu",
3034 bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3035 (unsigned long long) stripe->key);
/* Make sure that all quiesced regions get released. */
3041 dm_rh_recovery_end(addr->reg, -EIO);
3042 addr->reg = dm_rh_recovery_start(rs->recover.rh);
3048 /* Called by main io daemon to recover regions. */
3049 static void do_recovery(struct raid_set *rs)
3051 if (RSRecover(rs)) {
3053 struct stripe *stripe;
3055 list_for_each_entry(stripe, &rs->recover.stripes,
3056 lists[LIST_RECOVER])
3057 r += _do_recovery(stripe);
3060 set_end_recovery(rs);
3061 stripe_recover_free(rs);
3067 * END recovery functions
3070 /* End io process all stripes handed in by endio() callback. */
3071 static void _do_endios(struct raid_set *rs, struct stripe *stripe,
3072 struct list_head *flush_list)
3074 /* First unlock all required chunks. */
3075 stripe_chunks_unlock(stripe);
* If an io error on a stripe occurred, degrade the RAID set
3079 * and try to endio as many bios as possible. If any bios can't
3080 * be endio processed, requeue the stripe (stripe_ref() != 0).
3082 if (TestClearStripeError(stripe)) {
3084 * FIXME: if read, rewrite the failed chunk after reconstruction
3085 * in order to trigger disk bad sector relocation.
3087 rs_check_degrade(stripe); /* Resets ChunkError(). */
3088 ClearStripeReconstruct(stripe);
3089 ClearStripeReconstructed(stripe);
3092 /* Got to reconstruct a missing chunk. */
3093 if (StripeReconstruct(stripe)) {
3095 * (*2*) We use StripeReconstruct() to allow for
3096 * all chunks to be xored into the reconstructed
3097 * one (see chunk_must_xor()).
3099 stripe_reconstruct(stripe);
3102 * (*3*) Now we reset StripeReconstruct() and flag
3103 * StripeReconstructed() to show to stripe_rw(),
3104 * that we have reconstructed a missing chunk.
3106 ClearStripeReconstruct(stripe);
3107 SetStripeReconstructed(stripe);
3109 /* FIXME: reschedule to be written in case of read. */
3110 // if (!StripeRBW(stripe)) {
3111 // chunk_set(CHUNK(stripe, pr), DIRTY);
3112 // stripe_chunks_rw(stripe);
3117 * Now that we eventually got a complete stripe, we
3118 * can process the rest of the end ios on reads.
3120 stripe_endio(READ, stripe);
3122 /* End io all merged writes. */
3123 if (TestClearStripeMerged(stripe))
3124 stripe_endio(WRITE_MERGED, stripe);
3126 /* If RAID set is dead -> fail any ios to dead drives. */
3128 DMERR_LIMIT("RAID set dead: failing ios to dead devices");
3129 stripe_fail_io(stripe);
* We still have stripe references,
* because of reads before writes or IO errors ->
3135 * got to put on flush list for processing.
3137 if (stripe_ref(stripe)) {
3138 BUG_ON(!list_empty(stripe->lists + LIST_LRU));
3139 list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
3140 atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
3142 stripe_lru_add(stripe);
/* Pop any endio stripes off the endio list and belabour them. */
3146 static void do_endios(struct raid_set *rs)
3148 struct stripe_cache *sc = &rs->sc;
3149 struct stripe *stripe;
3150 /* IO flush list for sorted requeued stripes. */
3151 struct list_head flush_list;
3153 INIT_LIST_HEAD(&flush_list);
3155 while ((stripe = stripe_endio_pop(sc))) {
3156 /* Avoid endio on stripes with newly io'ed chunks. */
3157 if (!stripe_io_ref(stripe))
3158 _do_endios(rs, stripe, &flush_list);
3162 * Insert any requeued stripes in the proper
3163 * order at the beginning of the io (flush) list.
3165 list_splice(&flush_list, sc->lists + LIST_FLUSH);
3168 /* Flush any stripes on the io list. */
3169 static void do_flush(struct raid_set *rs)
3171 struct stripe *stripe;
3173 while ((stripe = stripe_io_pop(&rs->sc)))
3174 stripe_rw(stripe); /* Read/write stripe. */
3177 /* Stripe cache resizing. */
3178 static void do_sc_resize(struct raid_set *rs)
3180 unsigned set = atomic_read(&rs->sc.stripes_to_set);
3183 unsigned cur = atomic_read(&rs->sc.stripes);
3184 int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
3185 sc_shrink(&rs->sc, cur - set);
/* Flag end of resizing if ok. */
3189 atomic_set(&rs->sc.stripes_to_set, 0);
3196 * We do different things with the io depending
3197 * on the state of the region that it is in:
* o reads: hang off stripe cache or postpone if full
*
* o writes:
*
*   CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3204 * In case stripe cache is full or busy, postpone the io.
3206 * RECOVERING: delay the io until recovery of the region completes.
3209 static void do_ios(struct raid_set *rs, struct bio_list *ios)
3212 unsigned flush = 0, delay = 0;
3214 struct dm_rh_client *rh = rs->recover.rh;
3216 struct bio_list reject;
3218 bio_list_init(&reject);
3222 * o delay writes to recovering regions (let reads go through)
3223 * o queue io to all other regions
3225 while ((bio = bio_list_pop(ios))) {
3227 * In case we get a barrier bio, push it back onto
3228 * the input queue unless all work queues are empty
3229 * and the stripe cache is inactive.
3231 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
3232 /* REMOVEME: statistics. */
3233 atomic_inc(rs->stats + S_BARRIER);
3235 !list_empty(rs->sc.lists + LIST_FLUSH) ||
3236 !bio_list_empty(&reject) ||
3237 sc_active(&rs->sc)) {
3238 bio_list_push(ios, bio);
3243 /* Check for recovering regions. */
3244 sector = _sector(rs, bio);
3245 r = region_state(rs, sector, DM_RH_RECOVERING);
3246 if (unlikely(r && bio_data_dir(bio) == WRITE)) {
/* Delay writes to recovering regions. */
3249 dm_rh_delay_by_region(rh, bio,
3250 dm_rh_sector_to_region(rh,
3252 /* REMOVEME: statistics.*/
3253 atomic_inc(rs->stats + S_DELAYED_BIOS);
3254 atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3256 /* Force bandwidth tests in recovery. */
* Process ios to non-recovering regions by queueing
* them to stripes (does dm_rh_inc() for writes).
3263 flush += stripe_queue_bio(rs, bio, &reject);
3268 /* FIXME: better error handling. */
3269 r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3271 DMERR_LIMIT("dirty log flush");
3274 /* Merge any rejected bios back to the head of the input list. */
3275 bio_list_merge_head(ios, &reject);
/* Unplug: let any queued io roll on the set's devices. */
3279 static void do_unplug(struct raid_set *rs)
3281 struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3283 while (dev-- > rs->dev) {
3284 /* Only call any device unplug function, if io got queued. */
3285 if (TestClearDevIoQueued(dev))
3286 blk_unplug(bdev_get_queue(dev->dev->bdev));
3290 /* Send an event in case we're getting too busy. */
3291 static void do_busy_event(struct raid_set *rs)
3294 if (!TestSetRSScBusy(rs))
3295 schedule_work(&rs->io.ws_do_table_event);
3301 /* Throw an event. */
3302 static void do_table_event(struct work_struct *ws)
3304 struct raid_set *rs = container_of(ws, struct raid_set,
3305 io.ws_do_table_event);
3306 dm_table_event(rs->ti->table);
/*-----------------------------------------------------------------
 * RAID daemon
 *---------------------------------------------------------------*/
3314 * o belabour all end ios
3315 * o update the region hash states
3316 * o optionally shrink the stripe cache
3317 * o optionally do recovery
3318 * o unplug any component raid devices with queued bios
3319 * o grab the input queue
* o work on all requeued or new ios and perform stripe cache flushes
3321 * o unplug any component raid devices with queued bios
* o check if the stripe cache gets too busy and throw an event if so
3324 static void do_raid(struct work_struct *ws)
3326 struct raid_set *rs = container_of(ws, struct raid_set,
3327 io.dws_do_raid.work);
3328 struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3331 * We always need to end io, so that ios can get errored in
3332 * case the set failed and the region counters get decremented
3333 * before we update region hash states and go any further.
3336 dm_rh_update_states(rs->recover.rh, 1);
3339 * Now that we've end io'd, which may have put stripes on the LRU list
3340 * to allow for shrinking, we resize the stripe cache if requested.
3344 /* Try to recover regions. */
3346 do_unplug(rs); /* Unplug the sets device queues. */
3348 /* Quickly grab all new ios queued and add them to the work list. */
3349 mutex_lock(&rs->io.in_lock);
3350 bio_list_merge(ios, ios_in);
3351 bio_list_init(ios_in);
3352 mutex_unlock(&rs->io.in_lock);
3354 if (!bio_list_empty(ios))
3355 do_ios(rs, ios); /* Got ios to work into the cache. */
3357 do_flush(rs); /* Flush any stripes on io list. */
3358 do_unplug(rs); /* Unplug the sets device queues. */
3359 do_busy_event(rs); /* Check if we got too busy. */
3363 * Callback for region hash to dispatch
3364 * delayed bios queued to recovered regions
3365 * (gets called via dm_rh_update_states()).
3367 static void dispatch_delayed_bios(void *context, struct bio_list *bl)
3369 struct raid_set *rs = context;
3372 /* REMOVEME: statistics; decrement pending delayed bios counter. */
3373 bio_list_for_each(bio, bl)
3374 atomic_dec(rs->stats + S_DELAYED_BIOS);
3376 /* Merge region hash private list to work list. */
3377 bio_list_merge_head(&rs->io.work, bl);
3379 ClearRSBandwidth(rs);
3382 /*************************************************************
3383 * Constructor helpers
3384 *************************************************************/
3385 /* Calculate MB/sec. */
3386 static unsigned mbpers(struct raid_set *rs, unsigned speed)
3388 return to_bytes(speed * rs->set.data_devs *
3389 rs->recover.io_size * HZ >> 10) >> 10;
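/*
 * Worked example (numbers assumed): speed = 5 xors/tick, HZ = 250,
 * data_devs = 3 and recover.io_size = 256 sectors:
 * 5 * 3 * 256 * 250 = 960000; >> 10 -> 937; to_bytes() -> 479744;
 * >> 10 -> roughly 468 MB/s.
 */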
3393 * Discover fastest xor algorithm and # of chunks combination.
3395 /* Calculate speed for algorithm and # of chunks. */
3396 static unsigned xor_speed(struct stripe *stripe)
3401 /* Wait for next tick. */
3402 for (j = jiffies; j == jiffies; )
3405 /* Do xors for a full tick. */
3406 for (j = jiffies; j == jiffies; ) {
3408 common_xor(stripe, stripe->io.size, 0, 0);
3416 /* Optimize xor algorithm for this RAID set. */
3417 static unsigned xor_optimize(struct raid_set *rs)
3419 unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
3420 struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3421 struct stripe *stripe;
3423 BUG_ON(list_empty(&rs->recover.stripes));
3424 stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3425 lists[LIST_RECOVER]);
3427 /* Must set uptodate so that xor() will belabour chunks. */
3429 SetChunkUptodate(CHUNK(stripe, p));
3431 /* Try all xor functions. */
3432 while (f-- > xor_funcs) {
3435 /* Set actual xor function for common_xor(). */
3437 rs->xor.chunks = (f->f == xor_blocks_wrapper ?
3438 (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;
3440 while (rs->xor.chunks-- > 2) {
3441 speed = xor_speed(stripe);
3442 if (speed > speed_max) {
3444 chunks_max = rs->xor.chunks;
3450 /* Memorize optimum parameters. */
3452 rs->xor.chunks = chunks_max;
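/*
 * For illustration only (not driver code): the tick calibration
 * pattern xor_speed() uses above, as a minimal sketch. Busy-wait for
 * a jiffies edge, then count how many operations complete before the
 * next edge, so every candidate algorithm gets the same fixed time
 * slice. do_one_xor() is a hypothetical stand-in for common_xor().
 */
#if 0
static unsigned example_ops_per_tick(void)
{
	unsigned ops = 0;
	unsigned long j;

	/* Wait for the next tick edge. */
	for (j = jiffies; j == jiffies; )
		;

	/* Count work done within one full tick. */
	for (j = jiffies; j == jiffies; ops++)
		do_one_xor();

	return ops;
}
#endif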
3457 * Allocate a RAID context (a RAID set)
3459 /* Structure for variable RAID parameters. */
3460 struct variable_parms {
3464 int chunk_size_parm;
3469 int recover_io_size;
3470 int recover_io_size_parm;
3473 int recovery_stripes;
3474 int recovery_stripes_parm;
3477 static struct raid_set *
3478 context_alloc(struct raid_type *raid_type, struct variable_parms *p,
3479 unsigned raid_devs, sector_t sectors_per_dev,
3480 struct dm_target *ti, unsigned dl_parms, char **argv)
3484 sector_t region_size, ti_len;
3485 struct raid_set *rs = NULL;
3486 struct dm_dirty_log *dl;
3487 struct recover *rec;
3490 * Create the dirty log
3492 * We need to change length for the dirty log constructor,
* because we want a number of regions for all stripes derived
* from the single device size, so that we can keep region
* size = 2^^n independent of the number of devices
3498 ti->len = sectors_per_dev;
3499 dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
/* Chunk size *must not* be larger than region size. */
3505 region_size = dl->type->get_region_size(dl);
3506 if (p->chunk_size > region_size)
3507 goto bad_chunk_size;
/* Recover io size *must not* be larger than region size either. */
3510 if (p->recover_io_size > region_size)
3511 goto bad_recover_io_size;
3513 /* Size and allocate the RAID set structure. */
3514 len = sizeof(*rs->data) + sizeof(*rs->dev);
3515 if (dm_array_too_big(sizeof(*rs), len, raid_devs))
3518 len = sizeof(*rs) + raid_devs * len;
3519 rs = kzalloc(len, GFP_KERNEL);
3524 atomic_set(&rs->io.in_process, 0);
3525 atomic_set(&rs->io.in_process_max, 0);
3526 rec->io_size = p->recover_io_size;
3528 /* Pointer to data array. */
3529 rs->data = (unsigned long **)
3530 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
3532 rs->set.raid_devs = raid_devs;
3533 rs->set.data_devs = raid_devs - raid_type->parity_devs;
3534 rs->set.raid_type = raid_type;
3536 rs->set.raid_parms = p->raid_parms;
3537 rs->set.chunk_size_parm = p->chunk_size_parm;
3538 rs->set.io_size_parm = p->io_size_parm;
3539 rs->sc.stripes_parm = p->stripes_parm;
3540 rec->io_size_parm = p->recover_io_size_parm;
3541 rec->bandwidth_parm = p->bandwidth_parm;
3542 rec->recovery = p->recovery;
3543 rec->recovery_stripes = p->recovery_stripes;
3546 * Set chunk and io size and respective shifts
3547 * (used to avoid divisions)
3549 rs->set.chunk_size = p->chunk_size;
3550 rs->set.chunk_shift = ffs(p->chunk_size) - 1;
3552 rs->set.io_size = p->io_size;
3553 rs->set.io_mask = p->io_size - 1;
3554 /* Mask to adjust address key in case io_size != chunk_size. */
3555 rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
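/*
 * Worked example (sizes assumed): chunk_size = 64 and io_size = 16
 * yield io_mask = 0xf and io_inv_mask = 0x30; the latter's bits pick
 * one of the four 16-sector ios within a 64-sector chunk.
 */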
3557 rs->set.sectors_per_dev = sectors_per_dev;
3559 rs->set.ei = -1; /* Indicate no failed device. */
3560 atomic_set(&rs->set.failed_devs, 0);
3564 atomic_set(rec->io_count + IO_WORK, 0);
3565 atomic_set(rec->io_count + IO_RECOVER, 0);
3567 /* Initialize io lock and queues. */
3568 mutex_init(&rs->io.in_lock);
3569 bio_list_init(&rs->io.in);
3570 bio_list_init(&rs->io.work);
3572 init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
3574 rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
3575 rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
3576 wake_dummy, wake_do_raid, 0, p->recovery_stripes,
3577 dl, region_size, rec->nr_regions);
3578 if (IS_ERR(rec->rh))
3581 /* Initialize stripe cache. */
3582 r = sc_init(rs, p->stripes);
3586 /* REMOVEME: statistics. */
ClearRSDevelStats(rs); /* Disable development status. */
3592 TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
3595 dm_dirty_log_destroy(dl);
3596 TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
3598 bad_recover_io_size:
3599 dm_dirty_log_destroy(dl);
3600 TI_ERR_RET("Recover stripe io size larger than region size",
3604 dm_dirty_log_destroy(dl);
TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
3608 dm_dirty_log_destroy(dl);
3609 TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
3612 dm_dirty_log_destroy(dl);
3613 ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
3617 dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
3619 ti->error = DM_MSG_PREFIX "Error creating stripe cache";
3622 return ERR_PTR(-ENOMEM);
3625 /* Free a RAID context (a RAID set). */
3626 static void context_free(struct raid_set *rs, unsigned p)
3629 dm_put_device(rs->ti, rs->dev[p].dev);
3632 dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
3636 /* Create work queue and initialize delayed work. */
3637 static int rs_workqueue_init(struct raid_set *rs)
3639 struct dm_target *ti = rs->ti;
3641 rs->io.wq = create_singlethread_workqueue(DAEMON);
3643 TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
3645 INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
3646 INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
3650 /* Return pointer to raid_type structure for raid name. */
3651 static struct raid_type *get_raid_type(char *name)
3653 struct raid_type *r = ARRAY_END(raid_types);
3655 while (r-- > raid_types) {
3656 if (!strcmp(r->name, name))
3663 /* FIXME: factor out to dm core. */
3664 static int multiple(sector_t a, sector_t b, sector_t *n)
3673 /* Log RAID set information to kernel log. */
3674 static void rs_log(struct raid_set *rs, unsigned speed)
3677 char buf[BDEVNAME_SIZE];
3679 for (p = 0; p < rs->set.raid_devs; p++)
3680 DMINFO("/dev/%s is raid disk %u%s",
3681 bdevname(rs->dev[p].dev->bdev, buf), p,
3682 (p == rs->set.pi) ? " (parity)" : "");
3684 DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
3685 "algorithm \"%s\", %u chunks with %uMB/s\n"
3686 "%s set with net %u/%u devices",
3687 rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
3688 atomic_read(&rs->sc.stripes),
3689 rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed),
3690 rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
3693 /* Get all devices and offsets. */
3694 static int dev_parms(struct raid_set *rs, char **argv, int *p)
3696 struct dm_target *ti = rs->ti;
3698 for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
3700 unsigned long long tmp;
3701 struct raid_dev *dev = rs->dev + *p;
3703 /* Get offset and device. */
3704 if (sscanf(argv[1], "%llu", &tmp) != 1 ||
3705 tmp > rs->set.sectors_per_dev)
3706 TI_ERR("Invalid RAID device offset parameter");
3709 r = dm_get_device(ti, *argv, dm_table_get_mode(ti->table), &dev->dev);
3711 TI_ERR_RET("RAID device lookup failure", r);
3713 r = raid_dev_lookup(rs, dev);
3714 if (r != -ENODEV && r < *p) {
3715 (*p)++; /* Ensure dm_put_device() on actual device. */
3716 TI_ERR_RET("Duplicate RAID device", -ENXIO);
3723 /* Set recovery bandwidth. */
3725 recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
3727 rs->recover.bandwidth = bandwidth;
3728 rs->recover.bandwidth_work = 100 / bandwidth;
3731 /* Handle variable number of RAID parameters. */
3732 static int get_raid_variable_parms(struct dm_target *ti, char **argv,
3733 struct variable_parms *vp)
int action; /* -1: skip, 0: no power2 check, 1: power2 check */
3740 int *var, *var2, *var3;
3743 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
3744 IO_SIZE_MIN, CHUNK_SIZE_MAX,
3745 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
3747 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
3748 STRIPES_MIN, STRIPES_MAX,
3749 &vp->stripes_parm, &vp->stripes, NULL },
3751 "Invalid io size; must -1 or >= 8, 2^^n and less equal "
3752 "min(BIO_MAX_SECTORS/2, chunk size)",
3753 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
3754 &vp->io_size_parm, &vp->io_size, NULL },
3756 "Invalid recovery io size; must be -1 or "
3757 "2^^n and less equal BIO_MAX_SECTORS/2",
3758 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
3759 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
3761 "Invalid recovery bandwidth percentage; "
3762 "must be -1 or > 0 and <= 100",
3763 BANDWIDTH_MIN, BANDWIDTH_MAX,
3764 &vp->bandwidth_parm, &vp->bandwidth, NULL },
/* Handle sync argument separately in loop. */
3767 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
3769 "Invalid number of recovery stripes;"
3770 "must be -1, > 0 and <= 16384",
3771 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
3772 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
3775 /* Fetch # of variable raid parameters. */
3776 if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
3777 !range_ok(vp->raid_parms, 0, 7))
3778 TI_ERR("Bad variable raid parameters number");
3780 /* Preset variable RAID parameters. */
3781 vp->chunk_size = CHUNK_SIZE_DEFAULT;
3782 vp->io_size = IO_SIZE_DEFAULT;
3783 vp->stripes = STRIPES_DEFAULT;
3784 vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
3785 vp->bandwidth = BANDWIDTH_DEFAULT;
3787 vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
3789 /* Walk the array of argument constraints for all given ones. */
3790 for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
3791 BUG_ON(varp >= ARRAY_END(argctr));
3793 /* Special case for "[no]sync" string argument. */
3794 if (varp->action < 0) {
3795 if (!strcmp(*argv, "sync"))
3797 else if (!strcmp(*argv, "nosync"))
3800 TI_ERR(varp->errmsg);
3807 * Special case for io_size depending
3808 * on previously set chunk size.
3811 varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
3813 if (sscanf(*(argv++), "%d", &value) != 1 ||
3815 ((varp->action && !POWER_OF_2(value)) ||
3816 !range_ok(value, varp->min, varp->max))))
3817 TI_ERR(varp->errmsg);
3822 *varp->var2 = value;
3824 *varp->var3 = value;
3831 /* Parse optional locking parameters. */
3832 static int get_raid_locking_parms(struct dm_target *ti, char **argv,
3834 struct dm_raid45_locking_type **locking_type)
3836 if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
3837 char *lckstr = argv[1];
3838 size_t lcksz = strlen(lckstr);
3840 if (!strnicmp(lckstr, "none", lcksz)) {
3841 *locking_type = &locking_none;
3843 } else if (!strnicmp(lckstr, "cluster", lcksz)) {
3844 DMERR("locking type \"%s\" not yet implemented",
3848 DMERR("unknown locking type \"%s\"", lckstr);
3854 *locking_type = &locking_none;
3858 /* Set backing device read ahead properties of RAID set. */
3859 static void rs_set_read_ahead(struct raid_set *rs,
3860 unsigned sectors, unsigned stripes)
3862 unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
3863 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3864 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3866 /* Set read-ahead for the RAID set and the component devices. */
3868 unsigned p = rs->set.raid_devs;
3870 bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
3873 struct request_queue *q =
3874 bdev_get_queue(rs->dev[p].dev->bdev);
3876 q->backing_dev_info.ra_pages = ra_pages;
3883 /* Set congested function. */
3884 static void rs_set_congested_fn(struct raid_set *rs)
3886 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3887 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3889 /* Set congested function and data. */
3890 bdi->congested_fn = rs_congested;
3891 bdi->congested_data = rs;
3896 * Construct a RAID4/5 mapping:
3898 * log_type #log_params <log_params> \
3899 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
3900 * [locking "none"/"cluster"]
3901 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
3903 * log_type = "core"/"disk",
3904 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
3905 * log_params = [dirty_log_path] region_size [[no]sync])
3907 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
3909 * #parity_dev = N if raid_type = "raid4"
3910 * o N = -1: pick default = last device
3911 * o N >= 0 and < #raid_devs: parity device index
3913 * #raid_variable_params = 0-7; raid_params (-1 = default):
3914 * [chunk_size [#stripes [io_size [recover_io_size \
3915 * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
* o chunk_size (unit to calculate drive addresses; must be 2^^n, >= 8
3917 * and <= CHUNK_SIZE_MAX)
3918 * o #stripes is number of stripes allocated to stripe cache
*   (must be >= 8 and <= STRIPES_MAX)
* o io_size (io unit size per device in sectors; must be 2^^n and >= 8)
3921 * o recover_io_size (io unit size per device for recovery in sectors;
*   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
* o %recovery_bandwidth is the maximum amount spent on recovery during
3924 * application io (1-100%)
3925 * o recovery switch = [sync|nosync]
3926 * o #recovery_stripes is the number of recovery stripes used for
3927 * parallel recovery of the RAID set
3928 * If raid_variable_params = 0, defaults will be used.
3929 * Any raid_variable_param can be set to -1 to apply a default
3931 * #raid_devs = N (N >= 3)
3933 * #dev_to_initialize = N
3934 * -1: initialize parity on all devices
3935 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
* of a failed device's content after replacement
3938 * <dev_path> = device_path (eg, /dev/sdd1)
3939 * <offset> = begin at offset on <dev_path>
3942 #define MIN_PARMS 13
3943 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
3945 int dev_to_init, dl_parms, i, locking_parms,
3946 parity_parm, pi = -1, r, raid_devs;
3948 sector_t tmp, sectors_per_dev;
3949 struct dm_raid45_locking_type *locking;
3950 struct raid_set *rs;
3951 struct raid_type *raid_type;
3952 struct variable_parms parms;
3954 /* Ensure minimum number of parameters. */
3955 if (argc < MIN_PARMS)
3956 TI_ERR("Not enough parameters");
3958 /* Fetch # of dirty log parameters. */
3959 if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
3960 !range_ok(dl_parms, 1, 4711)) /* ;-) */
3961 TI_ERR("Bad dirty log parameters number");
3963 /* Check raid_type. */
3964 raid_type = get_raid_type(argv[dl_parms + 2]);
3966 TI_ERR("Bad raid type");
3968 /* In case of RAID4, parity drive is selectable. */
3969 parity_parm = !!(raid_type->level == raid4);
3971 /* Handle variable number of RAID parameters. */
3972 r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
3977 /* Handle any locking parameters. */
3978 r = get_raid_locking_parms(ti,
3979 argv + dl_parms + parity_parm +
3980 parms.raid_parms + 4,
3981 &locking_parms, &locking);
3985 /* # of raid devices. */
3986 i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
3987 if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
3988 raid_devs < raid_type->minimal_devs)
3989 TI_ERR("Invalid number of raid devices");
3991 /* In case of RAID4, check parity drive index is in limits. */
3992 if (raid_type->level == raid4) {
3993 /* Fetch index of parity device. */
3994 if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
3995 (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
3996 TI_ERR("Invalid RAID4 parity device index");
4000 * Index of device to initialize starts at 0
4002 * o -1 -> don't initialize a selected device;
4003 * initialize parity conforming to algorithm
4004 * o 0..raid_devs-1 -> initialize respective device
4005 * (used for reconstruction of a replaced device)
4007 if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
4008 locking_parms + 5], "%d", &dev_to_init) != 1 ||
4009 !range_ok(dev_to_init, -1, raid_devs - 1))
4010 TI_ERR("Invalid number for raid device to initialize");
4012 /* Check # of raid device arguments. */
4013 if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
4015 TI_ERR("Wrong number of raid device/offset arguments");
* Check that the table length is divisible
* w/o remainder by (raid_devs - parity_devs)
4021 if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4023 TI_ERR("Target length not divisible by number of data devices");
* Check that the device size is
* divisible w/o remainder by chunk size
4029 if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
4030 TI_ERR("Device length not divisible by chunk_size");
4032 /****************************************************************
4033 * Now that we checked the constructor arguments ->
4034 * let's allocate the RAID set
4035 ****************************************************************/
4036 rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
4037 ti, dl_parms, argv);
4042 rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
4043 rs->set.pi = rs->set.pi_parm = pi;
4045 /* Set RAID4 parity drive index. */
4046 if (raid_type->level == raid4)
4047 rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4049 recover_set_bandwidth(rs, parms.bandwidth);
4051 /* Use locking type to lock stripe access. */
4052 rs->locking = locking;
/* Get the device/offset tuples. */
4055 argv += dl_parms + 6 + parity_parm + parms.raid_parms;
4056 r = dev_parms(rs, argv, &i);
4060 /* Set backing device information (eg. read ahead). */
4061 rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */);
4062 rs_set_congested_fn(rs); /* Set congested function. */
4063 SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4064 speed = xor_optimize(rs); /* Select best xor algorithm. */
4066 /* Set for recovery of any nosync regions. */
4071 * Need to free recovery stripe(s) here in case
4072 * of nosync, because xor_optimize uses one.
4074 set_start_recovery(rs);
4075 set_end_recovery(rs);
4076 stripe_recover_free(rs);
4080 * Make sure that dm core only hands maximum io size
4081 * length down and pays attention to io boundaries.
4083 ti->split_io = rs->set.io_size;
4086 /* Initialize work queue to handle this RAID set's io. */
4087 r = rs_workqueue_init(rs);
4091 rs_log(rs, speed); /* Log information about RAID set. */
4095 context_free(rs, i);
4100 * Destruct a raid mapping
4102 static void raid_dtr(struct dm_target *ti)
4104 struct raid_set *rs = ti->private;
4106 destroy_workqueue(rs->io.wq);
4107 context_free(rs, rs->set.raid_devs);
4110 /* Raid mapping function. */
4111 static int raid_map(struct dm_target *ti, struct bio *bio,
4112 union map_info *map_context)
4114 /* I don't want to waste stripe cache capacity. */
4115 if (bio_rw(bio) == READA)
4118 struct raid_set *rs = ti->private;
* Get an io reference to be waited on to drop
* to zero on device suspension/destruction.
4125 bio->bi_sector -= ti->begin; /* Remap sector. */
4127 /* Queue io to RAID set. */
4128 mutex_lock(&rs->io.in_lock);
4129 bio_list_add(&rs->io.in, bio);
4130 mutex_unlock(&rs->io.in_lock);
4132 /* Wake daemon to process input list. */
4135 /* REMOVEME: statistics. */
4136 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
4137 S_BIOS_READ : S_BIOS_WRITE));
4138 return DM_MAPIO_SUBMITTED; /* Handle later. */
4142 /* Device suspend. */
4143 static void raid_presuspend(struct dm_target *ti)
4145 struct raid_set *rs = ti->private;
4146 struct dm_dirty_log *dl = rs->recover.dl;
4151 dm_rh_stop_recovery(rs->recover.rh);
4153 cancel_delayed_work(&rs->io.dws_do_raid);
4154 flush_workqueue(rs->io.wq);
4155 wait_ios(rs); /* Wait for completion of all ios being processed. */
4157 if (dl->type->presuspend && dl->type->presuspend(dl))
4158 /* FIXME: need better error handling. */
4159 DMWARN("log presuspend failed");
4162 static void raid_postsuspend(struct dm_target *ti)
4164 struct raid_set *rs = ti->private;
4165 struct dm_dirty_log *dl = rs->recover.dl;
4167 if (dl->type->postsuspend && dl->type->postsuspend(dl))
4168 /* FIXME: need better error handling. */
4169 DMWARN("log postsuspend failed");
4173 /* Device resume. */
4174 static void raid_resume(struct dm_target *ti)
4176 struct raid_set *rs = ti->private;
4177 struct recover *rec = &rs->recover;
4178 struct dm_dirty_log *dl = rec->dl;
4180 if (dl->type->resume && dl->type->resume(dl))
4181 /* Resume dirty log. */
4182 /* FIXME: need better error handling. */
4183 DMWARN("log resume failed");
4185 rec->nr_regions_to_recover =
4186 rec->nr_regions - dl->type->get_sync_count(dl);
4188 /* Restart any unfinished recovery. */
4189 if (RSRecover(rs)) {
4190 set_start_recovery(rs);
4191 dm_rh_start_recovery(rec->rh);
4198 /* Return stripe cache size. */
4199 static unsigned sc_size(struct raid_set *rs)
4201 return to_sector(atomic_read(&rs->sc.stripes) *
4202 (sizeof(struct stripe) +
4203 (sizeof(struct stripe_chunk) +
4204 (sizeof(struct page_list) +
4205 to_bytes(rs->set.io_size) *
4206 rs->set.raid_devs)) +
4207 (rs->recover.end_jiffies ?
4208 0 : rs->recover.recovery_stripes *
4209 to_bytes(rs->set.raid_devs * rs->recover.io_size))));
4212 /* REMOVEME: status output for development. */
4213 static void raid_devel_stats(struct dm_target *ti, char *result,
4214 unsigned *size, unsigned maxlen)
4216 unsigned sz = *size;
4218 char buf[BDEVNAME_SIZE], *p;
4219 struct stats_map *sm;
4220 struct raid_set *rs = ti->private;
4221 struct recover *rec = &rs->recover;
4224 DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
4225 DMEMIT("act_ios=%d ", io_ref(rs));
4226 DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
4227 DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
4228 DMEMIT("act_stripes_max=%d\n",
4229 atomic_read(&rs->sc.active_stripes_max));
4231 for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
4232 DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4234 DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
4235 DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
4236 atomic_read(&rs->sc.stripes), rs->set.io_size,
4237 rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
4240 j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4242 jiffies_to_timespec(j, &ts);
4243 sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4244 p = strchr(buf, '.');
4247 DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
4248 (unsigned long long) rec->nr_regions_recovered,
4249 (unsigned long long) rec->nr_regions_to_recover,
4250 (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4255 static int raid_status(struct dm_target *ti, status_type_t type,
4256 char *result, unsigned maxlen)
4259 char buf[BDEVNAME_SIZE];
4260 struct raid_set *rs = ti->private;
4261 int raid_parms[] = {
4262 rs->set.chunk_size_parm,
4263 rs->sc.stripes_parm,
4264 rs->set.io_size_parm,
4265 rs->recover.io_size_parm,
4266 rs->recover.bandwidth_parm,
4268 rs->recover.recovery_stripes,
4272 case STATUSTYPE_INFO:
4273 /* REMOVEME: statistics. */
4274 if (RSDevelStats(rs))
4275 raid_devel_stats(ti, result, &sz, maxlen);
4277 DMEMIT("%u ", rs->set.raid_devs);
4279 for (p = 0; p < rs->set.raid_devs; p++)
4281 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
4284 for (p = 0; p < rs->set.raid_devs; p++) {
4285 DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
4287 if (p == rs->set.pi)
4290 if (rs->set.dev_to_init == p)
4295 case STATUSTYPE_TABLE:
4296 sz = rs->recover.dl->type->status(rs->recover.dl, type,
4298 DMEMIT("%s %u ", rs->set.raid_type->name,
4299 rs->set.raid_parms);
4301 for (p = 0; p < rs->set.raid_parms; p++) {
4302 if (raid_parms[p] > -2)
4303 DMEMIT("%d ", raid_parms[p]);
4305 DMEMIT("%s ", rs->recover.recovery ?
4309 DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4311 for (p = 0; p < rs->set.raid_devs; p++)
4313 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
4314 (unsigned long long) rs->dev[p].start);
4323 enum raid_msg_actions {
4324 act_bw, /* Recovery bandwidth switch. */
4325 act_dev, /* Device failure switch. */
4326 act_overwrite, /* Stripe overwrite check. */
4327 act_stats, /* Development statistics switch. */
4328 act_sc, /* Stripe cache switch. */
4330 act_on, /* Set entity on. */
4331 act_off, /* Set entity off. */
4332 act_reset, /* Reset entity. */
4334 act_set = act_on, /* Set # absolute. */
4335 act_grow = act_off, /* Grow # by an amount. */
4336 act_shrink = act_reset, /* Shrink # by an amount. */
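/*
 * Example (values assumed): with a current value of 10, "set 50"
 * yields 50, "grow 5" yields 15 and "shrink 5" yields 5; _absolute()
 * below performs exactly this delta-to-absolute mapping.
 */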
4339 /* Turn a delta into an absolute value. */
4340 static int _absolute(unsigned long action, int act, int r)
4342 /* Make delta absolute. */
4343 if (test_bit(act_set, &action))
4345 else if (test_bit(act_grow, &action))
4347 else if (test_bit(act_shrink, &action))
4355 /* Change recovery io bandwidth. */
4356 static int bandwidth_change(struct dm_msg *msg, void *context)
4358 struct raid_set *rs = context;
4359 int act = rs->recover.bandwidth;
4360 int bandwidth = DM_MSG_INT_ARG(msg);
4362 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4363 /* Make delta bandwidth absolute. */
4364 bandwidth = _absolute(msg->action, act, bandwidth);
4367 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4368 recover_set_bandwidth(rs, bandwidth);
4373 set_bit(dm_msg_ret_arg, &msg->ret);
4374 set_bit(dm_msg_ret_inval, &msg->ret);
4378 /* Set/reset development feature flags. */
4379 static int devel_flags(struct dm_msg *msg, void *context)
4381 struct raid_set *rs = context;
4383 if (test_bit(act_on, &msg->action))
4384 return test_and_set_bit(msg->spec->parm,
4385 &rs->io.flags) ? -EPERM : 0;
4386 else if (test_bit(act_off, &msg->action))
4387 return test_and_clear_bit(msg->spec->parm,
4388 &rs->io.flags) ? 0 : -EPERM;
4389 else if (test_bit(act_reset, &msg->action)) {
4390 if (test_bit(act_stats, &msg->action)) {
4393 } else if (test_bit(act_overwrite, &msg->action)) {
4395 set_bit(msg->spec->parm, &rs->io.flags);
4403 /* Resize the stripe cache. */
4404 static int sc_resize(struct dm_msg *msg, void *context)
4407 struct raid_set *rs = context;
/* Deny permission in case the daemon is still resizing! */
4410 if (atomic_read(&rs->sc.stripes_to_set))
4413 stripes = DM_MSG_INT_ARG(msg);
4415 act = atomic_read(&rs->sc.stripes);
4417 /* Make delta stripes absolute. */
4418 stripes = _absolute(msg->action, act, stripes);
4421 * Check range and that the # of stripes changes.
* We leave the resizing to the worker.
4424 if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
4425 stripes != atomic_read(&rs->sc.stripes)) {
4426 atomic_set(&rs->sc.stripes_to_set, stripes);
4432 set_bit(dm_msg_ret_arg, &msg->ret);
4433 set_bit(dm_msg_ret_inval, &msg->ret);
4437 /* Parse the RAID message action. */
* 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
4440 * "o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
4441 * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
4442 * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
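*
* Example (illustrative): grow the stripe cache of mapped device "r5"
* by 512 stripes at run time:
*
*   dmsetup message r5 0 stripecache grow 512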
4445 static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
/* Variables to store the parsed parameters in. */
4449 static unsigned long *i_arg[] = {
4450 (unsigned long *) i + 0,
4451 (unsigned long *) i + 1,
4454 /* Declare all message option strings. */
4455 static char *str_sgs[] = { "set", "grow", "shrink" };
4456 static char *str_oor[] = { "on", "off", "reset" };
4458 /* Declare all actions. */
4459 static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
4460 static unsigned long act_oor[] = { act_on, act_off, act_reset };
4462 /* Bandwidth option. */
4463 static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
4464 static struct dm_message_argument bw_args = {
4465 1, i_arg, { dm_msg_int_t }
4468 static struct dm_message_argument null_args = {
4469 0, NULL, { dm_msg_int_t }
4472 /* Overwrite and statistics option. */
4473 static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
/* Stripecache option. */
4476 static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
4478 /* Declare messages. */
4479 static struct dm_msg_spec specs[] = {
4480 { "bandwidth", act_bw, &bw_opt, &bw_args,
4481 0, bandwidth_change },
4482 { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
4483 RS_CHECK_OVERWRITE, devel_flags },
4484 { "statistics", act_stats, &ovr_stats_opt, &null_args,
4485 RS_DEVEL_STATS, devel_flags },
4486 { "stripecache", act_sc, &stripe_opt, &bw_args,
4490 /* The message for the parser. */
4491 struct dm_msg msg = {
4492 .num_specs = ARRAY_SIZE(specs),
4496 return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
4499 * END message interface
4502 static struct target_type raid_target = {
4504 .version = {1, 0, 0},
4505 .module = THIS_MODULE,
4509 .presuspend = raid_presuspend,
4510 .postsuspend = raid_postsuspend,
4511 .resume = raid_resume,
4512 .status = raid_status,
4513 .message = raid_message,
4516 static void init_exit(const char *bad_msg, const char *good_msg, int r)
4519 DMERR("Failed to %sregister target [%d]", bad_msg, r);
4521 DMINFO("%s %s", good_msg, version);
4524 static int __init dm_raid_init(void)
4526 int r = dm_register_target(&raid_target);
4528 init_exit("", "initialized", r);
4532 static void __exit dm_raid_exit(void)
4534 dm_unregister_target(&raid_target);
4535 init_exit("un", "exit", 0);
4539 module_init(dm_raid_init);
4540 module_exit(dm_raid_exit);
4542 MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
4543 MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
4544 MODULE_LICENSE("GPL");
4545 MODULE_ALIAS("dm-raid4");
4546 MODULE_ALIAS("dm-raid5");