/*
 * Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved.
 *
 * Module Author: Heinz Mauelshagen <heinzm@redhat.com>
 *
 * This file is released under the GPL.
 *
 *
 * Linux 2.6 Device Mapper RAID4 and RAID5 target.
 *
 * Supports:
 *	o RAID4 with dedicated and selectable parity device
 *	o RAID5 with rotating parity (left+right, symmetric+asymmetric)
 *	o recovery of out of sync device for initial
 *	  RAID set creation or after dead drive replacement
 *	o run time optimization of xor algorithm used to calculate parity
 *
 *
 * Thanks to MD for:
 *	o the raid address calculation algorithm
 *	o the base of the biovec <-> page list copier.
 *
 *
 * Uses region hash to keep track of how many writes are in flight to
 * regions in order to use dirty log to keep state of regions to recover:
 *
 *	o clean regions (those which are synchronized
 *	  and don't have write io in flight)
 *	o dirty regions (those with write io in flight)
 *
 *
 * On startup, any dirty regions are migrated to the
 * 'nosync' state and are subject to recovery by the daemon.
 *
 * See raid_ctr() for table definition.
 *
 * FIXME: recovery bandwidth
 */

static const char *version = "v0.2594b";
#include "dm.h"
#include "dm-memcache.h"
#include "dm-message.h"
#include "dm-raid45.h"

#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/raid/xor.h>

#include <linux/bio.h>
#include <linux/dm-io.h>
#include <linux/dm-dirty-log.h>
#include "dm-region-hash.h"

#include <linux/slab.h>
/*
 * Configurable parameters
 */

/* Minimum/maximum and default # of selectable stripes. */
#define	STRIPES_MIN		8
#define	STRIPES_MAX		16384
#define	STRIPES_DEFAULT		80

/* Minimum/maximum and default chunk size in sectors if not set in constructor. */
#define	CHUNK_SIZE_MIN		8
#define	CHUNK_SIZE_MAX		16384
#define	CHUNK_SIZE_DEFAULT	64

/* Minimum/default io size in sectors if not set in constructor. */
#define	IO_SIZE_MIN		CHUNK_SIZE_MIN
#define	IO_SIZE_DEFAULT		IO_SIZE_MIN

/* Minimum/default recovery io size in sectors. */
#define	RECOVER_IO_SIZE_MIN	64
#define	RECOVER_IO_SIZE_DEFAULT	256

/* Default, minimum and maximum percentage of recovery io bandwidth. */
#define	BANDWIDTH_DEFAULT	10
#define	BANDWIDTH_MIN		1
#define	BANDWIDTH_MAX		100

/* # of parallel recovered regions */
#define	RECOVERY_STRIPES_MIN	1
#define	RECOVERY_STRIPES_MAX	64
#define	RECOVERY_STRIPES_DEFAULT	RECOVERY_STRIPES_MIN

/*
 * END Configurable parameters
 */
#define	TARGET		"dm-raid45"
#define	DAEMON		"kraid45d"
#define	DM_MSG_PREFIX	TARGET

#define	SECTORS_PER_PAGE	(PAGE_SIZE >> SECTOR_SHIFT)

/* Amount/size for __xor(). */
#define	XOR_SIZE	PAGE_SIZE

/* Check value in range. */
#define	range_ok(i, min, max)	((i) >= (min) && (i) <= (max))

/* Check argument is power of 2. */
#define	POWER_OF_2(a)	(!((a) & ((a) - 1)))
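/*
 * Example (values illustrative only):
 *
 *	range_ok(64, CHUNK_SIZE_MIN, CHUNK_SIZE_MAX)	-> 1
 *	POWER_OF_2(64)	-> 1	(64 == 2^6)
 *	POWER_OF_2(80)	-> 0	(80 & 79 == 64, i.e. != 0)
 *
 * Note that POWER_OF_2(0) also evaluates to 1, so a zero value has to
 * be rejected separately (e.g. via range_ok()).
 */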
/* Structure access macros. */
/* Derive raid_set from stripe_cache pointer. */
#define	RS(x)	container_of(x, struct raid_set, sc)

/* Page reference. */
#define	PAGE(stripe, p)	((stripe)->obj[p].pl->page)

/* Stripe chunk reference. */
#define	CHUNK(stripe, p) ((stripe)->chunk + (p))

/* Bio list reference. */
#define	BL(stripe, p, rw)	((stripe)->chunk[p].bl + (rw))
#define	BL_CHUNK(chunk, rw)	((chunk)->bl + (rw))

/* Page list reference. */
#define	PL(stripe, p)	((stripe)->obj[p].pl)
/* END: structure access macros. */
/* Factor out to dm-bio-list.h */
static inline void bio_list_push(struct bio_list *bl, struct bio *bio)
{
	bio->bi_next = bl->head;
	bl->head = bio;

	if (!bl->tail)
		bl->tail = bio;
}

/* Factor out to dm.h */
#define TI_ERR_RET(str, ret) \
	do { ti->error = str; return ret; } while (0)
#define TI_ERR(str) TI_ERR_RET(str, -EINVAL)
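/*
 * Usage sketch for the TI_ERR* macros (hypothetical call site; needs a
 * "struct dm_target *ti" in scope, error strings are made up):
 *
 *	if (argc < 2)
 *		TI_ERR("Too few arguments");	(returns -EINVAL)
 *	if (chunk_size > CHUNK_SIZE_MAX)
 *		TI_ERR_RET("Chunk size too large", -ERANGE);
 */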
/* Macro to define inline functions to access and modify IO flags. */
#define BITOPS(name, what, var, flag) \
static inline int TestClear ## name ## what(struct var *v) \
{ return test_and_clear_bit(flag, &v->io.flags); } \
static inline int TestSet ## name ## what(struct var *v) \
{ return test_and_set_bit(flag, &v->io.flags); } \
static inline void Clear ## name ## what(struct var *v) \
{ clear_bit(flag, &v->io.flags); } \
static inline void Set ## name ## what(struct var *v) \
{ set_bit(flag, &v->io.flags); } \
static inline int name ## what(struct var *v) \
{ return test_bit(flag, &v->io.flags); }
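/*
 * For instance, BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY) below
 * generates the following accessors (sketch of the expansion), all
 * operating atomically on bit CHUNK_DIRTY of v->io.flags:
 *
 *	int TestClearChunkDirty(struct stripe_chunk *v);
 *	int TestSetChunkDirty(struct stripe_chunk *v);
 *	void ClearChunkDirty(struct stripe_chunk *v);
 *	void SetChunkDirty(struct stripe_chunk *v);
 *	int ChunkDirty(struct stripe_chunk *v);
 */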
/*-----------------------------------------------------------------
 * Stripe cache
 *
 * Cache for all reads and writes to raid sets (operational or degraded)
 *
 * We need to run all data to and from a RAID set through this cache,
 * because parity chunks need to get calculated from data chunks
 * or, in the degraded/resynchronization case, missing chunks need
 * to be reconstructed using the other chunks of the stripe.
 *---------------------------------------------------------------*/

/* A chunk within a stripe (holds bios hanging off). */
/* IO status flags for chunks of a stripe. */
enum chunk_flags {
	CHUNK_DIRTY,	/* Pages of chunk dirty; need writing. */
	CHUNK_ERROR,	/* IO error on any chunk page. */
	CHUNK_IO,	/* Allow/prohibit IO on chunk pages. */
	CHUNK_LOCKED,	/* Chunk pages locked during IO. */
	CHUNK_MUST_IO,	/* Chunk must io. */
	CHUNK_UNLOCK,	/* Enforce chunk unlock. */
	CHUNK_UPTODATE,	/* Chunk pages are uptodate. */
};

#if READ != 0 || WRITE != 1
#error dm-raid45: READ/WRITE != 0/1 used as index!!!
#endif

enum bl_type {
	WRITE_QUEUED = WRITE + 1,
	WRITE_MERGED,
	NR_BL_TYPES,	/* Must be last one! */
};
struct stripe_chunk {
	atomic_t cnt;		/* Reference count. */
	struct stripe *stripe;	/* Backpointer to stripe for endio(). */
	/* Bio lists for reads, writes, and writes merged. */
	struct bio_list bl[NR_BL_TYPES];
	struct {
		unsigned long flags;	/* IO status flags. */
	} io;
};

/* Define chunk bit operations. */
BITOPS(Chunk, Dirty, stripe_chunk, CHUNK_DIRTY)
BITOPS(Chunk, Error, stripe_chunk, CHUNK_ERROR)
BITOPS(Chunk, Io, stripe_chunk, CHUNK_IO)
BITOPS(Chunk, Locked, stripe_chunk, CHUNK_LOCKED)
BITOPS(Chunk, MustIo, stripe_chunk, CHUNK_MUST_IO)
BITOPS(Chunk, Unlock, stripe_chunk, CHUNK_UNLOCK)
BITOPS(Chunk, Uptodate, stripe_chunk, CHUNK_UPTODATE)
/*
 * Stripe linked list indexes. Keep order, because the stripe
 * and the stripe cache rely on the first 3!
 */
enum list_types {
	LIST_FLUSH,	/* Stripes to flush for io. */
	LIST_ENDIO,	/* Stripes to endio. */
	LIST_LRU,	/* Least recently used stripes. */
	SC_NR_LISTS,	/* # of lists in stripe cache. */
	LIST_HASH = SC_NR_LISTS,	/* Hashed stripes. */
	LIST_RECOVER = LIST_HASH,	/* For recovery type stripes only. */
	STRIPE_NR_LISTS,		/* To size array in struct stripe. */
};

/* Addressing region recovery. */
struct recover_addr {
	struct dm_region *reg;	/* Actual region to recover. */
	sector_t pos;		/* Position within region to recover. */
	sector_t end;		/* End of region to recover. */
};
/* A stripe: the io object to handle all reads and writes to a RAID set. */
struct stripe {
	atomic_t cnt;			/* Reference count. */
	struct stripe_cache *sc;	/* Backpointer to stripe cache. */

	/*
	 * 4 linked lists:
	 *	o io list to flush io
	 *	o endio list
	 *	o LRU list to put stripes w/o reference count on
	 *	o stripe cache hash
	 */
	struct list_head lists[STRIPE_NR_LISTS];

	sector_t key;		/* Hash key. */
	region_t region;	/* Region stripe is mapped to. */

	struct {
		unsigned long flags;	/* Stripe state flags (see below). */

		/*
		 * Pending ios in flight:
		 *
		 * used to control move of stripe to endio list
		 */
		atomic_t pending;

		/* Sectors to read and write for multi page stripe sets. */
		unsigned size;
	} io;

	/* Address region recovery. */
	struct recover_addr *recover;

	/* Lock on stripe (Future: for clustering). */
	void *lock;

	struct {
		unsigned short parity;	/* Parity chunk index. */
		short recover;		/* Recovery chunk index. */
	} idx;

	/*
	 * This stripe's memory cache object (dm-mem-cache);
	 * i.e. the io chunk pages.
	 */
	struct dm_mem_cache_object *obj;

	/* Array of stripe sets (dynamically allocated). */
	struct stripe_chunk chunk[0];
};
/* States stripes can be in (flags field). */
enum stripe_states {
	STRIPE_ERROR,		/* io error on stripe. */
	STRIPE_MERGED,		/* Writes got merged to be written. */
	STRIPE_RBW,		/* Read-before-write stripe. */
	STRIPE_RECONSTRUCT,	/* Reconstruct of a missing chunk required. */
	STRIPE_RECONSTRUCTED,	/* Missing chunk got reconstructed. */
	STRIPE_RECOVER,		/* Stripe used for RAID set recovery. */
};

/* Define stripe bit operations. */
BITOPS(Stripe, Error, stripe, STRIPE_ERROR)
BITOPS(Stripe, Merged, stripe, STRIPE_MERGED)
BITOPS(Stripe, RBW, stripe, STRIPE_RBW)
BITOPS(Stripe, Reconstruct, stripe, STRIPE_RECONSTRUCT)
BITOPS(Stripe, Reconstructed, stripe, STRIPE_RECONSTRUCTED)
BITOPS(Stripe, Recover, stripe, STRIPE_RECOVER)

/* A stripe hash. */
struct stripe_hash {
	struct list_head *hash;
	unsigned buckets;
	unsigned mask;
	unsigned prime;
	unsigned shift;
};

/* Locks protecting lists of the stripe cache. */
enum sc_lock_types {
	LOCK_ENDIO,	/* Protect endio list. */
	LOCK_LRU,	/* Protect LRU list. */
	NR_LOCKS,	/* To size array in struct stripe_cache. */
};
/* A stripe cache. */
struct stripe_cache {
	/* Stripe hash. */
	struct stripe_hash hash;

	spinlock_t locks[NR_LOCKS];	/* Locks to protect lists. */

	/* Stripes with io to flush, stripes to endio and LRU lists. */
	struct list_head lists[SC_NR_LISTS];

	/* Slab cache to allocate stripes from. */
	struct {
		struct kmem_cache *cache;	/* Cache itself. */
		char name[32];			/* Unique name. */
	} kc;

	struct dm_io_client *dm_io_client; /* dm-io client resource context. */

	/* dm-mem-cache client resource context. */
	struct dm_mem_cache_client *mem_cache_client;

	int stripes_parm;	    /* # stripes parameter from constructor. */
	atomic_t stripes;	    /* actual # of stripes in cache. */
	atomic_t stripes_to_set;    /* # of stripes to resize cache to. */
	atomic_t stripes_last;	    /* last # of stripes in cache. */
	atomic_t active_stripes;    /* actual # of active stripes in cache. */

	/* REMOVEME: */
	atomic_t active_stripes_max; /* max # of active stripes in cache. */
};
/* Flag specs for raid_dev */
enum raid_dev_flags {
	DEV_FAILED,	/* Device failed. */
	DEV_IO_QUEUED,	/* Io got queued to device. */
};

/* The raid device in a set. */
struct raid_dev {
	struct dm_dev *dev;
	sector_t start;		/* Offset to map to. */
	struct {	/* Using struct to be able to BITOPS(). */
		unsigned long flags;	/* raid_dev_flags. */
	} io;
};

BITOPS(Dev, Failed, raid_dev, DEV_FAILED)
BITOPS(Dev, IoQueued, raid_dev, DEV_IO_QUEUED)

/* Flags spec for raid_set. */
enum raid_set_flags {
	RS_CHECK_OVERWRITE,	/* Check for chunk overwrites. */
	RS_DEAD,		/* RAID set inoperative. */
	RS_DEGRADED,		/* Io errors on RAID device. */
	RS_DEVEL_STATS,		/* REMOVEME: display status information. */
	RS_RECOVER,		/* Do recovery. */
	RS_RECOVERY_BANDWIDTH,	/* Allow recovery bandwidth (delayed bios). */
	RS_SC_BUSY,		/* Stripe cache busy -> send an event. */
	RS_SUSPEND,		/* Suspend RAID set. */
};
/* REMOVEME: devel stats counters. */
enum stats_types {
	S_BIOS_READ,
	S_BIOS_ADDED_READ,
	S_BIOS_ENDIO_READ,
	S_BIOS_WRITE,
	S_BIOS_ADDED_WRITE,
	S_BIOS_ENDIO_WRITE,
	S_DM_IO_READ,
	S_DM_IO_WRITE,
	S_BANDWIDTH,
	S_NO_BANDWIDTH,
	S_BARRIER,
	S_BIO_COPY_PL_NEXT,
	S_CAN_MERGE,
	S_CANT_MERGE,
	S_CHUNK_LOCKED,
	S_CONGESTED,
	S_NOT_CONGESTED,
	S_DEGRADED,
	S_DELAYED_BIOS,
	S_SUM_DELAYED_BIOS,
	S_FLUSHS,
	S_HITS_1ST,
	S_IOS_POST,
	S_INSCACHE,
	S_MAX_LOOKUP,
	S_NO_RW,
	S_NOSYNC,
	S_OVERWRITE,
	S_PROHIBITCHUNKIO,
	S_RECONSTRUCT_EI,
	S_RECONSTRUCT_DEV,
	S_RECONSTRUCT_SET,
	S_RECONSTRUCTED,
	S_REQUEUE,
	S_STRIPE_ERROR,
	S_XORS,
	S_NR_STATS,	/* # of stats counters. Must be last! */
};

/* Status type -> string mappings. */
struct stats_map {
	const enum stats_types type;
	const char *str;
};

static struct stats_map stats_map[] = {
	{ S_BIOS_READ, "r=" },
	{ S_BIOS_ADDED_READ, "/" },
	{ S_BIOS_ENDIO_READ, "/" },
	{ S_BIOS_WRITE, " w=" },
	{ S_BIOS_ADDED_WRITE, "/" },
	{ S_BIOS_ENDIO_WRITE, "/" },
	{ S_DM_IO_READ, " rc=" },
	{ S_DM_IO_WRITE, " wc=" },
	{ S_BANDWIDTH, "\nbw=" },
	{ S_NO_BANDWIDTH, " no_bw=" },
	{ S_BARRIER, "\nbarrier=" },
	{ S_BIO_COPY_PL_NEXT, "\nbio_cp_next=" },
	{ S_CAN_MERGE, "\nmerge=" },
	{ S_CANT_MERGE, "/no_merge=" },
	{ S_CHUNK_LOCKED, "\nchunk_locked=" },
	{ S_CONGESTED, "\ncgst=" },
	{ S_NOT_CONGESTED, "/not_cgst=" },
	{ S_DEGRADED, "\ndegraded=" },
	{ S_DELAYED_BIOS, "\ndel_bios=" },
	{ S_SUM_DELAYED_BIOS, "/sum_del_bios=" },
	{ S_FLUSHS, "\nflushs=" },
	{ S_HITS_1ST, "\nhits_1st=" },
	{ S_IOS_POST, " ios_post=" },
	{ S_INSCACHE, " inscache=" },
	{ S_MAX_LOOKUP, " maxlookup=" },
	{ S_NO_RW, "\nno_rw=" },
	{ S_NOSYNC, " nosync=" },
	{ S_OVERWRITE, " ovr=" },
	{ S_PROHIBITCHUNKIO, " prhbt_io=" },
	{ S_RECONSTRUCT_EI, "\nrec_ei=" },
	{ S_RECONSTRUCT_DEV, " rec_dev=" },
	{ S_RECONSTRUCT_SET, " rec_set=" },
	{ S_RECONSTRUCTED, " rec=" },
	{ S_REQUEUE, " requeue=" },
	{ S_STRIPE_ERROR, " stripe_err=" },
	{ S_XORS, " xors=" },
};

/*
 * A RAID set.
 */
#define	dm_rh_client	dm_region_hash
enum count_type { IO_WORK = 0, IO_RECOVER, IO_NR_COUNT };
typedef void (*xor_function_t)(unsigned count, unsigned long **data);
struct raid_set {
	struct dm_target *ti;	/* Target pointer. */

	struct {
		unsigned long flags;	/* State flags. */
		struct mutex in_lock;	/* Protects central input list below. */
		struct bio_list in;	/* Pending ios (central input list). */
		struct bio_list work;	/* ios work set. */
		wait_queue_head_t suspendq;	/* suspend synchronization. */
		atomic_t in_process;	/* counter of queued bios (suspendq). */
		atomic_t in_process_max;/* counter of queued bios max. */

		/* io work. */
		struct workqueue_struct *wq;
		struct delayed_work dws_do_raid;	/* For main worker. */
		struct work_struct ws_do_table_event;	/* For event worker. */
	} io;

	/* Stripe locking abstraction. */
	struct dm_raid45_locking_type *locking;

	struct stripe_cache sc;	/* Stripe cache for this set. */

	/* Xor optimization. */
	struct {
		struct xor_func *f;
		unsigned chunks;
		unsigned speed;
	} xor;

	/* Recovery parameters. */
	struct recover {
		struct dm_dirty_log *dl;	/* Dirty log. */
		struct dm_rh_client *rh;	/* Region hash. */

		struct dm_io_client *dm_io_client; /* recovery dm-io client. */
		/* dm-mem-cache client resource context for recovery stripes. */
		struct dm_mem_cache_client *mem_cache_client;

		struct list_head stripes;	/* List of recovery stripes. */

		region_t nr_regions;
		region_t nr_regions_to_recover;
		region_t nr_regions_recovered;
		unsigned long start_jiffies;
		unsigned long end_jiffies;

		unsigned bandwidth;	 /* Recovery bandwidth [%]. */
		unsigned bandwidth_work; /* Recovery bandwidth [factor]. */
		unsigned bandwidth_parm; /*  " constructor parm. */
		unsigned io_size;	 /* recovery io size <= region size. */
		unsigned io_size_parm;	 /* recovery io size ctr parameter. */
		unsigned recovery;	 /* Recovery allowed/prohibited. */
		unsigned recovery_stripes; /* # of parallel recovery stripes. */

		/* recovery io throttling. */
		atomic_t io_count[IO_NR_COUNT];	/* counter recover/regular io.*/
		unsigned long last_jiffies;
	} recover;

	/* RAID set parameters. */
	struct {
		struct raid_type *raid_type;	/* RAID type (eg, RAID4). */
		unsigned raid_parms;	/* # variable raid parameters. */

		unsigned chunk_size;	/* Sectors per chunk. */
		unsigned chunk_size_parm;
		unsigned chunk_shift;	/* rsector chunk size shift. */

		unsigned io_size;	/* Sectors per io. */
		unsigned io_size_parm;
		unsigned io_mask;	/* Mask for bio_copy_page_list(). */
		unsigned io_inv_mask;	/* Mask for raid_address(). */

		sector_t sectors_per_dev;	/* Sectors per device. */

		atomic_t failed_devs;	/* Amount of devices failed. */

		/* Index of device to initialize. */
		int dev_to_init;
		int dev_to_init_parm;

		/* Raid devices dynamically allocated. */
		unsigned raid_devs;	/* # of RAID devices below. */
		unsigned data_devs;	/* # of RAID data devices. */

		int ei;		/* index of failed RAID device. */

		/* Index of dedicated parity device (i.e. RAID4). */
		int pi;
		int pi_parm;	/* constructor parm for status output. */
	} set;

	/* REMOVEME: devel stats counters. */
	atomic_t stats[S_NR_STATS];

	/* Dynamically allocated temporary pointers for xor(). */
	unsigned long **data;

	/* Dynamically allocated RAID devices. Alignment? */
	struct raid_dev dev[0];
};
/* Define RAID set bit operations. */
BITOPS(RS, Bandwidth, raid_set, RS_RECOVERY_BANDWIDTH)
BITOPS(RS, CheckOverwrite, raid_set, RS_CHECK_OVERWRITE)
BITOPS(RS, Dead, raid_set, RS_DEAD)
BITOPS(RS, Degraded, raid_set, RS_DEGRADED)
BITOPS(RS, DevelStats, raid_set, RS_DEVEL_STATS)
BITOPS(RS, Recover, raid_set, RS_RECOVER)
BITOPS(RS, ScBusy, raid_set, RS_SC_BUSY)
BITOPS(RS, Suspend, raid_set, RS_SUSPEND)
#undef BITOPS

/*-----------------------------------------------------------------
 * Raid-4/5 set structures.
 *---------------------------------------------------------------*/
/* RAID level definitions. */
enum raid_level {
	raid4,
	raid5,
};

/* Symmetric/Asymmetric, Left/Right parity rotating algorithms. */
enum raid_algorithm {
	none,
	left_asym,
	right_asym,
	left_sym,
	right_sym,
};

struct raid_type {
	const char *name;		/* RAID algorithm. */
	const char *descr;		/* Descriptor text for logging. */
	const unsigned parity_devs;	/* # of parity devices. */
	const unsigned minimal_devs;	/* minimal # of devices in set. */
	const enum raid_level level;	/* RAID level. */
	const enum raid_algorithm algorithm;	/* RAID algorithm. */
};
/* Supported raid types and properties. */
static struct raid_type raid_types[] = {
	{"raid4",    "RAID4 (dedicated parity disk)", 1, 3, raid4, none},
	{"raid5_la", "RAID5 (left asymmetric)",       1, 3, raid5, left_asym},
	{"raid5_ra", "RAID5 (right asymmetric)",      1, 3, raid5, right_asym},
	{"raid5_ls", "RAID5 (left symmetric)",        1, 3, raid5, left_sym},
	{"raid5_rs", "RAID5 (right symmetric)",       1, 3, raid5, right_sym},
};

/* Address as calculated by raid_address(). */
struct raid_address {
	sector_t key;		/* Hash key (address of stripe % chunk_size). */
	unsigned di, pi;	/* Data and parity disks index. */
};
/* REMOVEME: reset statistics counters. */
static void stats_reset(struct raid_set *rs)
{
	unsigned s = S_NR_STATS;

	while (s--)
		atomic_set(rs->stats + s, 0);
}

/*----------------------------------------------------------------
 * RAID set management routines.
 *--------------------------------------------------------------*/

/*
 * Begin small helper functions.
 */

/* No need to be called from region hash indirectly at dm_rh_dec(). */
static void wake_dummy(void *context) {}
/* Return # of io references. */
static int io_ref(struct raid_set *rs)
{
	return atomic_read(&rs->io.in_process);
}

/* Get an io reference. */
static void io_get(struct raid_set *rs)
{
	int p = atomic_inc_return(&rs->io.in_process);

	if (p > atomic_read(&rs->io.in_process_max))
		atomic_set(&rs->io.in_process_max, p); /* REMOVEME: max. */
}

/* Put the io reference and conditionally wake io waiters. */
static void io_put(struct raid_set *rs)
{
	/* Intel: rebuild data corrupter? */
	if (atomic_dec_and_test(&rs->io.in_process))
		wake_up(&rs->io.suspendq);

	BUG_ON(io_ref(rs) < 0);
}

/* Wait until all io has been processed. */
static void wait_ios(struct raid_set *rs)
{
	wait_event(rs->io.suspendq, !io_ref(rs));
}

/* Queue (optionally delayed) io work. */
static void wake_do_raid_delayed(struct raid_set *rs, unsigned long delay)
{
	queue_delayed_work(rs->io.wq, &rs->io.dws_do_raid, delay);
}

/* Queue io work immediately (called from region hash too). */
static void wake_do_raid(void *context)
{
	struct raid_set *rs = context;

	queue_work(rs->io.wq, &rs->io.dws_do_raid.work);
}
/* Calculate device sector offset. */
static sector_t _sector(struct raid_set *rs, struct bio *bio)
{
	sector_t sector = bio->bi_sector;

	sector_div(sector, rs->set.data_devs);
	return sector;
}
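/*
 * Illustrative example: with rs->set.data_devs = 4 and a bio starting
 * at set sector 1000, sector_div() leaves sector = 1000 / 4 = 250,
 * i.e. the approximate per-device offset of that set address.
 */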
/* Return # of active stripes in stripe cache. */
static int sc_active(struct stripe_cache *sc)
{
	return atomic_read(&sc->active_stripes);
}

/* Stripe cache busy indicator. */
static int sc_busy(struct raid_set *rs)
{
	return sc_active(&rs->sc) >
	       atomic_read(&rs->sc.stripes) - (STRIPES_MIN / 2);
}
/* Set chunks states. */
enum chunk_dirty_type { CLEAN, DIRTY, ERROR };
static void chunk_set(struct stripe_chunk *chunk, enum chunk_dirty_type type)
{
	switch (type) {
	case CLEAN:
		ClearChunkDirty(chunk);
		break;
	case DIRTY:
		SetChunkDirty(chunk);
		break;
	case ERROR:
		SetChunkError(chunk);
		SetStripeError(chunk->stripe);
		return;
	default:
		BUG();
	}

	SetChunkUptodate(chunk);
	SetChunkIo(chunk);
	ClearChunkError(chunk);
}

/* Return region state for a sector. */
static int region_state(struct raid_set *rs, sector_t sector,
			enum dm_rh_region_states state)
{
	struct dm_rh_client *rh = rs->recover.rh;
	region_t region = dm_rh_sector_to_region(rh, sector);

	return !!(dm_rh_get_state(rh, region, 1) & state);
}
/*
 * Return true in case a chunk should be read/written.
 *
 * Conditions to read/write:
 *	o chunk not uptodate
 *	o chunk dirty
 *
 * Conditions to avoid io:
 *	o io already ongoing on chunk
 *	o io explicitly prohibited
 */
static int chunk_io(struct stripe_chunk *chunk)
{
	/* 2nd run optimization (flag set below on first run). */
	if (TestClearChunkMustIo(chunk))
		return 1;

	/* Avoid io if prohibited or a locked chunk. */
	if (!ChunkIo(chunk) || ChunkLocked(chunk))
		return 0;

	if (!ChunkUptodate(chunk) || ChunkDirty(chunk)) {
		SetChunkMustIo(chunk); /* 2nd run optimization. */
		return 1;
	}

	return 0;
}
/* Call a function on each chunk needing io unless device failed. */
static unsigned for_each_io_dev(struct stripe *stripe,
				void (*f_io)(struct stripe *stripe, unsigned p))
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p, r = 0;

	for (p = 0; p < rs->set.raid_devs; p++) {
		if (chunk_io(CHUNK(stripe, p)) && !DevFailed(rs->dev + p)) {
			f_io(stripe, p);
			r++;
		}
	}

	return r;
}

/*
 * Index of device to calculate parity on.
 *
 * Either the parity device index *or* the selected
 * device to init after a spare replacement.
 */
static int dev_for_parity(struct stripe *stripe, int *sync)
{
	struct raid_set *rs = RS(stripe->sc);
	int r = region_state(rs, stripe->key, DM_RH_NOSYNC | DM_RH_RECOVERING);

	*sync = !r;

	/* Reconstruct a particular device ?. */
	if (r && rs->set.dev_to_init > -1)
		return rs->set.dev_to_init;
	else if (rs->set.raid_type->level == raid4)
		return rs->set.pi;
	else if (!StripeRecover(stripe))
		return stripe->idx.parity;
	else
		return -1;
}
/* RAID set congested function. */
static int rs_congested(void *congested_data, int bdi_bits)
{
	int r;
	unsigned p;
	struct raid_set *rs = congested_data;

	if (sc_busy(rs) || RSSuspend(rs))
		r = 1;
	else for (r = 0, p = rs->set.raid_devs; !r && p--; ) {
		/* If any of our component devices are overloaded. */
		struct request_queue *q = bdev_get_queue(rs->dev[p].dev->bdev);

		r |= bdi_congested(&q->backing_dev_info, bdi_bits);
	}

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (r ? S_CONGESTED : S_NOT_CONGESTED));
	return r;
}
/* RAID device degrade check. */
static void rs_check_degrade_dev(struct raid_set *rs,
				 struct stripe *stripe, unsigned p)
{
	if (TestSetDevFailed(rs->dev + p))
		return;

	/* Throw an event in case of member device errors. */
	if (atomic_inc_return(&rs->set.failed_devs) >
	    rs->set.raid_type->parity_devs &&
	    !TestSetRSDead(rs)) {
		/* Display RAID set dead message once. */
		char buf[BDEVNAME_SIZE];

		DMERR("FATAL: too many devices failed -> RAID set broken");
		for (p = 0; p < rs->set.raid_devs; p++) {
			if (DevFailed(rs->dev + p))
				DMERR("device /dev/%s failed",
				      bdevname(rs->dev[p].dev->bdev, buf));
		}
	}

	/* Only log the first member error. */
	if (!TestSetRSDegraded(rs)) {
		char buf[BDEVNAME_SIZE];

		/* Store index for recovery. */
		rs->set.ei = p;
		DMERR("CRITICAL: %sio error on device /dev/%s "
		      "in region=%llu; DEGRADING RAID set\n",
		      stripe ? "" : "FAKED ",
		      bdevname(rs->dev[p].dev->bdev, buf),
		      (unsigned long long) (stripe ? stripe->key : 0));
		DMERR("further device error messages suppressed");
	}

	schedule_work(&rs->io.ws_do_table_event);
}
/* RAID set degrade check. */
static void rs_check_degrade(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;

	while (p--) {
		if (ChunkError(CHUNK(stripe, p)))
			rs_check_degrade_dev(rs, stripe, p);
	}
}

/* Lookup a RAID device by name or by major:minor number. */
static int raid_dev_lookup(struct raid_set *rs, struct raid_dev *dev_lookup)
{
	unsigned p;
	struct raid_dev *dev;

	/*
	 * Must be an incremental loop, because the device array
	 * can have empty slots still on calls from raid_ctr()
	 */
	for (dev = rs->dev, p = 0;
	     dev->dev && p < rs->set.raid_devs;
	     dev++, p++) {
		if (dev_lookup->dev->bdev->bd_dev == dev->dev->bdev->bd_dev)
			return p;
	}

	return -ENODEV;
}

/*
 * End small helper functions.
 */

/*
 * Stripe hash functions
 */
/* Initialize/destroy stripe hash. */
static int hash_init(struct stripe_hash *hash, unsigned stripes)
{
	unsigned buckets = 2, max_buckets = stripes >> 1;
	static unsigned hash_primes[] = {
		/* Table of primes for hash_fn/table size optimization. */
		1, 2, 3, 7, 13, 27, 53, 97, 193, 389, 769,
		1543, 3079, 6151, 12289, 24593, 49157, 98317,
	};

	/* Calculate number of buckets (2^^n <= stripes / 2). */
	while (buckets < max_buckets)
		buckets <<= 1;

	/* Allocate stripe hash buckets. */
	hash->hash = vmalloc(buckets * sizeof(*hash->hash));
	if (!hash->hash)
		return -ENOMEM;

	hash->buckets = buckets;
	hash->mask = buckets - 1;
	hash->shift = ffs(buckets);
	if (hash->shift > ARRAY_SIZE(hash_primes))
		hash->shift = ARRAY_SIZE(hash_primes) - 1;

	BUG_ON(hash->shift < 2);
	hash->prime = hash_primes[hash->shift];

	/* Initialize buckets. */
	while (buckets--)
		INIT_LIST_HEAD(hash->hash + buckets);

	return 0;
}

static void hash_exit(struct stripe_hash *hash)
{
	if (hash->hash) {
		vfree(hash->hash);
		hash->hash = NULL;
	}
}
static unsigned hash_fn(struct stripe_hash *hash, sector_t key)
{
	return (unsigned) (((key * hash->prime) >> hash->shift) & hash->mask);
}

static struct list_head *hash_bucket(struct stripe_hash *hash, sector_t key)
{
	return hash->hash + hash_fn(hash, key);
}
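/*
 * Worked example (numbers illustrative): with 64 buckets, hash_init()
 * sets mask = 63, shift = ffs(64) = 7 and prime = hash_primes[7] = 97;
 * a key of 1024 then maps to bucket ((1024 * 97) >> 7) & 63 = 8.
 */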
/* Insert an entry into a hash. */
static void stripe_insert(struct stripe_hash *hash, struct stripe *stripe)
{
	list_add(stripe->lists + LIST_HASH, hash_bucket(hash, stripe->key));
}

/* Lookup an entry in the stripe hash. */
static struct stripe *stripe_lookup(struct stripe_cache *sc, sector_t key)
{
	unsigned look = 0;
	struct stripe *stripe;
	struct list_head *bucket = hash_bucket(&sc->hash, key);

	list_for_each_entry(stripe, bucket, lists[LIST_HASH]) {
		look++;

		if (stripe->key == key) {
			/* REMOVEME: statistics. */
			if (look > atomic_read(RS(sc)->stats + S_MAX_LOOKUP))
				atomic_set(RS(sc)->stats + S_MAX_LOOKUP, look);

			return stripe;
		}
	}

	return NULL;
}
/* Resize the stripe cache hash on size changes. */
static int sc_hash_resize(struct stripe_cache *sc)
{
	/* Resize indicated ? */
	if (atomic_read(&sc->stripes) != atomic_read(&sc->stripes_last)) {
		int r;
		struct stripe_hash hash;

		r = hash_init(&hash, atomic_read(&sc->stripes));
		if (r)
			return r;

		if (sc->hash.hash) {
			unsigned b = sc->hash.buckets;
			struct list_head *pos, *tmp;

			/* Walk old buckets and insert into new. */
			while (b--)
				list_for_each_safe(pos, tmp, sc->hash.hash + b)
					stripe_insert(&hash,
						      list_entry(pos,
								 struct stripe,
								 lists[LIST_HASH]));
		}

		hash_exit(&sc->hash);
		memcpy(&sc->hash, &hash, sizeof(sc->hash));
		atomic_set(&sc->stripes_last, atomic_read(&sc->stripes));
	}

	return 0;
}
/* End stripe hash functions. */
/* List add, delete, push and pop functions. */
/* Delete a list entry if it is on a list. */
#define DEL_LIST(lh) \
	if (!list_empty(lh)) \
		list_del_init(lh);

/* Delete stripe from hash. */
static void stripe_hash_del(struct stripe *stripe)
{
	DEL_LIST(stripe->lists + LIST_HASH);
}

/* Return stripe reference count. */
static inline int stripe_ref(struct stripe *stripe)
{
	return atomic_read(&stripe->cnt);
}

/* Add stripe to flush list. */
static void stripe_flush_add(struct stripe *stripe)
{
	struct stripe_cache *sc = stripe->sc;
	struct list_head *lh = stripe->lists + LIST_FLUSH;

	if (!StripeReconstruct(stripe) && list_empty(lh))
		list_add_tail(lh, sc->lists + LIST_FLUSH);
}
/*
 * Add stripe to LRU (inactive) list.
 *
 * Need lock, because of concurrent access from message interface.
 */
static void stripe_lru_add(struct stripe *stripe)
{
	if (!StripeRecover(stripe)) {
		unsigned long flags;
		struct list_head *lh = stripe->lists + LIST_LRU;
		spinlock_t *lock = stripe->sc->locks + LOCK_LRU;

		spin_lock_irqsave(lock, flags);
		if (list_empty(lh))
			list_add_tail(lh, stripe->sc->lists + LIST_LRU);
		spin_unlock_irqrestore(lock, flags);
	}
}

#define POP_LIST(list) \
	do { \
		if (list_empty(sc->lists + (list))) \
			stripe = NULL; \
		else { \
			stripe = list_first_entry(sc->lists + (list), \
						  struct stripe, \
						  lists[(list)]); \
			list_del_init(stripe->lists + (list)); \
		} \
	} while (0)

/* Pop an available stripe off the LRU list. */
static struct stripe *stripe_lru_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;
	spinlock_t *lock = sc->locks + LOCK_LRU;

	spin_lock_irq(lock);
	POP_LIST(LIST_LRU);
	spin_unlock_irq(lock);

	return stripe;
}

/* Pop an available stripe off the io list. */
static struct stripe *stripe_io_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;

	POP_LIST(LIST_FLUSH);
	return stripe;
}
/* Push a stripe safely onto the endio list to be handled by do_endios(). */
static void stripe_endio_push(struct stripe *stripe)
{
	unsigned long flags;
	struct stripe_cache *sc = stripe->sc;
	struct list_head *stripe_list = stripe->lists + LIST_ENDIO,
			 *sc_list = sc->lists + LIST_ENDIO;
	spinlock_t *lock = sc->locks + LOCK_ENDIO;

	/* This runs in parallel with do_endios(). */
	spin_lock_irqsave(lock, flags);
	if (list_empty(stripe_list))
		list_add_tail(stripe_list, sc_list);
	spin_unlock_irqrestore(lock, flags);

	wake_do_raid(RS(sc)); /* Wake myself. */
}

/* Pop a stripe safely off the endio list. */
static struct stripe *stripe_endio_pop(struct stripe_cache *sc)
{
	struct stripe *stripe;
	spinlock_t *lock = sc->locks + LOCK_ENDIO;

	/* This runs in parallel with endio(). */
	spin_lock_irq(lock);
	POP_LIST(LIST_ENDIO);
	spin_unlock_irq(lock);

	return stripe;
}
/*
 * Stripe cache locking functions
 */

/* Dummy lock function for single host RAID4+5. */
static void *no_lock(sector_t key, enum dm_lock_type type)
{
	return &no_lock;
}

/* Dummy unlock function for single host RAID4+5. */
static void no_unlock(void *lock_handle)
{
}

/* No locking (for single host RAID 4+5). */
static struct dm_raid45_locking_type locking_none = {
	.lock = no_lock,
	.unlock = no_unlock,
};

/* Lock a stripe (for clustering). */
static int
stripe_lock(struct stripe *stripe, int rw, sector_t key)
{
	stripe->lock = RS(stripe->sc)->locking->lock(key, rw == READ ?
						     DM_RAID45_SHARED :
						     DM_RAID45_EX);
	return stripe->lock ? 0 : -EPERM;
}

/* Unlock a stripe (for clustering). */
static void stripe_unlock(struct stripe *stripe)
{
	RS(stripe->sc)->locking->unlock(stripe->lock);
	stripe->lock = NULL;
}
/* Test io pending on stripe. */
static int stripe_io_ref(struct stripe *stripe)
{
	return atomic_read(&stripe->io.pending);
}

static void stripe_io_get(struct stripe *stripe)
{
	if (atomic_inc_return(&stripe->io.pending) == 1)
		/* REMOVEME: statistics */
		atomic_inc(&stripe->sc->active_stripes);

	BUG_ON(stripe_io_ref(stripe) < 0);
}

static void stripe_io_put(struct stripe *stripe)
{
	if (atomic_dec_and_test(&stripe->io.pending)) {
		if (unlikely(StripeRecover(stripe)))
			/* Don't put recovery stripe on endio list. */
			wake_do_raid(RS(stripe->sc));
		else
			/* Add regular stripe to endio list and wake daemon. */
			stripe_endio_push(stripe);

		/* REMOVEME: statistics */
		atomic_dec(&stripe->sc->active_stripes);
	}

	BUG_ON(stripe_io_ref(stripe) < 0);
}
/* Take stripe reference out. */
static int stripe_get(struct stripe *stripe)
{
	int r;
	struct list_head *lh = stripe->lists + LIST_LRU;
	spinlock_t *lock = stripe->sc->locks + LOCK_LRU;

	/* Delete stripe from LRU (inactive) list if on. */
	spin_lock_irq(lock);
	DEL_LIST(lh);
	spin_unlock_irq(lock);

	BUG_ON(stripe_ref(stripe) < 0);

	/* Lock stripe on first reference */
	r = (atomic_inc_return(&stripe->cnt) == 1) ?
	    stripe_lock(stripe, WRITE, stripe->key) : 0;

	return r;
}

/* Return references on a chunk. */
static int chunk_ref(struct stripe_chunk *chunk)
{
	return atomic_read(&chunk->cnt);
}

/* Take out reference on a chunk. */
static int chunk_get(struct stripe_chunk *chunk)
{
	return atomic_inc_return(&chunk->cnt);
}

/* Drop reference on a chunk. */
static void chunk_put(struct stripe_chunk *chunk)
{
	BUG_ON(atomic_dec_return(&chunk->cnt) < 0);
}
/*
 * Drop reference on a stripe.
 *
 * Move it to list of LRU stripes if zero.
 */
static void stripe_put(struct stripe *stripe)
{
	if (atomic_dec_and_test(&stripe->cnt)) {
		BUG_ON(stripe_io_ref(stripe));
		stripe_unlock(stripe);
		stripe_lru_add(stripe);
	}

	BUG_ON(stripe_ref(stripe) < 0);
}

/* Helper needed by for_each_io_dev(). */
static void stripe_get_references(struct stripe *stripe, unsigned p)
{
	/*
	 * Another one to reference the stripe in
	 * order to protect vs. LRU list moves.
	 */
	io_get(RS(stripe->sc));	/* Global io references. */
	stripe_get(stripe);
	stripe_io_get(stripe);	/* One for each chunk io. */
}

/* Helper for endio() to put all taken references. */
static void stripe_put_references(struct stripe *stripe)
{
	stripe_io_put(stripe);	/* One for each chunk io. */
	stripe_put(stripe);
	io_put(RS(stripe->sc));
}
/*
 * Stripe cache functions.
 */

/*
 * Invalidate all chunks (i.e. their pages) of a stripe.
 *
 * I only keep state for the whole chunk.
 */
static inline void stripe_chunk_invalidate(struct stripe_chunk *chunk)
{
	chunk->io.flags = 0;
}

static void
stripe_chunks_invalidate(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--)
		stripe_chunk_invalidate(CHUNK(stripe, p));
}

/* Prepare stripe for (re)use. */
static void stripe_invalidate(struct stripe *stripe)
{
	stripe->io.flags = 0;
	stripe->idx.parity = stripe->idx.recover = -1;
	stripe_chunks_invalidate(stripe);
}

/*
 * Allow io on all chunks of a stripe.
 * If not set, IO will not occur; i.e. it's prohibited.
 *
 * Actual IO submission for allowed chunks depends
 * on their !uptodate or dirty state.
 */
static void stripe_allow_io(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;

	while (p--)
		SetChunkIo(CHUNK(stripe, p));
}
/* Initialize a stripe. */
static void stripe_init(struct stripe_cache *sc, struct stripe *stripe)
{
	unsigned i, p = RS(sc)->set.raid_devs;

	/* Work all io chunks. */
	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);

		atomic_set(&chunk->cnt, 0);
		chunk->stripe = stripe;
		i = ARRAY_SIZE(chunk->bl);
		while (i--)
			bio_list_init(chunk->bl + i);
	}

	stripe->sc = sc;

	i = ARRAY_SIZE(stripe->lists);
	while (i--)
		INIT_LIST_HEAD(stripe->lists + i);

	stripe->io.size = RS(sc)->set.io_size;
	atomic_set(&stripe->cnt, 0);
	atomic_set(&stripe->io.pending, 0);
	stripe_invalidate(stripe);
}

/* Number of pages per chunk. */
static inline unsigned chunk_pages(unsigned sectors)
{
	return dm_div_up(sectors, SECTORS_PER_PAGE);
}

/* Number of pages per stripe. */
static inline unsigned stripe_pages(struct raid_set *rs, unsigned io_size)
{
	return chunk_pages(io_size) * rs->set.raid_devs;
}
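/*
 * Worked example (assuming 4 KiB pages, i.e. SECTORS_PER_PAGE == 8):
 * with the default io size of IO_SIZE_DEFAULT = 8 sectors,
 * chunk_pages(8) = 1 page per chunk, so a 5 device set needs
 * stripe_pages() = 1 * 5 = 5 pages per stripe.
 */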
/* Initialize part of page_list (recovery). */
static void stripe_zero_pl_part(struct stripe *stripe, int p,
				unsigned start, unsigned count)
{
	unsigned o = start / SECTORS_PER_PAGE, pages = chunk_pages(count);
	/* Get offset into the page_list. */
	struct page_list *pl = pl_elem(PL(stripe, p), o);

	BUG_ON(!pl);
	while (pl && pages--) {
		BUG_ON(!pl->page);
		memset(page_address(pl->page), 0, PAGE_SIZE);
		pl = pl->next;
	}
}

/* Initialize parity chunk of stripe. */
static void stripe_zero_chunk(struct stripe *stripe, int p)
{
	if (p > -1)
		stripe_zero_pl_part(stripe, p, 0, stripe->io.size);
}

/* Return dynamic stripe structure size. */
static size_t stripe_size(struct raid_set *rs)
{
	return sizeof(struct stripe) +
	       rs->set.raid_devs * sizeof(struct stripe_chunk);
}
/* Allocate a stripe and its memory object. */
/* XXX adjust to cope with stripe cache and recovery stripe caches. */
enum grow { SC_GROW, SC_KEEP };
static struct stripe *stripe_alloc(struct stripe_cache *sc,
				   struct dm_mem_cache_client *mc,
				   enum grow grow)
{
	int r;
	struct stripe *stripe;

	stripe = kmem_cache_zalloc(sc->kc.cache, GFP_KERNEL);
	if (stripe) {
		/* Grow the dm-mem-cache by one object. */
		if (grow == SC_GROW) {
			r = dm_mem_cache_grow(mc, 1);
			if (r)
				goto err_free;
		}

		stripe->obj = dm_mem_cache_alloc(mc);
		if (IS_ERR(stripe->obj))
			goto err_shrink;

		stripe_init(sc, stripe);
	}

	return stripe;

err_shrink:
	if (grow == SC_GROW)
		dm_mem_cache_shrink(mc, 1);
err_free:
	kmem_cache_free(sc->kc.cache, stripe);
	return NULL;
}

/*
 * Free a stripe's memory object, shrink the
 * memory cache and free the stripe itself.
 */
static void stripe_free(struct stripe *stripe, struct dm_mem_cache_client *mc)
{
	dm_mem_cache_free(mc, stripe->obj);
	dm_mem_cache_shrink(mc, 1);
	kmem_cache_free(stripe->sc->kc.cache, stripe);
}
/* Free the recovery stripe. */
static void stripe_recover_free(struct raid_set *rs)
{
	struct recover *rec = &rs->recover;
	struct dm_mem_cache_client *mc;

	mc = rec->mem_cache_client;
	rec->mem_cache_client = NULL;
	if (mc) {
		struct stripe *stripe;

		while (!list_empty(&rec->stripes)) {
			stripe = list_first_entry(&rec->stripes, struct stripe,
						  lists[LIST_RECOVER]);
			list_del(stripe->lists + LIST_RECOVER);
			kfree(stripe->recover);
			stripe_free(stripe, mc);
		}

		dm_mem_cache_client_destroy(mc);
		dm_io_client_destroy(rec->dm_io_client);
		rec->dm_io_client = NULL;
	}
}
/* Grow stripe cache. */
static int sc_grow(struct stripe_cache *sc, unsigned stripes, enum grow grow)
{
	int r = 0;

	/* Try to allocate this many (additional) stripes. */
	while (stripes--) {
		struct stripe *stripe =
			stripe_alloc(sc, sc->mem_cache_client, grow);

		if (likely(stripe)) {
			stripe_lru_add(stripe);
			atomic_inc(&sc->stripes);
		} else {
			r = -ENOMEM;
			break;
		}
	}

	return r ? r : sc_hash_resize(sc);
}

/* Shrink stripe cache. */
static int sc_shrink(struct stripe_cache *sc, unsigned stripes)
{
	int r = 0;

	/* Try to get unused stripe from LRU list. */
	while (stripes--) {
		struct stripe *stripe;

		stripe = stripe_lru_pop(sc);
		if (stripe) {
			/* An LRU stripe may never have ios pending! */
			BUG_ON(stripe_io_ref(stripe));
			BUG_ON(stripe_ref(stripe));
			atomic_dec(&sc->stripes);
			/* Remove from hash if on before deletion. */
			stripe_hash_del(stripe);
			stripe_free(stripe, sc->mem_cache_client);
		} else {
			r = -ENOENT;
			break;
		}
	}

	/* Check if stats are still sane. */
	if (atomic_read(&sc->active_stripes_max) >
	    atomic_read(&sc->stripes))
		atomic_set(&sc->active_stripes_max, 0);

	if (r)
		return r;

	return atomic_read(&sc->stripes) ? sc_hash_resize(sc) : 0;
}
/* Create stripe cache and recovery. */
static int sc_init(struct raid_set *rs, unsigned stripes)
{
	unsigned i, r, rstripes;
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;
	struct recover *rec = &rs->recover;
	struct mapped_device *md;
	struct gendisk *disk;

	/* Initialize lists and locks. */
	i = ARRAY_SIZE(sc->lists);
	while (i--)
		INIT_LIST_HEAD(sc->lists + i);

	INIT_LIST_HEAD(&rec->stripes);

	/* Initialize endio and LRU list locks. */
	i = NR_LOCKS;
	while (i--)
		spin_lock_init(sc->locks + i);

	/* Initialize atomic variables. */
	atomic_set(&sc->stripes, 0);
	atomic_set(&sc->stripes_to_set, 0);
	atomic_set(&sc->active_stripes, 0);
	atomic_set(&sc->active_stripes_max, 0);	/* REMOVEME: statistics. */

	/*
	 * We need a runtime unique # to suffix the kmem cache name
	 * because we'll have one for each active RAID set.
	 */
	md = dm_table_get_md(rs->ti->table);
	disk = dm_disk(md);
	sprintf(sc->kc.name, "%s-%d", TARGET, disk->first_minor);
	dm_put(md);
	sc->kc.cache = kmem_cache_create(sc->kc.name, stripe_size(rs),
					 0, 0, NULL);
	if (!sc->kc.cache)
		return -ENOMEM;

	/* Create memory cache client context for RAID stripe cache. */
	sc->mem_cache_client =
		dm_mem_cache_client_create(stripes, rs->set.raid_devs,
					   chunk_pages(rs->set.io_size));
	if (IS_ERR(sc->mem_cache_client))
		return PTR_ERR(sc->mem_cache_client);

	/* Create memory cache client context for RAID recovery stripe(s). */
	rstripes = rec->recovery_stripes;
	rec->mem_cache_client =
		dm_mem_cache_client_create(rstripes, rs->set.raid_devs,
					   chunk_pages(rec->io_size));
	if (IS_ERR(rec->mem_cache_client))
		return PTR_ERR(rec->mem_cache_client);

	/* Create dm-io client context for IO stripes. */
	sc->dm_io_client =
		dm_io_client_create((stripes > 32 ? 32 : stripes) *
				    rs->set.raid_devs *
				    chunk_pages(rs->set.io_size));
	if (IS_ERR(sc->dm_io_client))
		return PTR_ERR(sc->dm_io_client);

	/* FIXME: intermingled with stripe cache initialization. */
	/* Create dm-io client context for recovery stripes. */
	rec->dm_io_client =
		dm_io_client_create(rstripes * rs->set.raid_devs *
				    chunk_pages(rec->io_size));
	if (IS_ERR(rec->dm_io_client))
		return PTR_ERR(rec->dm_io_client);

	/* Allocate stripes for set recovery. */
	while (rstripes--) {
		stripe = stripe_alloc(sc, rec->mem_cache_client, SC_KEEP);
		if (!stripe)
			return -ENOMEM;

		stripe->recover = kzalloc(sizeof(*stripe->recover), GFP_KERNEL);
		if (!stripe->recover) {
			stripe_free(stripe, rec->mem_cache_client);
			return -ENOMEM;
		}

		SetStripeRecover(stripe);
		stripe->io.size = rec->io_size;
		list_add_tail(stripe->lists + LIST_RECOVER, &rec->stripes);
		/* Don't add recovery stripes to LRU list! */
	}

	/*
	 * Allocate the stripe objects from the
	 * cache and add them to the LRU list.
	 */
	r = sc_grow(sc, stripes, SC_KEEP);
	if (!r)
		atomic_set(&sc->stripes_last, stripes);

	return r;
}
/* Destroy the stripe cache. */
static void sc_exit(struct stripe_cache *sc)
{
	struct raid_set *rs = RS(sc);

	if (sc->kc.cache) {
		stripe_recover_free(rs);
		BUG_ON(sc_shrink(sc, atomic_read(&sc->stripes)));
		kmem_cache_destroy(sc->kc.cache);
		sc->kc.cache = NULL;
	}

	if (sc->mem_cache_client && !IS_ERR(sc->mem_cache_client))
		dm_mem_cache_client_destroy(sc->mem_cache_client);

	if (sc->dm_io_client && !IS_ERR(sc->dm_io_client))
		dm_io_client_destroy(sc->dm_io_client);

	hash_exit(&sc->hash);
}
/*
 * Calculate RAID address
 *
 * Delivers tuple with the index of the data disk holding the chunk
 * in the set, the parity disk's index and the start of the stripe
 * within the address space of the set (used as the stripe cache hash key).
 */
static struct raid_address *raid_address(struct raid_set *rs, sector_t sector,
					 struct raid_address *addr)
{
	sector_t stripe, tmp;

	/*
	 * chunk_number = sector / chunk_size
	 * stripe_number = chunk_number / data_devs
	 * di = stripe % data_devs;
	 */
	stripe = sector >> rs->set.chunk_shift;
	addr->di = sector_div(stripe, rs->set.data_devs);

	switch (rs->set.raid_type->level) {
	case raid4:
		addr->pi = rs->set.pi;
		goto check_shift_di;
	case raid5:
		tmp = stripe;
		addr->pi = sector_div(tmp, rs->set.raid_devs);

		switch (rs->set.raid_type->algorithm) {
		case left_asym:		/* Left asymmetric. */
			addr->pi = rs->set.data_devs - addr->pi;
		case right_asym:	/* Right asymmetric. */
check_shift_di:
			if (addr->di >= addr->pi)
				addr->di++;
			break;
		case left_sym:		/* Left symmetric. */
			addr->pi = rs->set.data_devs - addr->pi;
		case right_sym:		/* Right symmetric. */
			addr->di = (addr->pi + addr->di + 1) %
				   rs->set.raid_devs;
			break;
		case none: /* Ain't happen: RAID4 algorithm placeholder. */
			BUG();
		}
	}

	/*
	 * Start offset of the stripe's chunk on any single device of the RAID
	 * set, adjusted in case io size differs from chunk size.
	 */
	addr->key = (stripe << rs->set.chunk_shift) +
		    (sector & rs->set.io_inv_mask);
	return addr;
}
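/*
 * Worked example (illustrative): a "raid5_la" set with 4 devices (3
 * data + 1 parity), chunk_size = 64 sectors (chunk_shift = 6), sector
 * 1000, assuming io size equals chunk size (io_inv_mask term is 0):
 *
 *	chunk number	= 1000 >> 6 = 15
 *	stripe		= 15 / 3 = 5,	di = 15 % 3 = 0
 *	pi		= 5 % 4 = 1,	left_asym -> pi = 3 - 1 = 2
 *	di stays 0 (0 < pi),		key = 5 << 6 = 320
 */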
/*
 * Copy data across between stripe pages and bio vectors.
 *
 * Pay attention to data alignment in stripe and bio pages.
 */
static void bio_copy_page_list(int rw, struct stripe *stripe,
			       struct page_list *pl, struct bio *bio)
{
	unsigned i, page_offset;
	void *page_addr;
	struct raid_set *rs = RS(stripe->sc);
	struct bio_vec *bv;

	/* Get start page in page list for this sector. */
	i = (bio->bi_sector & rs->set.io_mask) / SECTORS_PER_PAGE;
	pl = pl_elem(pl, i);
	BUG_ON(!pl);

	page_addr = page_address(pl->page);
	page_offset = to_bytes(bio->bi_sector & (SECTORS_PER_PAGE - 1));

	/* Walk all segments and copy data across between bio_vecs and pages. */
	bio_for_each_segment(bv, bio, i) {
		int len = bv->bv_len, size;
		unsigned bio_offset = 0;
		void *bio_addr = __bio_kmap_atomic(bio, i, KM_USER0);

		while (len) {
			size = (page_offset + len > PAGE_SIZE) ?
			       PAGE_SIZE - page_offset : len;

			if (rw == READ)
				memcpy(bio_addr + bio_offset,
				       page_addr + page_offset, size);
			else
				memcpy(page_addr + page_offset,
				       bio_addr + bio_offset, size);

			page_offset += size;
			bio_offset += size;
			len -= size;

			if (page_offset == PAGE_SIZE && len) {
				/*
				 * We reached the end of the chunk page ->
				 * need to refer to the next one to copy
				 * more data.
				 */
				/* Get next page. */
				pl = pl->next;
				BUG_ON(!pl);
				page_addr = page_address(pl->page);
				page_offset = 0;

				/* REMOVEME: statistics. */
				atomic_inc(rs->stats + S_BIO_COPY_PL_NEXT);
			}
		}

		__bio_kunmap_atomic(bio_addr, KM_USER0);
	}
}
/*
 * Xor optimization macros.
 */
/* Xor data pointer declaration and initialization macros. */
#define DECLARE_2	unsigned long *d0 = data[0], *d1 = data[1]
#define DECLARE_3	DECLARE_2, *d2 = data[2]
#define DECLARE_4	DECLARE_3, *d3 = data[3]
#define DECLARE_5	DECLARE_4, *d4 = data[4]
#define DECLARE_6	DECLARE_5, *d5 = data[5]
#define DECLARE_7	DECLARE_6, *d6 = data[6]
#define DECLARE_8	DECLARE_7, *d7 = data[7]

/* Xor unroll macros. */
#define D2(n)	d0[n] = d0[n] ^ d1[n]
#define D3(n)	D2(n) ^ d2[n]
#define D4(n)	D3(n) ^ d3[n]
#define D5(n)	D4(n) ^ d4[n]
#define D6(n)	D5(n) ^ d5[n]
#define D7(n)	D6(n) ^ d6[n]
#define D8(n)	D7(n) ^ d7[n]

#define	X_2(macro, offset)	macro(offset); macro(offset + 1);
#define	X_4(macro, offset)	X_2(macro, offset); X_2(macro, offset + 2);
#define	X_8(macro, offset)	X_4(macro, offset); X_4(macro, offset + 4);
#define	X_16(macro, offset)	X_8(macro, offset); X_8(macro, offset + 8);
#define	X_32(macro, offset)	X_16(macro, offset); X_16(macro, offset + 16);
#define	X_64(macro, offset)	X_32(macro, offset); X_32(macro, offset + 32);
/* Define a _xor_#chunks_#xors_per_run() function. */
#define	_XOR(chunks, xors_per_run) \
static void _xor ## chunks ## _ ## xors_per_run(unsigned long **data) \
{ \
	unsigned end = XOR_SIZE / sizeof(data[0]), i; \
	DECLARE_ ## chunks; \
\
	for (i = 0; i < end; i += xors_per_run) { \
		X_ ## xors_per_run(D ## chunks, i); \
	} \
}

/* Define xor functions for 2 - 8 chunks and xors per run. */
#define	MAKE_XOR_PER_RUN(xors_per_run) \
	_XOR(2, xors_per_run); _XOR(3, xors_per_run); \
	_XOR(4, xors_per_run); _XOR(5, xors_per_run); \
	_XOR(6, xors_per_run); _XOR(7, xors_per_run); \
	_XOR(8, xors_per_run);

MAKE_XOR_PER_RUN(8)	/* Define _xor_*_8() functions. */
MAKE_XOR_PER_RUN(16)	/* Define _xor_*_16() functions. */
MAKE_XOR_PER_RUN(32)	/* Define _xor_*_32() functions. */
MAKE_XOR_PER_RUN(64)	/* Define _xor_*_64() functions. */
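/*
 * Sketch of what e.g. _XOR(2, 8) above expands to (paraphrased, not
 * literal preprocessor output):
 *
 *	static void _xor2_8(unsigned long **data)
 *	{
 *		unsigned end = XOR_SIZE / sizeof(data[0]), i;
 *		unsigned long *d0 = data[0], *d1 = data[1];
 *
 *		for (i = 0; i < end; i += 8) {
 *			d0[i] = d0[i] ^ d1[i];
 *			d0[i + 1] = d0[i + 1] ^ d1[i + 1];
 *			... six more unrolled D2() statements via X_8() ...
 *		}
 *	}
 */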
#define MAKE_XOR(xors_per_run) \
struct { \
	void (*f)(unsigned long **); \
} static xor_funcs ## xors_per_run[] = { \
	{ NULL }, /* NULL pointers to optimize indexing in xor(). */ \
	{ NULL }, \
	{ _xor2_ ## xors_per_run }, \
	{ _xor3_ ## xors_per_run }, \
	{ _xor4_ ## xors_per_run }, \
	{ _xor5_ ## xors_per_run }, \
	{ _xor6_ ## xors_per_run }, \
	{ _xor7_ ## xors_per_run }, \
	{ _xor8_ ## xors_per_run }, \
}; \
\
static void xor_ ## xors_per_run(unsigned n, unsigned long **data) \
{ \
	/* Call respective function for amount of chunks. */ \
	xor_funcs ## xors_per_run[n].f(data); \
}

/* Define xor_8() - xor_64() functions. */
MAKE_XOR(8)
MAKE_XOR(16)
MAKE_XOR(32)
MAKE_XOR(64)

/* Maximum number of chunks, which can be xor'ed in one go. */
#define	XOR_CHUNKS_MAX	(ARRAY_SIZE(xor_funcs8) - 1)

static void xor_blocks_wrapper(unsigned n, unsigned long **data)
{
	BUG_ON(n < 2 || n > MAX_XOR_BLOCKS + 1);
	xor_blocks(n - 1, XOR_SIZE, (void *) data[0], (void **) data + 1);
}

struct xor_func {
	xor_function_t f;
	const char *name;
} static xor_funcs[] = {
	{ xor_8, "xor_8" },
	{ xor_16, "xor_16" },
	{ xor_32, "xor_32" },
	{ xor_64, "xor_64" },
	{ xor_blocks_wrapper, "xor_blocks" },
};
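/*
 * The fastest entry of this table is determined at run time (see the
 * "run time optimization of xor algorithm" note in the file header)
 * and cached in rs->xor.f, so xor() below can dispatch through
 * rs->xor.f->f() without any further selection logic per call.
 */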
/*
 * Check if chunk has to be xored in/out:
 *
 *	o if writes are queued
 *	o if writes are merged
 *	o if stripe is to be reconstructed
 *	o if recovery stripe
 */
static inline int chunk_must_xor(struct stripe_chunk *chunk)
{
	if (ChunkUptodate(chunk)) {
		BUG_ON(!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) &&
		       !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)));

		if (!bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)) ||
		    !bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED)))
			return 1;

		if (StripeReconstruct(chunk->stripe) ||
		    StripeRecover(chunk->stripe))
			return 1;
	}

	return 0;
}

/*
 * Xor all chunks of a stripe into the parity (or recovery) chunk.
 *
 * This indexes into the chunks of a stripe and their pages.
 *
 * All chunks will be xored into the indexed (@pi)
 * chunk in maximum groups of xor.chunks.
 */
static void xor(struct stripe *stripe, unsigned pi, unsigned sector)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned max_chunks = rs->xor.chunks, n = 1,
		 o = sector / SECTORS_PER_PAGE, /* Offset into the page_list. */
		 p = rs->set.raid_devs;
	unsigned long **d = rs->data;
	xor_function_t xor_f = rs->xor.f->f;

	BUG_ON(sector > stripe->io.size);

	/* Address of parity page to xor into. */
	d[0] = page_address(pl_elem(PL(stripe, pi), o)->page);

	while (p--) {
		/* Preset pointers to data pages. */
		if (p != pi && chunk_must_xor(CHUNK(stripe, p)))
			d[n++] = page_address(pl_elem(PL(stripe, p), o)->page);

		/* If max chunks -> xor. */
		if (n == max_chunks) {
			xor_f(n, d);
			n = 1;
		}
	}

	/* If chunks -> xor. */
	if (n > 1)
		xor_f(n, d);
}
1927 static void common_xor(struct stripe *stripe, sector_t count,
1928 unsigned off, unsigned pi)
1933 for (sector = off; sector < count; sector += SECTORS_PER_PAGE)
1934 xor(stripe, pi, sector);
1936 /* Set parity page uptodate and clean. */
1937 chunk_set(CHUNK(stripe, pi), CLEAN);
1938 atomic_inc(RS(stripe->sc)->stats + S_XORS); /* REMOVEME: statistics. */
/*
 * Calculate parity sectors on intact stripes.
 *
 * Need to calculate raid address for recover stripe, because its
 * chunk size differs and is typically larger than io chunk size.
 */
static void parity_xor(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned chunk_size = rs->set.chunk_size, io_size = stripe->io.size,
		 xor_size = chunk_size > io_size ? io_size : chunk_size;
	sector_t off;

	/* This can be the recover stripe with a larger io size. */
	for (off = 0; off < io_size; off += xor_size) {
		/*
		 * Recover stripe is likely bigger than regular io
		 * ones and has no precalculated parity disk index ->
		 * need to calculate RAID address.
		 */
		if (unlikely(StripeRecover(stripe))) {
			struct raid_address addr;

			raid_address(rs,
				     (stripe->key + off) * rs->set.data_devs,
				     &addr);
			stripe->idx.parity = addr.pi;
			stripe_zero_pl_part(stripe, addr.pi, off, xor_size);
		}

		common_xor(stripe, xor_size, off, stripe->idx.parity);
		chunk_set(CHUNK(stripe, stripe->idx.parity), DIRTY);
	}
}
/* Reconstruct missing chunk. */
static void stripe_reconstruct(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	int p = rs->set.raid_devs, pr = stripe->idx.recover;

	BUG_ON(pr < 0);

	/* Check if all but the chunk to be reconstructed are uptodate. */
	while (p--)
		BUG_ON(p != pr && !ChunkUptodate(CHUNK(stripe, p)));

	/* REMOVEME: statistics. */
	atomic_inc(rs->stats + (RSDegraded(rs) ? S_RECONSTRUCT_EI :
						 S_RECONSTRUCT_DEV));
	/* Zero chunk to be reconstructed. */
	stripe_zero_chunk(stripe, pr);
	common_xor(stripe, stripe->io.size, 0, pr);
	stripe->idx.recover = -1;
}
/*
 * Recovery io throttling
 */
/* Conditionally reset io counters. */
static int recover_io_reset(struct raid_set *rs)
{
	unsigned long j = jiffies;

	/* Pay attention to jiffies overflows. */
	if (j > rs->recover.last_jiffies + HZ / 20 ||
	    j < rs->recover.last_jiffies) {
		atomic_set(rs->recover.io_count + IO_WORK, 0);
		atomic_set(rs->recover.io_count + IO_RECOVER, 0);
		rs->recover.last_jiffies = j;
		return 1;
	}

	return 0;
}

/* Count ios, distinguishing recovery and regular io. */
static void recover_io_count(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);

	recover_io_reset(rs);
	atomic_inc(rs->recover.io_count +
		   (StripeRecover(stripe) ? IO_RECOVER : IO_WORK));
}
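/*
 * Sketch of the resulting throttling window (the consumer of these
 * counters is outside this excerpt, so the comparison shown is an
 * assumption): the counters are zeroed at most every HZ/20 jiffies
 * (50 ms), and within such a window the daemon can compare
 *
 *	atomic_read(rs->recover.io_count + IO_RECOVER)
 *
 * against the regular IO_WORK count scaled by the bandwidth factor
 * (rs->recover.bandwidth_work) to decide whether more recovery io
 * may be submitted.
 */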
/* Try getting a stripe either from the hash or from the LRU list. */
static struct stripe *stripe_find(struct raid_set *rs,
				  struct raid_address *addr)
{
	int r;
	struct stripe_cache *sc = &rs->sc;
	struct stripe *stripe;

	/* Try stripe from hash. */
	stripe = stripe_lookup(sc, addr->key);
	if (stripe) {
		r = stripe_get(stripe);
		if (r)
			goto get_lock_failed;

		atomic_inc(rs->stats + S_HITS_1ST); /* REMOVEME: statistics. */
	} else {
		/* Not in hash -> try to get an LRU stripe. */
		stripe = stripe_lru_pop(sc);
		if (stripe) {
			/*
			 * An LRU stripe may not be referenced
			 * and may never have ios pending!
			 */
			BUG_ON(stripe_ref(stripe));
			BUG_ON(stripe_io_ref(stripe));

			/* Remove from hash if on before reuse. */
			stripe_hash_del(stripe);

			/* Invalidate before reinserting with changed key. */
			stripe_invalidate(stripe);

			stripe->key = addr->key;
			stripe->region = dm_rh_sector_to_region(rs->recover.rh,
								addr->key);
			stripe->idx.parity = addr->pi;
			r = stripe_get(stripe);
			if (r)
				goto get_lock_failed;

			/* Insert stripe into the stripe hash. */
			stripe_insert(&sc->hash, stripe);
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_INSCACHE);
		}
	}

	return stripe;

get_lock_failed:
	stripe_put(stripe);
	return NULL;
}

/*
 * Process end ios.
 *
 * I need to do it here because I can't in interrupt
 * context.
 */
/* End io all bios on a bio list. */
static void bio_list_endio(struct stripe *stripe, struct bio_list *bl,
			   int p, int error)
{
	struct raid_set *rs = RS(stripe->sc);
	struct bio *bio;
	struct page_list *pl = PL(stripe, p);
	struct stripe_chunk *chunk = CHUNK(stripe, p);

	/* Update region counters. */
	while ((bio = bio_list_pop(bl))) {
		if (bio_data_dir(bio) == WRITE)
			/* Drop io pending count for any writes. */
			dm_rh_dec(rs->recover.rh, stripe->region);
		else if (!error)
			/* Copy data across. */
			bio_copy_page_list(READ, stripe, pl, bio);

		bio_endio(bio, error);

		/* REMOVEME: statistics. */
		atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
			   S_BIOS_ENDIO_READ : S_BIOS_ENDIO_WRITE));

		chunk_put(chunk);
		stripe_put(stripe);
		io_put(rs);	/* Wake any suspend waiters on last bio. */
	}
}
/*
 * End io all reads/writes on a stripe copying
 * read data across from stripe to bios and
 * decrementing region counters for writes.
 *
 * Processing of ios depending on state:
 *	o no chunk error -> endio ok
 *	o degraded:
 *	  - chunk error and read -> ignore to be requeued
 *	  - chunk error and write -> endio ok
 *	o dead (more than parity_devs failed) and chunk_error -> endio failed
 */
static void stripe_endio(int rw, struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;
	int write = (rw != READ);

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		struct bio_list *bl;

		BUG_ON(ChunkLocked(chunk));

		bl = BL_CHUNK(chunk, rw);
		if (bio_list_empty(bl))
			continue;

		if (unlikely(ChunkError(chunk) || !ChunkUptodate(chunk))) {
			/* RAID set dead. */
			if (unlikely(RSDead(rs)))
				bio_list_endio(stripe, bl, p, -EIO);
			/* RAID set degraded. */
			else if (write)
				bio_list_endio(stripe, bl, p, 0);
		} else {
			BUG_ON(!RSDegraded(rs) && ChunkDirty(chunk));
			bio_list_endio(stripe, bl, p, 0);
		}
	}
}
/* Fail all ios hanging off all bio lists of a stripe. */
static void stripe_fail_io(struct stripe *stripe)
{
	struct raid_set *rs = RS(stripe->sc);
	unsigned p = rs->set.raid_devs;

	while (p--) {
		struct stripe_chunk *chunk = CHUNK(stripe, p);
		int i = ARRAY_SIZE(chunk->bl);

		/* Fail all bios on all bio lists of the stripe. */
		while (i--) {
			struct bio_list *bl = chunk->bl + i;

			if (!bio_list_empty(bl))
				bio_list_endio(stripe, bl, p, -EIO);
		}
	}

	/* Put stripe on LRU list. */
	BUG_ON(stripe_io_ref(stripe));
	BUG_ON(stripe_ref(stripe));
	stripe_lru_add(stripe);
}

/* Unlock all required chunks. */
static void stripe_chunks_unlock(struct stripe *stripe)
{
	unsigned p = RS(stripe->sc)->set.raid_devs;
	struct stripe_chunk *chunk;

	while (p--) {
		chunk = CHUNK(stripe, p);

		if (TestClearChunkUnlock(chunk))
			ClearChunkLocked(chunk);
	}
}
/*
 * Queue reads and writes to a stripe by hanging
 * their bios off the stripe's read/write lists.
 */
static int stripe_queue_bio(struct raid_set *rs, struct bio *bio,
			    struct bio_list *reject)
{
	struct raid_address addr;
	struct stripe *stripe;

	stripe = stripe_find(rs, raid_address(rs, bio->bi_sector, &addr));
	if (stripe) {
		int r = 0, rw = bio_data_dir(bio);

		/* Distinguish reads and writes. */
		bio_list_add(BL(stripe, addr.di, rw), bio);

		if (rw == READ)
			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_READ);
		else {
			/* Increment pending write count on region. */
			dm_rh_inc(rs->recover.rh, stripe->region);
			r = 1;

			/* REMOVEME: statistics. */
			atomic_inc(rs->stats + S_BIOS_ADDED_WRITE);
		}

		/*
		 * Put on io (flush) list in case of
		 * initial bio queued to chunk.
		 */
		if (chunk_get(CHUNK(stripe, addr.di)) == 1)
			stripe_flush_add(stripe);

		return r;
	}

	/* Got no stripe from cache or failed to lock it -> reject bio. */
	bio_list_add(reject, bio);
	atomic_inc(rs->stats + S_IOS_POST); /* REMOVEME: statistics. */
	return 0;
}
/*
 * Handle all stripes by handing them to the daemon, because we can't
 * map their chunk pages to copy the data in interrupt context.
 *
 * We don't want to handle them here either, while interrupts are disabled.
 */

/* Read/write endio function for dm-io (interrupt context). */
static void endio(unsigned long error, void *context)
{
	struct stripe_chunk *chunk = context;

	if (unlikely(error)) {
		chunk_set(chunk, ERROR);
		/* REMOVEME: statistics. */
		atomic_inc(RS(chunk->stripe->sc)->stats + S_STRIPE_ERROR);
	} else
		chunk_set(chunk, CLEAN);

	/*
	 * For recovery stripes, I need to reset the locked flag
	 * here, because those aren't processed in do_endios().
	 */
	if (unlikely(StripeRecover(chunk->stripe)))
		ClearChunkLocked(chunk);
	else
		SetChunkUnlock(chunk);

	/* Indirectly puts stripe on cache's endio list via stripe_io_put(). */
	stripe_put_references(chunk->stripe);
}
2274 /* Read/Write a chunk asynchronously. */
2275 static void stripe_chunk_rw(struct stripe *stripe, unsigned p)
2277 struct stripe_cache *sc = stripe->sc;
2278 struct raid_set *rs = RS(sc);
2279 struct dm_mem_cache_object *obj = stripe->obj + p;
2280 struct page_list *pl = obj->pl;
2281 struct stripe_chunk *chunk = CHUNK(stripe, p);
2282 struct raid_dev *dev = rs->dev + p;
2283 struct dm_io_region io = {
2284 .bdev = dev->dev->bdev,
2285 .sector = stripe->key,
2286 .count = stripe->io.size,
2288 struct dm_io_request control = {
2289 .bi_rw = ChunkDirty(chunk) ? WRITE : READ,
2291 .type = DM_IO_PAGE_LIST,
2299 .client = StripeRecover(stripe) ? rs->recover.dm_io_client :
2303 BUG_ON(ChunkLocked(chunk));
2304 BUG_ON(!ChunkUptodate(chunk) && ChunkDirty(chunk));
2305 BUG_ON(ChunkUptodate(chunk) && !ChunkDirty(chunk));
* Don't rw past the end of the device, which can happen
* because sectors_per_dev typically isn't divisible by io_size.
2311 if (unlikely(io.sector + io.count > rs->set.sectors_per_dev))
2312 io.count = rs->set.sectors_per_dev - io.sector;
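/*
 * Worked example (numbers assumed): with sectors_per_dev = 1000 and
 * io.count = 64, the last stripe io starts at sector 960, would reach
 * sector 1024 and is clamped to 1000 - 960 = 40 sectors.
 */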
2315 io.sector += dev->start; /* Add <offset>. */
2317 recover_io_count(stripe); /* Recovery io accounting. */
2319 /* REMOVEME: statistics. */
2320 atomic_inc(rs->stats + (ChunkDirty(chunk) ? S_DM_IO_WRITE :
2322 SetChunkLocked(chunk);
2323 SetDevIoQueued(dev);
2324 BUG_ON(dm_io(&control, 1, &io, NULL));
* Write dirty page lists or read non-uptodate ones for a stripe.
2330 static int stripe_chunks_rw(struct stripe *stripe)
2333 struct raid_set *rs = RS(stripe->sc);
* Increment the pending count on the stripe
* first, so that we don't race in endio().
*
* An inc (IO) is needed for any chunk unless !ChunkIo(chunk):
*
* o not uptodate
* o dirtied by writes merged
* o dirtied by parity calculations
2345 r = for_each_io_dev(stripe, stripe_get_references);
2347 /* Io needed: chunks are either not uptodate or dirty. */
2348 int max; /* REMOVEME: */
2349 struct stripe_cache *sc = &rs->sc;
2351 /* Submit actual io. */
2352 for_each_io_dev(stripe, stripe_chunk_rw);
2354 /* REMOVEME: statistics */
2355 max = sc_active(sc);
2356 if (atomic_read(&sc->active_stripes_max) < max)
2357 atomic_set(&sc->active_stripes_max, max);
2359 atomic_inc(rs->stats + S_FLUSHS);
2360 /* END REMOVEME: statistics */
/* Merge in all writes, hence dirtying the respective chunks. */
2367 static void stripe_merge_writes(struct stripe *stripe)
2369 unsigned p = RS(stripe->sc)->set.raid_devs;
2372 struct stripe_chunk *chunk = CHUNK(stripe, p);
2373 struct bio_list *write = BL_CHUNK(chunk, WRITE_QUEUED);
2375 if (!bio_list_empty(write)) {
2377 struct page_list *pl = stripe->obj[p].pl;
2380 * We can play with the lists without holding a lock,
2381 * because it is just us accessing them anyway.
2383 bio_list_for_each(bio, write)
2384 bio_copy_page_list(WRITE, stripe, pl, bio);
2386 bio_list_merge(BL_CHUNK(chunk, WRITE_MERGED), write);
2387 bio_list_init(write);
2388 chunk_set(chunk, DIRTY);
2393 /* Queue all writes to get merged. */
2394 static int stripe_queue_writes(struct stripe *stripe)
2397 unsigned p = RS(stripe->sc)->set.raid_devs;
2400 struct stripe_chunk *chunk = CHUNK(stripe, p);
2401 struct bio_list *write = BL_CHUNK(chunk, WRITE);
2403 if (!bio_list_empty(write)) {
2404 bio_list_merge(BL_CHUNK(chunk, WRITE_QUEUED), write);
2405 bio_list_init(write);
/* Check if a chunk gets completely overwritten. */
2416 static int stripe_check_chunk_overwrite(struct stripe *stripe, unsigned p)
2418 unsigned sectors = 0;
2420 struct bio_list *bl = BL(stripe, p, WRITE_QUEUED);
2422 bio_list_for_each(bio, bl)
2423 sectors += bio_sectors(bio);
2425 BUG_ON(sectors > RS(stripe->sc)->set.io_size);
2426 return sectors == RS(stripe->sc)->set.io_size;
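/*
 * Worked example (numbers assumed): with io_size = 64, two queued
 * 32-sector writes to a chunk sum up to 64 sectors, so the chunk
 * counts as completely overwritten and needs no prior read.
 */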
* Avoid io on a broken/reconstructed drive in order to
* reconstruct its data on endio.
2433 * (*1*) We set StripeReconstruct() in here, so that _do_endios()
2434 * will trigger a reconstruct call before resetting it.
2436 static int stripe_chunk_set_io_flags(struct stripe *stripe, int pr)
2438 struct stripe_chunk *chunk = CHUNK(stripe, pr);
2441 * Allow io on all chunks but the indexed one,
2442 * because we're either degraded or prohibit it
2443 * on the one for later reconstruction.
2445 /* Includes ClearChunkIo(), ClearChunkUptodate(). */
2446 stripe_chunk_invalidate(chunk);
2447 stripe->idx.recover = pr;
2448 SetStripeReconstruct(stripe);
2450 /* REMOVEME: statistics. */
2451 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2455 /* Chunk locked/uptodate and device failed tests. */
2456 static struct stripe_chunk *
2457 stripe_chunk_check(struct stripe *stripe, unsigned p, unsigned *chunks_uptodate)
2459 struct raid_set *rs = RS(stripe->sc);
2460 struct stripe_chunk *chunk = CHUNK(stripe, p);
2462 /* Can't access active chunks. */
2463 if (ChunkLocked(chunk)) {
2464 /* REMOVEME: statistics. */
2465 atomic_inc(rs->stats + S_CHUNK_LOCKED);
/* Can't access broken device. */
2470 if (ChunkError(chunk) || DevFailed(rs->dev + p))
2473 /* Can access uptodate chunks. */
2474 if (ChunkUptodate(chunk)) {
2475 (*chunks_uptodate)++;
* Degraded/reconstruction mode.
*
* Check stripe state to figure out which chunks don't need IO.
*
* Returns 0 for fully operational, -EPERM for degraded/resynchronizing.
2489 static int stripe_check_reconstruct(struct stripe *stripe)
2491 struct raid_set *rs = RS(stripe->sc);
2494 ClearStripeReconstruct(stripe);
2495 ClearStripeReconstructed(stripe);
2496 stripe_allow_io(stripe);
/* Avoid setting up reconstruction again when it is already set. */
2501 if (StripeReconstruct(stripe)) {
2502 /* REMOVEME: statistics. */
2503 atomic_inc(rs->stats + S_RECONSTRUCT_SET);
2507 /* Initially allow io on all chunks. */
2508 stripe_allow_io(stripe);
2510 /* Return if stripe is already reconstructed. */
2511 if (StripeReconstructed(stripe)) {
2512 atomic_inc(rs->stats + S_RECONSTRUCTED);
2517 * Degraded/reconstruction mode (device failed) ->
2518 * avoid io on the failed device.
2520 if (unlikely(RSDegraded(rs))) {
2521 /* REMOVEME: statistics. */
2522 atomic_inc(rs->stats + S_DEGRADED);
2523 /* Allow IO on all devices but the dead one. */
2524 BUG_ON(rs->set.ei < 0);
2525 return stripe_chunk_set_io_flags(stripe, rs->set.ei);
2527 int sync, pi = dev_for_parity(stripe, &sync);
* Reconstruction mode (i.e. a particular (replaced) device or
2531 * some (rotating) parity chunk is being resynchronized) ->
2532 * o make sure all needed chunks are read in
2533 * o writes are allowed to go through
2536 /* REMOVEME: statistics. */
2537 atomic_inc(rs->stats + S_NOSYNC);
2538 /* Allow IO on all devs but the one to reconstruct. */
2539 return stripe_chunk_set_io_flags(stripe, pi);
* Check if the stripe is ready to merge writes,
* i.e. if all chunks needed are present to allow merging bios.
2550 * We prohibit io on:
2552 * o chunks without bios
2553 * o chunks which get completely written over
2555 static int stripe_merge_possible(struct stripe *stripe, int nosync)
2557 struct raid_set *rs = RS(stripe->sc);
2558 unsigned chunks_overwrite = 0, chunks_prohibited = 0,
2559 chunks_uptodate = 0, p = rs->set.raid_devs;
2561 /* Walk all chunks. */
2563 struct stripe_chunk *chunk;
2565 /* Prohibit io on broken devices. */
2566 if (DevFailed(rs->dev + p)) {
2567 chunk = CHUNK(stripe, p);
2571 /* We can't optimize any further if no chunk. */
2572 chunk = stripe_chunk_check(stripe, p, &chunks_uptodate);
2573 if (!chunk || nosync)
* We have a chunk which is not uptodate.
2579 * If this is not parity and we don't have
2580 * reads queued, we can optimize further.
2582 if (p != stripe->idx.parity &&
2583 bio_list_empty(BL_CHUNK(chunk, READ)) &&
2584 bio_list_empty(BL_CHUNK(chunk, WRITE_MERGED))) {
2585 if (bio_list_empty(BL_CHUNK(chunk, WRITE_QUEUED)))
2587 else if (RSCheckOverwrite(rs) &&
2588 stripe_check_chunk_overwrite(stripe, p))
2589 /* Completely overwritten chunk. */
2593 /* Allow io for chunks with bios and overwritten ones. */
2598 /* No io for broken devices or for chunks w/o bios. */
2599 ClearChunkIo(chunk);
2600 chunks_prohibited++;
2601 /* REMOVEME: statistics. */
2602 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2605 /* All data chunks will get written over. */
2606 if (chunks_overwrite == rs->set.data_devs)
2607 atomic_inc(rs->stats + S_OVERWRITE); /* REMOVEME: statistics.*/
2608 else if (chunks_uptodate + chunks_prohibited < rs->set.raid_devs) {
2609 /* We don't have enough chunks to merge. */
2610 atomic_inc(rs->stats + S_CANT_MERGE); /* REMOVEME: statistics.*/
2615 * If we have all chunks up to date or overwrite them, we
2616 * just zero the parity chunk and let stripe_rw() recreate it.
2618 if (chunks_uptodate == rs->set.raid_devs ||
2619 chunks_overwrite == rs->set.data_devs) {
2620 stripe_zero_chunk(stripe, stripe->idx.parity);
2621 BUG_ON(StripeReconstruct(stripe));
2622 SetStripeReconstruct(stripe); /* Enforce xor in caller. */
* With fewer chunks, we xor parity out.
2627 * (*4*) We rely on !StripeReconstruct() in chunk_must_xor(),
2628 * so that only chunks with queued or merged writes
2635 * We do have enough chunks to merge.
2636 * All chunks are uptodate or get written over.
2638 atomic_inc(rs->stats + S_CAN_MERGE); /* REMOVEME: statistics. */
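/*
 * For illustration only (not driver code): the parity algebra relied
 * on above and in stripe_rw(). A full-stripe write recomputes
 * P = D0 ^ D1 ^ ... ^ Dn-1 from scratch, whereas a partial update
 * xors a chunk out of valid parity and back in:
 * P' = P ^ D_old ^ D_new. The sketch assumes plain byte buffers;
 * the driver's common_xor()/parity_xor() work on page lists instead.
 */
#if 0
static void example_parity_full(u8 *p, u8 *const d[], int n, size_t len)
{
	size_t i;
	int j;

	memset(p, 0, len);		/* P starts as zero... */
	for (j = 0; j < n; j++)		/* ...and accumulates all chunks. */
		for (i = 0; i < len; i++)
			p[i] ^= d[j][i];
}

static void example_parity_rmw(u8 *p, const u8 *d_old,
			       const u8 *d_new, size_t len)
{
	size_t i;

	/* Fold the old data out of parity and the new data in. */
	for (i = 0; i < len; i++)
		p[i] ^= d_old[i] ^ d_new[i];
}
#endif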
* Avoid reading chunks when we're fully operational.
*
* We prohibit io on any chunks without bios, except the parity chunk.
2647 static void stripe_avoid_reads(struct stripe *stripe)
2649 struct raid_set *rs = RS(stripe->sc);
2650 unsigned dummy = 0, p = rs->set.raid_devs;
2652 /* Walk all chunks. */
2654 struct stripe_chunk *chunk =
2655 stripe_chunk_check(stripe, p, &dummy);
2660 /* If parity or any bios pending -> allow io. */
2661 if (chunk_ref(chunk) || p == stripe->idx.parity)
2664 ClearChunkIo(chunk);
2665 /* REMOVEME: statistics. */
2666 atomic_inc(RS(stripe->sc)->stats + S_PROHIBITCHUNKIO);
2672 * Read/write a stripe.
2674 * All stripe read/write activity goes through this function
2675 * unless recovery, which has to call stripe_chunk_rw() directly.
2677 * Make sure we don't try already merged stripes in order
2678 * to avoid data corruption.
2680 * Check the state of the RAID set and if degraded (or
2681 * resynchronizing for reads), read in all other chunks but
2682 * the one on the dead/resynchronizing device in order to be
2683 * able to reconstruct the missing one in _do_endios().
2685 * Can be called on active stripes in order
2686 * to dispatch new io on inactive chunks.
2689 * o stripe to read and/or write
2690 * o stripe with error to reconstruct
2692 static void stripe_rw(struct stripe *stripe)
2695 struct raid_set *rs = RS(stripe->sc);
2698 * Check, if a chunk needs to be reconstructed
2699 * because of a degraded set or a region out of sync.
2701 nosync = stripe_check_reconstruct(stripe);
2704 return; /* Wait for stripe reconstruction to finish. */
2710 * If we don't have merged writes pending, we can schedule
2711 * queued writes to be merged next without corrupting data.
2713 if (!StripeMerged(stripe)) {
2714 r = stripe_queue_writes(stripe);
2716 /* Writes got queued -> flag RBW. */
2717 SetStripeRBW(stripe);
2721 * Merge all writes hanging off uptodate/overwritten
2722 * chunks of the stripe.
2724 if (StripeRBW(stripe)) {
2725 r = stripe_merge_possible(stripe, nosync);
2726 if (!r) { /* Merge possible. */
2727 struct stripe_chunk *chunk;
2730 * I rely on valid parity in order
2731 * to xor a fraction of chunks out
2732 * of parity and back in.
2734 stripe_merge_writes(stripe); /* Merge writes in. */
2735 parity_xor(stripe); /* Update parity. */
2736 ClearStripeReconstruct(stripe); /* Reset xor enforce. */
2737 SetStripeMerged(stripe); /* Writes merged. */
2738 ClearStripeRBW(stripe); /* Disable RBW. */
2741 * REMOVEME: sanity check on parity chunk
2742 * states after writes got merged.
2744 chunk = CHUNK(stripe, stripe->idx.parity);
2745 BUG_ON(ChunkLocked(chunk));
2746 BUG_ON(!ChunkUptodate(chunk));
2747 BUG_ON(!ChunkDirty(chunk));
2748 BUG_ON(!ChunkIo(chunk));
2750 } else if (!nosync && !StripeMerged(stripe))
2751 /* Read avoidance if not degraded/resynchronizing/merged. */
2752 stripe_avoid_reads(stripe);
2755 /* Now submit any reads/writes for non-uptodate or dirty chunks. */
2756 r = stripe_chunks_rw(stripe);
* No io was submitted because of prohibited chunk io,
* locked chunks or failed devices
* -> push to end io list for processing.
2763 stripe_endio_push(stripe);
2764 atomic_inc(rs->stats + S_NO_RW); /* REMOVEME: statistics. */
2769 * Recovery functions
2771 /* Read a stripe off a raid set for recovery. */
2772 static int stripe_recover_read(struct stripe *stripe, int pi)
2774 BUG_ON(stripe_io_ref(stripe));
2776 /* Invalidate all chunks so that they get read in. */
2777 stripe_chunks_invalidate(stripe);
2778 stripe_allow_io(stripe); /* Allow io on all recovery chunks. */
* If we are reconstructing a particular device, we can avoid
2782 * reading the respective chunk in, because we're going to
2783 * reconstruct it anyway.
2785 * We can't do that for resynchronization of rotating parity,
2786 * because the recovery stripe chunk size is typically larger
* than the set's chunk size.
2790 ClearChunkIo(CHUNK(stripe, pi));
2792 return stripe_chunks_rw(stripe);
2795 /* Write a stripe to a raid set for recovery. */
2796 static int stripe_recover_write(struct stripe *stripe, int pi)
2798 BUG_ON(stripe_io_ref(stripe));
2801 * If this is a reconstruct of a particular device, then
2802 * reconstruct the respective chunk, else create parity chunk.
2805 stripe_zero_chunk(stripe, pi);
2806 common_xor(stripe, stripe->io.size, 0, pi);
2807 chunk_set(CHUNK(stripe, pi), DIRTY);
2811 return stripe_chunks_rw(stripe);
2814 /* Read/write a recovery stripe. */
2815 static int stripe_recover_rw(struct stripe *stripe)
2817 int r = 0, sync = 0;
2819 /* Read/write flip-flop. */
2820 if (TestClearStripeRBW(stripe)) {
2821 SetStripeMerged(stripe);
2822 stripe->key = stripe->recover->pos;
2823 r = stripe_recover_read(stripe, dev_for_parity(stripe, &sync));
2825 } else if (TestClearStripeMerged(stripe)) {
2826 r = stripe_recover_write(stripe, dev_for_parity(stripe, &sync));
/* Recovery bandwidth available? */
2835 static int recover_bandwidth(struct raid_set *rs)
2839 /* On reset or when bios delayed -> allow recovery. */
2840 r = recover_io_reset(rs);
2841 if (r || RSBandwidth(rs))
2844 work = atomic_read(rs->recover.io_count + IO_WORK);
2846 /* Pay attention to larger recover stripe size. */
2847 int recover = atomic_read(rs->recover.io_count + IO_RECOVER) *
2848 rs->recover.io_size / rs->set.io_size;
* Don't use more than the given bandwidth
2852 * of the work io for recovery.
2854 if (recover > work / rs->recover.bandwidth_work) {
2855 /* REMOVEME: statistics. */
2856 atomic_inc(rs->stats + S_NO_BANDWIDTH);
2862 atomic_inc(rs->stats + S_BANDWIDTH); /* REMOVEME: statistics. */
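/*
 * Worked example (numbers assumed): with bandwidth = 10 (%),
 * bandwidth_work = 100 / 10 = 10, so recovery io (scaled to work io
 * units above) gets throttled as soon as it exceeds one tenth of the
 * application work io count.
 */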
2866 /* Try to get a region to recover. */
2867 static int stripe_recover_get_region(struct stripe *stripe)
2869 struct raid_set *rs = RS(stripe->sc);
2870 struct recover *rec = &rs->recover;
2871 struct recover_addr *addr = stripe->recover;
2872 struct dm_dirty_log *dl = rec->dl;
2873 struct dm_rh_client *rh = rec->rh;
/* Return that we have a region, so it gets finished first during suspension. */
2885 if (dl->type->get_sync_count(dl) >= rec->nr_regions)
2888 /* If we don't have enough bandwidth, we don't proceed recovering. */
2889 if (!recover_bandwidth(rs))
2892 /* Start quiescing a region. */
2893 dm_rh_recovery_prepare(rh);
2894 addr->reg = dm_rh_recovery_start(rh);
2898 addr->pos = dm_rh_region_to_sector(rh, dm_rh_get_region_key(addr->reg));
2899 addr->end = addr->pos + dm_rh_get_region_size(rh);
2902 * Take one global io reference out for the
2903 * whole region, which is going to be released
2904 * when the region is completely done with.
2910 /* Update region hash state. */
2911 enum recover_type { REC_FAILURE = 0, REC_SUCCESS = 1 };
2912 static void recover_rh_update(struct stripe *stripe, enum recover_type success)
2914 struct recover_addr *addr = stripe->recover;
2915 struct raid_set *rs = RS(stripe->sc);
2916 struct recover *rec = &rs->recover;
DMERR("%s - called w/o region", __func__);
2923 dm_rh_recovery_end(addr->reg, success);
2925 rec->nr_regions_recovered++;
2930 * Completely done with this region ->
2931 * release the 1st io reference.
2936 /* Set start of recovery state. */
2937 static void set_start_recovery(struct raid_set *rs)
2939 /* Initialize recovery. */
2940 rs->recover.start_jiffies = jiffies;
2941 rs->recover.end_jiffies = 0;
2944 /* Set end of recovery state. */
2945 static void set_end_recovery(struct raid_set *rs)
2948 rs->set.dev_to_init = -1;
2950 /* Check for jiffies overrun. */
2951 rs->recover.end_jiffies = jiffies;
2952 if (rs->recover.end_jiffies < rs->recover.start_jiffies)
2953 rs->recover.end_jiffies = ~0;
2956 /* Handle recovery on one recovery stripe. */
2957 static int _do_recovery(struct stripe *stripe)
2960 struct raid_set *rs = RS(stripe->sc);
2961 struct recover_addr *addr = stripe->recover;
2963 /* If recovery is active -> return. */
2964 if (stripe_io_ref(stripe))
2967 /* IO error is fatal for recovery -> stop it. */
2968 if (unlikely(StripeError(stripe)))
2971 /* Recovery end required. */
2975 /* Get a region to recover. */
2976 r = stripe_recover_get_region(stripe);
2978 case 0: /* Got a new region: flag initial read before write. */
2979 SetStripeRBW(stripe);
2980 case 1: /* Have a region in the works. */
2983 /* No bandwidth/quiesced region yet, try later. */
2985 wake_do_raid_delayed(rs, HZ / 4);
2989 case -ENOENT: /* No more regions to recover. */
2990 schedule_work(&rs->io.ws_do_table_event);
2996 /* Read/write a recover stripe. */
2997 r = stripe_recover_rw(stripe);
3002 /* Read and write finished-> update recovery position within region. */
3003 addr->pos += stripe->io.size;
3005 /* If we're at end of region, update region hash. */
3006 if (addr->pos >= addr->end ||
3007 addr->pos >= rs->set.sectors_per_dev)
3008 recover_rh_update(stripe, REC_SUCCESS);
3010 /* Prepare to read next region segment. */
3011 SetStripeRBW(stripe);
3013 /* Schedule myself for another round... */
3018 /* FIXME: rather try recovering other regions on error? */
3019 rs_check_degrade(stripe);
3020 recover_rh_update(stripe, REC_FAILURE);
3022 /* Check state of partially recovered array. */
3023 if (RSDegraded(rs) && !RSDead(rs) &&
3024 rs->set.dev_to_init != -1 &&
3025 rs->set.ei != rs->set.dev_to_init)
3026 /* Broken drive != drive to recover -> FATAL. */
3029 if (StripeError(stripe)) {
3030 char buf[BDEVNAME_SIZE];
3032 DMERR("stopping recovery due to "
3033 "ERROR on /dev/%s, stripe at offset %llu",
3034 bdevname(rs->dev[rs->set.ei].dev->bdev, buf),
3035 (unsigned long long) stripe->key);
/* Make sure that all quiesced regions get released. */
3041 dm_rh_recovery_end(addr->reg, -EIO);
3042 addr->reg = dm_rh_recovery_start(rs->recover.rh);
3048 /* Called by main io daemon to recover regions. */
3049 static void do_recovery(struct raid_set *rs)
3051 if (RSRecover(rs)) {
3053 struct stripe *stripe;
3055 list_for_each_entry(stripe, &rs->recover.stripes,
3056 lists[LIST_RECOVER])
3057 r += _do_recovery(stripe);
3060 set_end_recovery(rs);
3061 stripe_recover_free(rs);
3067 * END recovery functions
3070 /* End io process all stripes handed in by endio() callback. */
3071 static void _do_endios(struct raid_set *rs, struct stripe *stripe,
3072 struct list_head *flush_list)
3074 /* First unlock all required chunks. */
3075 stripe_chunks_unlock(stripe);
* If an io error on a stripe occurred, degrade the RAID set
3079 * and try to endio as many bios as possible. If any bios can't
3080 * be endio processed, requeue the stripe (stripe_ref() != 0).
3082 if (TestClearStripeError(stripe)) {
3084 * FIXME: if read, rewrite the failed chunk after reconstruction
3085 * in order to trigger disk bad sector relocation.
3087 rs_check_degrade(stripe); /* Resets ChunkError(). */
3088 ClearStripeReconstruct(stripe);
3089 ClearStripeReconstructed(stripe);
3092 /* Got to reconstruct a missing chunk. */
3093 if (StripeReconstruct(stripe)) {
3095 * (*2*) We use StripeReconstruct() to allow for
3096 * all chunks to be xored into the reconstructed
3097 * one (see chunk_must_xor()).
3099 stripe_reconstruct(stripe);
3102 * (*3*) Now we reset StripeReconstruct() and flag
3103 * StripeReconstructed() to show to stripe_rw(),
3104 * that we have reconstructed a missing chunk.
3106 ClearStripeReconstruct(stripe);
3107 SetStripeReconstructed(stripe);
3109 /* FIXME: reschedule to be written in case of read. */
3110 // if (!StripeRBW(stripe)) {
3111 // chunk_set(CHUNK(stripe, pr), DIRTY);
3112 // stripe_chunks_rw(stripe);
3117 * Now that we eventually got a complete stripe, we
3118 * can process the rest of the end ios on reads.
3120 stripe_endio(READ, stripe);
3122 /* End io all merged writes. */
3123 if (TestClearStripeMerged(stripe))
3124 stripe_endio(WRITE_MERGED, stripe);
3126 /* If RAID set is dead -> fail any ios to dead drives. */
3128 DMERR_LIMIT("RAID set dead: failing ios to dead devices");
3129 stripe_fail_io(stripe);
* We still have stripe references,
* because of reads before writes or IO errors ->
3135 * got to put on flush list for processing.
3137 if (stripe_ref(stripe)) {
3138 BUG_ON(!list_empty(stripe->lists + LIST_LRU));
3139 list_add_tail(stripe->lists + LIST_FLUSH, flush_list);
3140 atomic_inc(rs->stats + S_REQUEUE); /* REMOVEME: statistics. */
3142 stripe_lru_add(stripe);
/* Pop any endio stripes off the endio list and belabour them. */
3146 static void do_endios(struct raid_set *rs)
3148 struct stripe_cache *sc = &rs->sc;
3149 struct stripe *stripe;
3150 /* IO flush list for sorted requeued stripes. */
3151 struct list_head flush_list;
3153 INIT_LIST_HEAD(&flush_list);
3155 while ((stripe = stripe_endio_pop(sc))) {
3156 /* Avoid endio on stripes with newly io'ed chunks. */
3157 if (!stripe_io_ref(stripe))
3158 _do_endios(rs, stripe, &flush_list);
3162 * Insert any requeued stripes in the proper
3163 * order at the beginning of the io (flush) list.
3165 list_splice(&flush_list, sc->lists + LIST_FLUSH);
3168 /* Flush any stripes on the io list. */
3169 static void do_flush(struct raid_set *rs)
3171 struct stripe *stripe;
3173 while ((stripe = stripe_io_pop(&rs->sc)))
3174 stripe_rw(stripe); /* Read/write stripe. */
3177 /* Stripe cache resizing. */
3178 static void do_sc_resize(struct raid_set *rs)
3180 unsigned set = atomic_read(&rs->sc.stripes_to_set);
3183 unsigned cur = atomic_read(&rs->sc.stripes);
3184 int r = (set > cur) ? sc_grow(&rs->sc, set - cur, SC_GROW) :
3185 sc_shrink(&rs->sc, cur - set);
/* Flag end of resizing if ok. */
3189 atomic_set(&rs->sc.stripes_to_set, 0);
3196 * We do different things with the io depending
3197 * on the state of the region that it is in:
* o reads: hang off stripe cache or postpone if full
*
* o writes:
*
*   CLEAN/DIRTY/NOSYNC: increment pending and hang io off stripe's stripe set.
3204 * In case stripe cache is full or busy, postpone the io.
3206 * RECOVERING: delay the io until recovery of the region completes.
3209 static void do_ios(struct raid_set *rs, struct bio_list *ios)
3212 unsigned flush = 0, delay = 0;
3214 struct dm_rh_client *rh = rs->recover.rh;
3216 struct bio_list reject;
3218 bio_list_init(&reject);
3222 * o delay writes to recovering regions (let reads go through)
3223 * o queue io to all other regions
3225 while ((bio = bio_list_pop(ios))) {
3227 * In case we get a barrier bio, push it back onto
3228 * the input queue unless all work queues are empty
3229 * and the stripe cache is inactive.
3231 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
3232 /* REMOVEME: statistics. */
3233 atomic_inc(rs->stats + S_BARRIER);
3235 !list_empty(rs->sc.lists + LIST_FLUSH) ||
3236 !bio_list_empty(&reject) ||
3237 sc_active(&rs->sc)) {
3238 bio_list_push(ios, bio);
3243 /* Check for recovering regions. */
3244 sector = _sector(rs, bio);
3245 r = region_state(rs, sector, DM_RH_RECOVERING);
3246 if (unlikely(r && bio_data_dir(bio) == WRITE)) {
/* Delay writes to recovering regions. */
3249 dm_rh_delay_by_region(rh, bio,
3250 dm_rh_sector_to_region(rh,
3252 /* REMOVEME: statistics.*/
3253 atomic_inc(rs->stats + S_DELAYED_BIOS);
3254 atomic_inc(rs->stats + S_SUM_DELAYED_BIOS);
3256 /* Force bandwidth tests in recovery. */
* Process ios to non-recovering regions by queueing
* them to stripes (does dm_rh_inc() for writes).
3263 flush += stripe_queue_bio(rs, bio, &reject);
3268 /* FIXME: better error handling. */
3269 r = dm_rh_flush(rh); /* Writes got queued -> flush dirty log. */
3271 DMERR_LIMIT("dirty log flush");
3274 /* Merge any rejected bios back to the head of the input list. */
3275 bio_list_merge_head(ios, &reject);
/* Unplug: let any queued io roll on the set's devices. */
3279 static void do_unplug(struct raid_set *rs)
3281 struct raid_dev *dev = rs->dev + rs->set.raid_devs;
3283 while (dev-- > rs->dev) {
3284 /* Only call any device unplug function, if io got queued. */
3285 if (TestClearDevIoQueued(dev))
3286 blk_unplug(bdev_get_queue(dev->dev->bdev));
3290 /* Send an event in case we're getting too busy. */
3291 static void do_busy_event(struct raid_set *rs)
3294 if (!TestSetRSScBusy(rs))
3295 schedule_work(&rs->io.ws_do_table_event);
3301 /* Throw an event. */
3302 static void do_table_event(struct work_struct *ws)
3304 struct raid_set *rs = container_of(ws, struct raid_set,
3305 io.ws_do_table_event);
3306 dm_table_event(rs->ti->table);
/*-----------------------------------------------------------------
 * RAID daemon
 *---------------------------------------------------------------*/
3314 * o belabour all end ios
3315 * o update the region hash states
3316 * o optionally shrink the stripe cache
3317 * o optionally do recovery
3318 * o unplug any component raid devices with queued bios
3319 * o grab the input queue
* o work on all requeued or new ios and perform stripe cache flushes
3321 * o unplug any component raid devices with queued bios
* o check if the stripe cache gets too busy and throw an event if so
3324 static void do_raid(struct work_struct *ws)
3326 struct raid_set *rs = container_of(ws, struct raid_set,
3327 io.dws_do_raid.work);
3328 struct bio_list *ios = &rs->io.work, *ios_in = &rs->io.in;
3331 * We always need to end io, so that ios can get errored in
3332 * case the set failed and the region counters get decremented
3333 * before we update region hash states and go any further.
3336 dm_rh_update_states(rs->recover.rh, 1);
3339 * Now that we've end io'd, which may have put stripes on the LRU list
3340 * to allow for shrinking, we resize the stripe cache if requested.
3344 /* Try to recover regions. */
3346 do_unplug(rs); /* Unplug the sets device queues. */
3348 /* Quickly grab all new ios queued and add them to the work list. */
3349 mutex_lock(&rs->io.in_lock);
3350 bio_list_merge(ios, ios_in);
3351 bio_list_init(ios_in);
3352 mutex_unlock(&rs->io.in_lock);
3354 if (!bio_list_empty(ios))
3355 do_ios(rs, ios); /* Got ios to work into the cache. */
3357 do_flush(rs); /* Flush any stripes on io list. */
3358 do_unplug(rs); /* Unplug the sets device queues. */
3359 do_busy_event(rs); /* Check if we got too busy. */
3363 * Callback for region hash to dispatch
3364 * delayed bios queued to recovered regions
3365 * (gets called via dm_rh_update_states()).
3367 static void dispatch_delayed_bios(void *context, struct bio_list *bl)
3369 struct raid_set *rs = context;
3372 /* REMOVEME: statistics; decrement pending delayed bios counter. */
3373 bio_list_for_each(bio, bl)
3374 atomic_dec(rs->stats + S_DELAYED_BIOS);
3376 /* Merge region hash private list to work list. */
3377 bio_list_merge_head(&rs->io.work, bl);
3379 ClearRSBandwidth(rs);
3382 /*************************************************************
3383 * Constructor helpers
3384 *************************************************************/
3385 /* Calculate MB/sec. */
3386 static unsigned mbpers(struct raid_set *rs, unsigned speed)
3388 return to_bytes(speed * rs->set.data_devs *
3389 rs->recover.io_size * HZ >> 10) >> 10;
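/*
 * Worked example (numbers assumed): speed = 5 xors/tick, HZ = 250,
 * data_devs = 3 and recover.io_size = 256 sectors:
 * 5 * 3 * 256 * 250 = 960000; >> 10 -> 937; to_bytes() -> 479744;
 * >> 10 -> roughly 468 MB/s.
 */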
3393 * Discover fastest xor algorithm and # of chunks combination.
3395 /* Calculate speed for algorithm and # of chunks. */
3396 static unsigned xor_speed(struct stripe *stripe)
3401 /* Wait for next tick. */
3402 for (j = jiffies; j == jiffies; )
3405 /* Do xors for a full tick. */
3406 for (j = jiffies; j == jiffies; ) {
3408 common_xor(stripe, stripe->io.size, 0, 0);
3416 /* Optimize xor algorithm for this RAID set. */
3417 static unsigned xor_optimize(struct raid_set *rs)
3419 unsigned chunks_max = 2, p = rs->set.raid_devs, speed_max = 0;
3420 struct xor_func *f = ARRAY_END(xor_funcs), *f_max = NULL;
3421 struct stripe *stripe;
3423 BUG_ON(list_empty(&rs->recover.stripes));
3424 stripe = list_first_entry(&rs->recover.stripes, struct stripe,
3425 lists[LIST_RECOVER]);
3427 /* Must set uptodate so that xor() will belabour chunks. */
3429 SetChunkUptodate(CHUNK(stripe, p));
3431 /* Try all xor functions. */
3432 while (f-- > xor_funcs) {
3435 /* Set actual xor function for common_xor(). */
3437 rs->xor.chunks = (f->f == xor_blocks_wrapper ?
3438 (MAX_XOR_BLOCKS + 1) : XOR_CHUNKS_MAX) + 1;
3440 while (rs->xor.chunks-- > 2) {
3441 speed = xor_speed(stripe);
3442 if (speed > speed_max) {
3444 chunks_max = rs->xor.chunks;
3450 /* Memorize optimum parameters. */
3452 rs->xor.chunks = chunks_max;
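/*
 * For illustration only (not driver code): the tick calibration
 * pattern xor_speed() uses above, as a minimal sketch. Busy-wait for
 * a jiffies edge, then count how many operations complete before the
 * next edge, so every candidate algorithm gets the same fixed time
 * slice. do_one_xor() is a hypothetical stand-in for common_xor().
 */
#if 0
static unsigned example_ops_per_tick(void)
{
	unsigned ops = 0;
	unsigned long j;

	/* Wait for the next tick edge. */
	for (j = jiffies; j == jiffies; )
		;

	/* Count work done within one full tick. */
	for (j = jiffies; j == jiffies; ops++)
		do_one_xor();

	return ops;
}
#endif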
3457 * Allocate a RAID context (a RAID set)
3459 /* Structure for variable RAID parameters. */
3460 struct variable_parms {
3464 int chunk_size_parm;
3469 int recover_io_size;
3470 int recover_io_size_parm;
3473 int recovery_stripes;
3474 int recovery_stripes_parm;
3477 static struct raid_set *
3478 context_alloc(struct raid_type *raid_type, struct variable_parms *p,
3479 unsigned raid_devs, sector_t sectors_per_dev,
3480 struct dm_target *ti, unsigned dl_parms, char **argv)
3484 sector_t region_size, ti_len;
3485 struct raid_set *rs = NULL;
3486 struct dm_dirty_log *dl;
3487 struct recover *rec;
3490 * Create the dirty log
3492 * We need to change length for the dirty log constructor,
* because we want a number of regions for all stripes derived
* from the single device size, so that we can keep region
* size = 2^^n independent of the number of devices
3498 ti->len = sectors_per_dev;
3499 dl = dm_dirty_log_create(argv[0], ti, NULL, dl_parms, argv + 2);
/* Chunk size *must not* be larger than region size. */
3505 region_size = dl->type->get_region_size(dl);
3506 if (p->chunk_size > region_size)
3507 goto bad_chunk_size;
/* Recover io size *must not* be larger than region size either. */
3510 if (p->recover_io_size > region_size)
3511 goto bad_recover_io_size;
3513 /* Size and allocate the RAID set structure. */
3514 len = sizeof(*rs->data) + sizeof(*rs->dev);
3515 if (dm_array_too_big(sizeof(*rs), len, raid_devs))
3518 len = sizeof(*rs) + raid_devs * len;
3519 rs = kzalloc(len, GFP_KERNEL);
3524 atomic_set(&rs->io.in_process, 0);
3525 atomic_set(&rs->io.in_process_max, 0);
3526 rec->io_size = p->recover_io_size;
3528 /* Pointer to data array. */
3529 rs->data = (unsigned long **)
3530 ((void *) rs->dev + raid_devs * sizeof(*rs->dev));
3532 rs->set.raid_devs = raid_devs;
3533 rs->set.data_devs = raid_devs - raid_type->parity_devs;
3534 rs->set.raid_type = raid_type;
3536 rs->set.raid_parms = p->raid_parms;
3537 rs->set.chunk_size_parm = p->chunk_size_parm;
3538 rs->set.io_size_parm = p->io_size_parm;
3539 rs->sc.stripes_parm = p->stripes_parm;
3540 rec->io_size_parm = p->recover_io_size_parm;
3541 rec->bandwidth_parm = p->bandwidth_parm;
3542 rec->recovery = p->recovery;
3543 rec->recovery_stripes = p->recovery_stripes;
3546 * Set chunk and io size and respective shifts
3547 * (used to avoid divisions)
3549 rs->set.chunk_size = p->chunk_size;
3550 rs->set.chunk_shift = ffs(p->chunk_size) - 1;
3552 rs->set.io_size = p->io_size;
3553 rs->set.io_mask = p->io_size - 1;
3554 /* Mask to adjust address key in case io_size != chunk_size. */
3555 rs->set.io_inv_mask = (p->chunk_size - 1) & ~rs->set.io_mask;
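/*
 * Worked example (sizes assumed): chunk_size = 64 and io_size = 16
 * yield io_mask = 0xf and io_inv_mask = 0x30; the latter's bits pick
 * one of the four 16-sector ios within a 64-sector chunk.
 */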
3557 rs->set.sectors_per_dev = sectors_per_dev;
3559 rs->set.ei = -1; /* Indicate no failed device. */
3560 atomic_set(&rs->set.failed_devs, 0);
3564 atomic_set(rec->io_count + IO_WORK, 0);
3565 atomic_set(rec->io_count + IO_RECOVER, 0);
3567 /* Initialize io lock and queues. */
3568 mutex_init(&rs->io.in_lock);
3569 bio_list_init(&rs->io.in);
3570 bio_list_init(&rs->io.work);
3572 init_waitqueue_head(&rs->io.suspendq); /* Suspend waiters (dm-io). */
3574 rec->nr_regions = dm_sector_div_up(sectors_per_dev, region_size);
3575 rec->rh = dm_region_hash_create(rs, dispatch_delayed_bios,
3576 wake_dummy, wake_do_raid, 0, p->recovery_stripes,
3577 dl, region_size, rec->nr_regions);
3578 if (IS_ERR(rec->rh))
3581 /* Initialize stripe cache. */
3582 r = sc_init(rs, p->stripes);
3586 /* REMOVEME: statistics. */
ClearRSDevelStats(rs); /* Disable development status. */
3592 TI_ERR_RET("Error creating dirty log", ERR_PTR(-ENOMEM));
3595 dm_dirty_log_destroy(dl);
3596 TI_ERR_RET("Chunk size larger than region size", ERR_PTR(-EINVAL));
3598 bad_recover_io_size:
3599 dm_dirty_log_destroy(dl);
3600 TI_ERR_RET("Recover stripe io size larger than region size",
3604 dm_dirty_log_destroy(dl);
TI_ERR_RET("Array too big", ERR_PTR(-EINVAL));
3608 dm_dirty_log_destroy(dl);
3609 TI_ERR_RET("Cannot allocate raid context", ERR_PTR(-ENOMEM));
3612 dm_dirty_log_destroy(dl);
3613 ti->error = DM_MSG_PREFIX "Error creating dirty region hash";
3617 dm_region_hash_destroy(rec->rh); /* Destroys dirty log too. */
3619 ti->error = DM_MSG_PREFIX "Error creating stripe cache";
3622 return ERR_PTR(-ENOMEM);
3625 /* Free a RAID context (a RAID set). */
3626 static void context_free(struct raid_set *rs, unsigned p)
3629 dm_put_device(rs->ti, rs->dev[p].dev);
3632 dm_region_hash_destroy(rs->recover.rh); /* Destroys dirty log too. */
3636 /* Create work queue and initialize delayed work. */
3637 static int rs_workqueue_init(struct raid_set *rs)
3639 struct dm_target *ti = rs->ti;
3641 rs->io.wq = create_singlethread_workqueue(DAEMON);
3643 TI_ERR_RET("failed to create " DAEMON, -ENOMEM);
3645 INIT_DELAYED_WORK(&rs->io.dws_do_raid, do_raid);
3646 INIT_WORK(&rs->io.ws_do_table_event, do_table_event);
3650 /* Return pointer to raid_type structure for raid name. */
3651 static struct raid_type *get_raid_type(char *name)
3653 struct raid_type *r = ARRAY_END(raid_types);
3655 while (r-- > raid_types) {
3656 if (!strcmp(r->name, name))
3663 /* FIXME: factor out to dm core. */
3664 static int multiple(sector_t a, sector_t b, sector_t *n)
3673 /* Log RAID set information to kernel log. */
3674 static void rs_log(struct raid_set *rs, unsigned speed)
3677 char buf[BDEVNAME_SIZE];
3679 for (p = 0; p < rs->set.raid_devs; p++)
3680 DMINFO("/dev/%s is raid disk %u%s",
3681 bdevname(rs->dev[p].dev->bdev, buf), p,
3682 (p == rs->set.pi) ? " (parity)" : "");
3684 DMINFO("%d/%d/%d sectors chunk/io/recovery size, %u stripes\n"
3685 "algorithm \"%s\", %u chunks with %uMB/s\n"
3686 "%s set with net %u/%u devices",
3687 rs->set.chunk_size, rs->set.io_size, rs->recover.io_size,
3688 atomic_read(&rs->sc.stripes),
3689 rs->xor.f->name, rs->xor.chunks, mbpers(rs, speed),
3690 rs->set.raid_type->descr, rs->set.data_devs, rs->set.raid_devs);
3693 /* Get all devices and offsets. */
3694 static int dev_parms(struct raid_set *rs, char **argv, int *p)
3696 struct dm_target *ti = rs->ti;
3698 for (*p = 0; *p < rs->set.raid_devs; (*p)++, argv += 2) {
3700 unsigned long long tmp;
3701 struct raid_dev *dev = rs->dev + *p;
3703 /* Get offset and device. */
3704 if (sscanf(argv[1], "%llu", &tmp) != 1 ||
3705 tmp > rs->set.sectors_per_dev)
3706 TI_ERR("Invalid RAID device offset parameter");
3709 r = dm_get_device(ti, *argv, dm_table_get_mode(ti->table), &dev->dev);
3711 TI_ERR_RET("RAID device lookup failure", r);
3713 r = raid_dev_lookup(rs, dev);
3714 if (r != -ENODEV && r < *p) {
3715 (*p)++; /* Ensure dm_put_device() on actual device. */
3716 TI_ERR_RET("Duplicate RAID device", -ENXIO);
3723 /* Set recovery bandwidth. */
3725 recover_set_bandwidth(struct raid_set *rs, unsigned bandwidth)
3727 rs->recover.bandwidth = bandwidth;
3728 rs->recover.bandwidth_work = 100 / bandwidth;
3731 /* Handle variable number of RAID parameters. */
3732 static int get_raid_variable_parms(struct dm_target *ti, char **argv,
3733 struct variable_parms *vp)
int action; /* -1: skip, 0: no power2 check, 1: power2 check */
3740 int *var, *var2, *var3;
3743 "Invalid chunk size; must be -1 or 2^^n and <= 16384",
3744 IO_SIZE_MIN, CHUNK_SIZE_MAX,
3745 &vp->chunk_size_parm, &vp->chunk_size, &vp->io_size },
3747 "Invalid number of stripes: must be -1 or >= 8 and <= 16384",
3748 STRIPES_MIN, STRIPES_MAX,
3749 &vp->stripes_parm, &vp->stripes, NULL },
3751 "Invalid io size; must -1 or >= 8, 2^^n and less equal "
3752 "min(BIO_MAX_SECTORS/2, chunk size)",
3753 IO_SIZE_MIN, 0, /* Needs to be updated in loop below. */
3754 &vp->io_size_parm, &vp->io_size, NULL },
3756 "Invalid recovery io size; must be -1 or "
3757 "2^^n and less equal BIO_MAX_SECTORS/2",
3758 RECOVER_IO_SIZE_MIN, BIO_MAX_SECTORS / 2,
3759 &vp->recover_io_size_parm, &vp->recover_io_size, NULL },
3761 "Invalid recovery bandwidth percentage; "
3762 "must be -1 or > 0 and <= 100",
3763 BANDWIDTH_MIN, BANDWIDTH_MAX,
3764 &vp->bandwidth_parm, &vp->bandwidth, NULL },
/* Handle sync argument separately in loop. */
3767 "Invalid recovery switch; must be \"sync\" or \"nosync\"" },
3769 "Invalid number of recovery stripes;"
3770 "must be -1, > 0 and <= 16384",
3771 RECOVERY_STRIPES_MIN, RECOVERY_STRIPES_MAX,
3772 &vp->recovery_stripes_parm, &vp->recovery_stripes, NULL },
3775 /* Fetch # of variable raid parameters. */
3776 if (sscanf(*(argv++), "%d", &vp->raid_parms) != 1 ||
3777 !range_ok(vp->raid_parms, 0, 7))
3778 TI_ERR("Bad variable raid parameters number");
3780 /* Preset variable RAID parameters. */
3781 vp->chunk_size = CHUNK_SIZE_DEFAULT;
3782 vp->io_size = IO_SIZE_DEFAULT;
3783 vp->stripes = STRIPES_DEFAULT;
3784 vp->recover_io_size = RECOVER_IO_SIZE_DEFAULT;
3785 vp->bandwidth = BANDWIDTH_DEFAULT;
3787 vp->recovery_stripes = RECOVERY_STRIPES_DEFAULT;
3789 /* Walk the array of argument constraints for all given ones. */
3790 for (p = 0, varp = argctr; p < vp->raid_parms; p++, varp++) {
3791 BUG_ON(varp >= ARRAY_END(argctr));
3793 /* Special case for "[no]sync" string argument. */
3794 if (varp->action < 0) {
3795 if (!strcmp(*argv, "sync"))
3797 else if (!strcmp(*argv, "nosync"))
3800 TI_ERR(varp->errmsg);
3807 * Special case for io_size depending
3808 * on previously set chunk size.
3811 varp->max = min(BIO_MAX_SECTORS / 2, vp->chunk_size);
3813 if (sscanf(*(argv++), "%d", &value) != 1 ||
3815 ((varp->action && !POWER_OF_2(value)) ||
3816 !range_ok(value, varp->min, varp->max))))
3817 TI_ERR(varp->errmsg);
3822 *varp->var2 = value;
3824 *varp->var3 = value;
3831 /* Parse optional locking parameters. */
3832 static int get_raid_locking_parms(struct dm_target *ti, char **argv,
3834 struct dm_raid45_locking_type **locking_type)
3836 if (!strnicmp(argv[0], "locking", strlen(argv[0]))) {
3837 char *lckstr = argv[1];
3838 size_t lcksz = strlen(lckstr);
3840 if (!strnicmp(lckstr, "none", lcksz)) {
3841 *locking_type = &locking_none;
3843 } else if (!strnicmp(lckstr, "cluster", lcksz)) {
3844 DMERR("locking type \"%s\" not yet implemented",
3848 DMERR("unknown locking type \"%s\"", lckstr);
3854 *locking_type = &locking_none;
3858 /* Set backing device read ahead properties of RAID set. */
3859 static void rs_set_read_ahead(struct raid_set *rs,
3860 unsigned sectors, unsigned stripes)
3862 unsigned ra_pages = dm_div_up(sectors, SECTORS_PER_PAGE);
3863 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3864 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3866 /* Set read-ahead for the RAID set and the component devices. */
3868 unsigned p = rs->set.raid_devs;
3870 bdi->ra_pages = stripes * ra_pages * rs->set.data_devs;
3873 struct request_queue *q =
3874 bdev_get_queue(rs->dev[p].dev->bdev);
3876 q->backing_dev_info.ra_pages = ra_pages;
3883 /* Set congested function. */
3884 static void rs_set_congested_fn(struct raid_set *rs)
3886 struct mapped_device *md = dm_table_get_md(rs->ti->table);
3887 struct backing_dev_info *bdi = &dm_disk(md)->queue->backing_dev_info;
3889 /* Set congested function and data. */
3890 bdi->congested_fn = rs_congested;
3891 bdi->congested_data = rs;
3896 * Construct a RAID4/5 mapping:
3898 * log_type #log_params <log_params> \
3899 * raid_type [#parity_dev] #raid_variable_params <raid_params> \
3900 * [locking "none"/"cluster"]
3901 * #raid_devs #dev_to_initialize [<dev_path> <offset>]{3,}
3903 * log_type = "core"/"disk",
3904 * #log_params = 1-3 (1-2 for core dirty log type, 3 for disk dirty log only)
3905 * log_params = [dirty_log_path] region_size [[no]sync])
3907 * raid_type = "raid4", "raid5_la", "raid5_ra", "raid5_ls", "raid5_rs"
3909 * #parity_dev = N if raid_type = "raid4"
3910 * o N = -1: pick default = last device
3911 * o N >= 0 and < #raid_devs: parity device index
3913 * #raid_variable_params = 0-7; raid_params (-1 = default):
3914 * [chunk_size [#stripes [io_size [recover_io_size \
3915 * [%recovery_bandwidth [recovery_switch [#recovery_stripes]]]]]]]
* o chunk_size (unit to calculate drive addresses; must be 2^^n, >= 8
3917 * and <= CHUNK_SIZE_MAX)
3918 * o #stripes is number of stripes allocated to stripe cache
*   (must be >= 8 and <= STRIPES_MAX)
* o io_size (io unit size per device in sectors; must be 2^^n and >= 8)
3921 * o recover_io_size (io unit size per device for recovery in sectors;
*   must be 2^^n, > SECTORS_PER_PAGE and <= region_size)
* o %recovery_bandwidth is the maximum amount spent on recovery during
3924 * application io (1-100%)
3925 * o recovery switch = [sync|nosync]
3926 * o #recovery_stripes is the number of recovery stripes used for
3927 * parallel recovery of the RAID set
3928 * If raid_variable_params = 0, defaults will be used.
3929 * Any raid_variable_param can be set to -1 to apply a default
3931 * #raid_devs = N (N >= 3)
3933 * #dev_to_initialize = N
3934 * -1: initialize parity on all devices
3935 * >= 0 and < #raid_devs: initialize raid_path; used to force reconstruction
* of a failed device's content after replacement
3938 * <dev_path> = device_path (eg, /dev/sdd1)
3939 * <offset> = begin at offset on <dev_path>
3942 #define MIN_PARMS 13
3943 static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
3945 int dev_to_init, dl_parms, i, locking_parms,
3946 parity_parm, pi = -1, r, raid_devs;
3948 sector_t tmp, sectors_per_dev;
3949 struct dm_raid45_locking_type *locking;
3950 struct raid_set *rs;
3951 struct raid_type *raid_type;
3952 struct variable_parms parms;
3954 /* Ensure minimum number of parameters. */
3955 if (argc < MIN_PARMS)
3956 TI_ERR("Not enough parameters");
3958 /* Fetch # of dirty log parameters. */
3959 if (sscanf(argv[1], "%d", &dl_parms) != 1 ||
3960 !range_ok(dl_parms, 1, 4711)) /* ;-) */
3961 TI_ERR("Bad dirty log parameters number");
3963 /* Check raid_type. */
3964 raid_type = get_raid_type(argv[dl_parms + 2]);
3966 TI_ERR("Bad raid type");
3968 /* In case of RAID4, parity drive is selectable. */
3969 parity_parm = !!(raid_type->level == raid4);
3971 /* Handle variable number of RAID parameters. */
3972 r = get_raid_variable_parms(ti, argv + dl_parms + parity_parm + 3,
3977 /* Handle any locking parameters. */
3978 r = get_raid_locking_parms(ti,
3979 argv + dl_parms + parity_parm +
3980 parms.raid_parms + 4,
3981 &locking_parms, &locking);
3985 /* # of raid devices. */
3986 i = dl_parms + parity_parm + parms.raid_parms + locking_parms + 4;
3987 if (sscanf(argv[i], "%d", &raid_devs) != 1 ||
3988 raid_devs < raid_type->minimal_devs)
3989 TI_ERR("Invalid number of raid devices");
3991 /* In case of RAID4, check parity drive index is in limits. */
3992 if (raid_type->level == raid4) {
3993 /* Fetch index of parity device. */
3994 if (sscanf(argv[dl_parms + 3], "%d", &pi) != 1 ||
3995 (pi != -1 && !range_ok(pi, 0, raid_devs - 1)))
3996 TI_ERR("Invalid RAID4 parity device index");
4000 * Index of device to initialize starts at 0
4002 * o -1 -> don't initialize a selected device;
4003 * initialize parity conforming to algorithm
4004 * o 0..raid_devs-1 -> initialize respective device
4005 * (used for reconstruction of a replaced device)
4007 if (sscanf(argv[dl_parms + parity_parm + parms.raid_parms +
4008 locking_parms + 5], "%d", &dev_to_init) != 1 ||
4009 !range_ok(dev_to_init, -1, raid_devs - 1))
4010 TI_ERR("Invalid number for raid device to initialize");
4012 /* Check # of raid device arguments. */
4013 if (argc - dl_parms - parity_parm - parms.raid_parms - 6 !=
4015 TI_ERR("Wrong number of raid device/offset arguments");
* Check that the table length is divisible
* w/o remainder by (raid_devs - parity_devs)
4021 if (!multiple(ti->len, raid_devs - raid_type->parity_devs,
4023 TI_ERR("Target length not divisible by number of data devices");
* Check that the device size is
* divisible w/o remainder by chunk size
4029 if (!multiple(sectors_per_dev, parms.chunk_size, &tmp))
4030 TI_ERR("Device length not divisible by chunk_size");
4032 /****************************************************************
4033 * Now that we checked the constructor arguments ->
4034 * let's allocate the RAID set
4035 ****************************************************************/
4036 rs = context_alloc(raid_type, &parms, raid_devs, sectors_per_dev,
4037 ti, dl_parms, argv);
4042 rs->set.dev_to_init = rs->set.dev_to_init_parm = dev_to_init;
4043 rs->set.pi = rs->set.pi_parm = pi;
4045 /* Set RAID4 parity drive index. */
4046 if (raid_type->level == raid4)
4047 rs->set.pi = (pi == -1) ? rs->set.data_devs : pi;
4049 recover_set_bandwidth(rs, parms.bandwidth);
4051 /* Use locking type to lock stripe access. */
4052 rs->locking = locking;
/* Get the device/offset tuples. */
4055 argv += dl_parms + 6 + parity_parm + parms.raid_parms;
4056 r = dev_parms(rs, argv, &i);
4060 /* Set backing device information (eg. read ahead). */
4061 rs_set_read_ahead(rs, 2 * rs->set.chunk_size, 4 /* stripes */);
4062 rs_set_congested_fn(rs); /* Set congested function. */
4063 SetRSCheckOverwrite(rs); /* Allow chunk overwrite checks. */
4064 speed = xor_optimize(rs); /* Select best xor algorithm. */
4066 /* Set for recovery of any nosync regions. */
4071 * Need to free recovery stripe(s) here in case
4072 * of nosync, because xor_optimize uses one.
4074 set_start_recovery(rs);
4075 set_end_recovery(rs);
4076 stripe_recover_free(rs);
4080 * Make sure that dm core only hands maximum io size
4081 * length down and pays attention to io boundaries.
4083 ti->split_io = rs->set.io_size;
4086 /* Initialize work queue to handle this RAID set's io. */
4087 r = rs_workqueue_init(rs);
4091 rs_log(rs, speed); /* Log information about RAID set. */
4095 context_free(rs, i);
4100 * Destruct a raid mapping
4102 static void raid_dtr(struct dm_target *ti)
4104 struct raid_set *rs = ti->private;
4106 destroy_workqueue(rs->io.wq);
4107 context_free(rs, rs->set.raid_devs);
4110 /* Raid mapping function. */
4111 static int raid_map(struct dm_target *ti, struct bio *bio,
4112 union map_info *map_context)
4114 /* I don't want to waste stripe cache capacity. */
4115 if (bio_rw(bio) == READA)
4118 struct raid_set *rs = ti->private;
* Get an io reference to be waited on to drop
* to zero on device suspension/destruction.
4125 bio->bi_sector -= ti->begin; /* Remap sector. */
4127 /* Queue io to RAID set. */
4128 mutex_lock(&rs->io.in_lock);
4129 bio_list_add(&rs->io.in, bio);
4130 mutex_unlock(&rs->io.in_lock);
4132 /* Wake daemon to process input list. */
4135 /* REMOVEME: statistics. */
4136 atomic_inc(rs->stats + (bio_data_dir(bio) == READ ?
4137 S_BIOS_READ : S_BIOS_WRITE));
4138 return DM_MAPIO_SUBMITTED; /* Handle later. */
4142 /* Device suspend. */
4143 static void raid_presuspend(struct dm_target *ti)
4145 struct raid_set *rs = ti->private;
4146 struct dm_dirty_log *dl = rs->recover.dl;
4151 dm_rh_stop_recovery(rs->recover.rh);
4153 cancel_delayed_work(&rs->io.dws_do_raid);
4154 flush_workqueue(rs->io.wq);
4155 wait_ios(rs); /* Wait for completion of all ios being processed. */
4157 if (dl->type->presuspend && dl->type->presuspend(dl))
4158 /* FIXME: need better error handling. */
4159 DMWARN("log presuspend failed");
4162 static void raid_postsuspend(struct dm_target *ti)
4164 struct raid_set *rs = ti->private;
4165 struct dm_dirty_log *dl = rs->recover.dl;
4167 if (dl->type->postsuspend && dl->type->postsuspend(dl))
4168 /* FIXME: need better error handling. */
4169 DMWARN("log postsuspend failed");
4173 /* Device resume. */
4174 static void raid_resume(struct dm_target *ti)
4176 struct raid_set *rs = ti->private;
4177 struct recover *rec = &rs->recover;
4178 struct dm_dirty_log *dl = rec->dl;
4180 if (dl->type->resume && dl->type->resume(dl))
4181 /* Resume dirty log. */
4182 /* FIXME: need better error handling. */
4183 DMWARN("log resume failed");
4185 rec->nr_regions_to_recover =
4186 rec->nr_regions - dl->type->get_sync_count(dl);
4188 /* Restart any unfinished recovery. */
4189 if (RSRecover(rs)) {
4190 set_start_recovery(rs);
4191 dm_rh_start_recovery(rec->rh);
4198 /* Return stripe cache size. */
4199 static unsigned sc_size(struct raid_set *rs)
4201 return to_sector(atomic_read(&rs->sc.stripes) *
4202 (sizeof(struct stripe) +
4203 (sizeof(struct stripe_chunk) +
4204 (sizeof(struct page_list) +
4205 to_bytes(rs->set.io_size) *
4206 rs->set.raid_devs)) +
4207 (rs->recover.end_jiffies ?
4208 0 : rs->recover.recovery_stripes *
4209 to_bytes(rs->set.raid_devs * rs->recover.io_size))));
4212 /* REMOVEME: status output for development. */
4213 static void raid_devel_stats(struct dm_target *ti, char *result,
4214 unsigned *size, unsigned maxlen)
4216 unsigned sz = *size;
4218 char buf[BDEVNAME_SIZE], *p;
4219 struct stats_map *sm;
4220 struct raid_set *rs = ti->private;
4221 struct recover *rec = &rs->recover;
4224 DMEMIT("%s %s %u\n", version, rs->xor.f->name, rs->xor.chunks);
4225 DMEMIT("act_ios=%d ", io_ref(rs));
4226 DMEMIT("act_ios_max=%d\n", atomic_read(&rs->io.in_process_max));
4227 DMEMIT("act_stripes=%d ", sc_active(&rs->sc));
4228 DMEMIT("act_stripes_max=%d\n",
4229 atomic_read(&rs->sc.active_stripes_max));
4231 for (sm = stats_map; sm < ARRAY_END(stats_map); sm++)
4232 DMEMIT("%s%d", sm->str, atomic_read(rs->stats + sm->type));
4234 DMEMIT(" checkovr=%s\n", RSCheckOverwrite(rs) ? "on" : "off");
4235 DMEMIT("sc=%u/%u/%u/%u/%u/%u/%u\n", rs->set.chunk_size,
4236 atomic_read(&rs->sc.stripes), rs->set.io_size,
4237 rec->recovery_stripes, rec->io_size, rs->sc.hash.buckets,
4240 j = (rec->end_jiffies ? rec->end_jiffies : jiffies) -
4242 jiffies_to_timespec(j, &ts);
4243 sprintf(buf, "%ld.%ld", ts.tv_sec, ts.tv_nsec);
4244 p = strchr(buf, '.');
4247 DMEMIT("rg=%llu/%llu/%llu/%u %s\n",
4248 (unsigned long long) rec->nr_regions_recovered,
4249 (unsigned long long) rec->nr_regions_to_recover,
4250 (unsigned long long) rec->nr_regions, rec->bandwidth, buf);
4255 static int raid_status(struct dm_target *ti, status_type_t type,
4256 char *result, unsigned maxlen)
4259 char buf[BDEVNAME_SIZE];
4260 struct raid_set *rs = ti->private;
4261 int raid_parms[] = {
4262 rs->set.chunk_size_parm,
4263 rs->sc.stripes_parm,
4264 rs->set.io_size_parm,
4265 rs->recover.io_size_parm,
4266 rs->recover.bandwidth_parm,
4268 rs->recover.recovery_stripes,
4272 case STATUSTYPE_INFO:
4273 /* REMOVEME: statistics. */
4274 if (RSDevelStats(rs))
4275 raid_devel_stats(ti, result, &sz, maxlen);
4277 DMEMIT("%u ", rs->set.raid_devs);
4279 for (p = 0; p < rs->set.raid_devs; p++)
4281 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev));
4284 for (p = 0; p < rs->set.raid_devs; p++) {
4285 DMEMIT("%c", !DevFailed(rs->dev + p) ? 'A' : 'D');
4287 if (p == rs->set.pi)
4290 if (rs->set.dev_to_init == p)
4295 case STATUSTYPE_TABLE:
4296 sz = rs->recover.dl->type->status(rs->recover.dl, type,
4298 DMEMIT("%s %u ", rs->set.raid_type->name,
4299 rs->set.raid_parms);
4301 for (p = 0; p < rs->set.raid_parms; p++) {
4302 if (raid_parms[p] > -2)
4303 DMEMIT("%d ", raid_parms[p]);
4305 DMEMIT("%s ", rs->recover.recovery ?
4309 DMEMIT("%u %d ", rs->set.raid_devs, rs->set.dev_to_init);
4311 for (p = 0; p < rs->set.raid_devs; p++)
4313 format_dev_t(buf, rs->dev[p].dev->bdev->bd_dev),
4314 (unsigned long long) rs->dev[p].start);
4323 enum raid_msg_actions {
4324 act_bw, /* Recovery bandwidth switch. */
4325 act_dev, /* Device failure switch. */
4326 act_overwrite, /* Stripe overwrite check. */
4327 act_stats, /* Development statistics switch. */
4328 act_sc, /* Stripe cache switch. */
4330 act_on, /* Set entity on. */
4331 act_off, /* Set entity off. */
4332 act_reset, /* Reset entity. */
4334 act_set = act_on, /* Set # absolute. */
4335 act_grow = act_off, /* Grow # by an amount. */
4336 act_shrink = act_reset, /* Shrink # by an amount. */
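/*
 * Example (values assumed): with a current value of 10, "set 50"
 * yields 50, "grow 5" yields 15 and "shrink 5" yields 5; _absolute()
 * below performs exactly this delta-to-absolute mapping.
 */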
4339 /* Turn a delta into an absolute value. */
4340 static int _absolute(unsigned long action, int act, int r)
4342 /* Make delta absolute. */
4343 if (test_bit(act_set, &action))
4345 else if (test_bit(act_grow, &action))
4347 else if (test_bit(act_shrink, &action))
4355 /* Change recovery io bandwidth. */
4356 static int bandwidth_change(struct dm_msg *msg, void *context)
4358 struct raid_set *rs = context;
4359 int act = rs->recover.bandwidth;
4360 int bandwidth = DM_MSG_INT_ARG(msg);
4362 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4363 /* Make delta bandwidth absolute. */
4364 bandwidth = _absolute(msg->action, act, bandwidth);
4367 if (range_ok(bandwidth, BANDWIDTH_MIN, BANDWIDTH_MAX)) {
4368 recover_set_bandwidth(rs, bandwidth);
4373 set_bit(dm_msg_ret_arg, &msg->ret);
4374 set_bit(dm_msg_ret_inval, &msg->ret);
4378 /* Set/reset development feature flags. */
4379 static int devel_flags(struct dm_msg *msg, void *context)
4381 struct raid_set *rs = context;
4383 if (test_bit(act_on, &msg->action))
4384 return test_and_set_bit(msg->spec->parm,
4385 &rs->io.flags) ? -EPERM : 0;
4386 else if (test_bit(act_off, &msg->action))
4387 return test_and_clear_bit(msg->spec->parm,
4388 &rs->io.flags) ? 0 : -EPERM;
4389 else if (test_bit(act_reset, &msg->action)) {
4390 if (test_bit(act_stats, &msg->action)) {
4393 } else if (test_bit(act_overwrite, &msg->action)) {
4395 set_bit(msg->spec->parm, &rs->io.flags);
4403 /* Resize the stripe cache. */
4404 static int sc_resize(struct dm_msg *msg, void *context)
4407 struct raid_set *rs = context;
/* Deny permission in case the daemon is still resizing! */
4410 if (atomic_read(&rs->sc.stripes_to_set))
4413 stripes = DM_MSG_INT_ARG(msg);
4415 act = atomic_read(&rs->sc.stripes);
4417 /* Make delta stripes absolute. */
4418 stripes = _absolute(msg->action, act, stripes);
4421 * Check range and that the # of stripes changes.
* We leave the resizing to the worker.
4424 if (range_ok(stripes, STRIPES_MIN, STRIPES_MAX) &&
4425 stripes != atomic_read(&rs->sc.stripes)) {
4426 atomic_set(&rs->sc.stripes_to_set, stripes);
4432 set_bit(dm_msg_ret_arg, &msg->ret);
4433 set_bit(dm_msg_ret_inval, &msg->ret);
4437 /* Parse the RAID message action. */
* 'ba[ndwidth] {se[t],g[row],sh[rink]} #' # e.g. 'ba se 50'
4440 * "o[verwrite] {on,of[f],r[eset]}' # e.g. 'o of'
4441 * 'sta[tistics] {on,of[f],r[eset]}' # e.g. 'stat of'
4442 * 'str[ipecache] {se[t],g[row],sh[rink]} #' # e.g. 'stripe set 1024'
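*
* Example (illustrative): grow the stripe cache of mapped device "r5"
* by 512 stripes at run time:
*
*   dmsetup message r5 0 stripecache grow 512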
4445 static int raid_message(struct dm_target *ti, unsigned argc, char **argv)
/* Variables to store the parsed parameters in. */
4449 static unsigned long *i_arg[] = {
4450 (unsigned long *) i + 0,
4451 (unsigned long *) i + 1,
4454 /* Declare all message option strings. */
4455 static char *str_sgs[] = { "set", "grow", "shrink" };
4456 static char *str_oor[] = { "on", "off", "reset" };
4458 /* Declare all actions. */
4459 static unsigned long act_sgs[] = { act_set, act_grow, act_shrink };
4460 static unsigned long act_oor[] = { act_on, act_off, act_reset };
4462 /* Bandwidth option. */
4463 static struct dm_message_option bw_opt = { 3, str_sgs, act_sgs };
4464 static struct dm_message_argument bw_args = {
4465 1, i_arg, { dm_msg_int_t }
4468 static struct dm_message_argument null_args = {
4469 0, NULL, { dm_msg_int_t }
4472 /* Overwrite and statistics option. */
4473 static struct dm_message_option ovr_stats_opt = { 3, str_oor, act_oor };
/* Stripecache option. */
4476 static struct dm_message_option stripe_opt = { 3, str_sgs, act_sgs };
4478 /* Declare messages. */
4479 static struct dm_msg_spec specs[] = {
4480 { "bandwidth", act_bw, &bw_opt, &bw_args,
4481 0, bandwidth_change },
4482 { "overwrite", act_overwrite, &ovr_stats_opt, &null_args,
4483 RS_CHECK_OVERWRITE, devel_flags },
4484 { "statistics", act_stats, &ovr_stats_opt, &null_args,
4485 RS_DEVEL_STATS, devel_flags },
4486 { "stripecache", act_sc, &stripe_opt, &bw_args,
4490 /* The message for the parser. */
4491 struct dm_msg msg = {
4492 .num_specs = ARRAY_SIZE(specs),
4496 return dm_message_parse(TARGET, &msg, ti->private, argc, argv);
4499 * END message interface
4502 static struct target_type raid_target = {
4504 .version = {1, 0, 0},
4505 .module = THIS_MODULE,
4509 .presuspend = raid_presuspend,
4510 .postsuspend = raid_postsuspend,
4511 .resume = raid_resume,
4512 .status = raid_status,
4513 .message = raid_message,
4516 static void init_exit(const char *bad_msg, const char *good_msg, int r)
4519 DMERR("Failed to %sregister target [%d]", bad_msg, r);
4521 DMINFO("%s %s", good_msg, version);
4524 static int __init dm_raid_init(void)
4526 int r = dm_register_target(&raid_target);
4528 init_exit("", "initialized", r);
4532 static void __exit dm_raid_exit(void)
4534 dm_unregister_target(&raid_target);
4535 init_exit("un", "exit", 0);
4539 module_init(dm_raid_init);
4540 module_exit(dm_raid_exit);
4542 MODULE_DESCRIPTION(DM_NAME " raid4/5 target");
4543 MODULE_AUTHOR("Heinz Mauelshagen <hjm@redhat.com>");
4544 MODULE_LICENSE("GPL");
4545 MODULE_ALIAS("dm-raid4");
4546 MODULE_ALIAS("dm-raid5");