1 /*
2    md.c : Multiple Devices driver for Linux
3           Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5      completely rewritten, based on the MD driver code from Marc Zyngier
6
7    Changes:
8
9    - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10    - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11    - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12    - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13    - kmod support by: Cyrus Durgin
14    - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15    - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17    - lots of fixes and improvements to the RAID1/RAID5 and generic
18      RAID code (such as request based resynchronization):
19
20      Neil Brown <neilb@cse.unsw.edu.au>.
21
22    - persistent bitmap code
23      Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25    This program is free software; you can redistribute it and/or modify
26    it under the terms of the GNU General Public License as published by
27    the Free Software Foundation; either version 2, or (at your option)
28    any later version.
29
30    You should have received a copy of the GNU General Public License
31    (for example /usr/src/linux/COPYING); if not, write to the Free
32    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */
34
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
39 #include <linux/mutex.h>
40 #include <linux/buffer_head.h> /* for invalidate_bdev */
41 #include <linux/poll.h>
42 #include <linux/ctype.h>
43 #include <linux/string.h>
44 #include <linux/hdreg.h>
45 #include <linux/proc_fs.h>
46 #include <linux/random.h>
47 #include <linux/reboot.h>
48 #include <linux/file.h>
49 #include <linux/compat.h>
50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h>
54 #include "md.h"
55 #include "bitmap.h"
56
57 #define DEBUG 0
58 #define dprintk(x...) ((void)(DEBUG && printk(x)))
59
60 #ifndef MODULE
61 static void autostart_arrays(int part);
62 #endif
63
64 static LIST_HEAD(pers_list);
65 static DEFINE_SPINLOCK(pers_lock);
66
67 static void md_print_devices(void);
68
69 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
70 static struct workqueue_struct *md_wq;
71 static struct workqueue_struct *md_misc_wq;
72
73 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
74
75 /*
76  * Default number of read corrections we'll attempt on an rdev
77  * before ejecting it from the array. We divide the read error
78  * count by 2 for every hour elapsed between read errors.
79  */
80 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
81 /*
82  * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
83  * is 1000 KB/sec, so the extra system load does not show up that much.
84  * Increase it if you want to have more _guaranteed_ speed. Note that
85  * the RAID driver will use the maximum available bandwidth if the IO
86  * subsystem is idle. There is also an 'absolute maximum' reconstruction
87  * speed limit - in case reconstruction slows down your system despite
88  * idle IO detection.
89  *
90  * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
91  * or /sys/block/mdX/md/sync_speed_{min,max}
92  */
93
94 static int sysctl_speed_limit_min = 1000;
95 static int sysctl_speed_limit_max = 200000;
96 static inline int speed_min(mddev_t *mddev)
97 {
98         return mddev->sync_speed_min ?
99                 mddev->sync_speed_min : sysctl_speed_limit_min;
100 }
101
102 static inline int speed_max(mddev_t *mddev)
103 {
104         return mddev->sync_speed_max ?
105                 mddev->sync_speed_max : sysctl_speed_limit_max;
106 }
107
108 static struct ctl_table_header *raid_table_header;
109
110 static ctl_table raid_table[] = {
111         {
112                 .procname       = "speed_limit_min",
113                 .data           = &sysctl_speed_limit_min,
114                 .maxlen         = sizeof(int),
115                 .mode           = S_IRUGO|S_IWUSR,
116                 .proc_handler   = proc_dointvec,
117         },
118         {
119                 .procname       = "speed_limit_max",
120                 .data           = &sysctl_speed_limit_max,
121                 .maxlen         = sizeof(int),
122                 .mode           = S_IRUGO|S_IWUSR,
123                 .proc_handler   = proc_dointvec,
124         },
125         { }
126 };
127
128 static ctl_table raid_dir_table[] = {
129         {
130                 .procname       = "raid",
131                 .maxlen         = 0,
132                 .mode           = S_IRUGO|S_IXUGO,
133                 .child          = raid_table,
134         },
135         { }
136 };
137
138 static ctl_table raid_root_table[] = {
139         {
140                 .procname       = "dev",
141                 .maxlen         = 0,
142                 .mode           = 0555,
143                 .child          = raid_dir_table,
144         },
145         {  }
146 };
147
148 static const struct block_device_operations md_fops;
149
150 static int start_readonly;
151
152 /* bio_clone_mddev
153  * like bio_clone, but with a local bio set
154  */
155
156 static void mddev_bio_destructor(struct bio *bio)
157 {
158         mddev_t *mddev, **mddevp;
159
160         mddevp = (void*)bio;
161         mddev = mddevp[-1];
162
163         bio_free(bio, mddev->bio_set);
164 }
165
166 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
167                             mddev_t *mddev)
168 {
169         struct bio *b;
170         mddev_t **mddevp;
171
172         if (!mddev || !mddev->bio_set)
173                 return bio_alloc(gfp_mask, nr_iovecs);
174
175         b = bio_alloc_bioset(gfp_mask, nr_iovecs,
176                              mddev->bio_set);
177         if (!b)
178                 return NULL;
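        /* mddev->bio_set is assumed to have been created with front_pad
         * room for one mddev_t pointer just in front of each bio; stash
         * the owner there so mddev_bio_destructor() can recover it via
         * mddevp[-1].
         */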
179         mddevp = (void*)b;
180         mddevp[-1] = mddev;
181         b->bi_destructor = mddev_bio_destructor;
182         return b;
183 }
184 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
185
186 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
187                             mddev_t *mddev)
188 {
189         struct bio *b;
190         mddev_t **mddevp;
191
192         if (!mddev || !mddev->bio_set)
193                 return bio_clone(bio, gfp_mask);
194
195         b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
196                              mddev->bio_set);
197         if (!b)
198                 return NULL;
199         mddevp = (void*)b;
200         mddevp[-1] = mddev;
201         b->bi_destructor = mddev_bio_destructor;
202         __bio_clone(b, bio);
203         if (bio_integrity(bio)) {
204                 int ret;
205
206                 ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
207
208                 if (ret < 0) {
209                         bio_put(b);
210                         return NULL;
211                 }
212         }
213
214         return b;
215 }
216 EXPORT_SYMBOL_GPL(bio_clone_mddev);
217
218 void md_trim_bio(struct bio *bio, int offset, int size)
219 {
220         /* 'bio' is a cloned bio which we need to trim to match
221          * the given offset and size.
222          * This requires adjusting bi_sector, bi_size, and bi_io_vec
223          */
224         int i;
225         struct bio_vec *bvec;
226         int sofar = 0;
227
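        /* offset and size are given in 512-byte sectors; bi_size and the
         * bio_vec lengths below are byte counts, hence the << 9 conversions.
         */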
228         size <<= 9;
229         if (offset == 0 && size == bio->bi_size)
230                 return;
231
232         bio->bi_sector += offset;
233         bio->bi_size = size;
234         offset <<= 9;
235         clear_bit(BIO_SEG_VALID, &bio->bi_flags);
236
237         while (bio->bi_idx < bio->bi_vcnt &&
238                bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
239                 /* remove this whole bio_vec */
240                 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
241                 bio->bi_idx++;
242         }
243         if (bio->bi_idx < bio->bi_vcnt) {
244                 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
245                 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
246         }
247         /* avoid any complications with bi_idx being non-zero */
248         if (bio->bi_idx) {
249                 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
250                         (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
251                 bio->bi_vcnt -= bio->bi_idx;
252                 bio->bi_idx = 0;
253         }
254         /* Make sure vcnt and last bv are not too big */
255         bio_for_each_segment(bvec, bio, i) {
256                 if (sofar + bvec->bv_len > size)
257                         bvec->bv_len = size - sofar;
258                 if (bvec->bv_len == 0) {
259                         bio->bi_vcnt = i;
260                         break;
261                 }
262                 sofar += bvec->bv_len;
263         }
264 }
265 EXPORT_SYMBOL_GPL(md_trim_bio);
266
267 /*
268  * We have a system wide 'event count' that is incremented
269  * on any 'interesting' event, and readers of /proc/mdstat
270  * can use 'poll' or 'select' to find out when the event
271  * count increases.
272  *
273  * Events are:
274  *  start array, stop array, error, add device, remove device,
275  *  start build, activate spare
276  */
277 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
278 static atomic_t md_event_count;
279 void md_new_event(mddev_t *mddev)
280 {
281         atomic_inc(&md_event_count);
282         wake_up(&md_event_waiters);
283 }
284 EXPORT_SYMBOL_GPL(md_new_event);
285
286 /* Alternate version that can be called from interrupts
287  * when calling sysfs_notify isn't needed.
288  */
289 static void md_new_event_inintr(mddev_t *mddev)
290 {
291         atomic_inc(&md_event_count);
292         wake_up(&md_event_waiters);
293 }
294
295 /*
296  * Enables iteration over all existing md arrays
297  * all_mddevs_lock protects this list.
298  */
299 static LIST_HEAD(all_mddevs);
300 static DEFINE_SPINLOCK(all_mddevs_lock);
301
302
303 /*
304  * iterates through all used mddevs in the system.
305  * We take care to grab the all_mddevs_lock whenever navigating
306  * the list, and to always hold a refcount when unlocked.
307  * Any code which breaks out of this loop while owning
308  * a reference to the current mddev must mddev_put it.
309  */
310 #define for_each_mddev(mddev,tmp)                                       \
311                                                                         \
312         for (({ spin_lock(&all_mddevs_lock);                            \
313                 tmp = all_mddevs.next;                                  \
314                 mddev = NULL;});                                        \
315              ({ if (tmp != &all_mddevs)                                 \
316                         mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
317                 spin_unlock(&all_mddevs_lock);                          \
318                 if (mddev) mddev_put(mddev);                            \
319                 mddev = list_entry(tmp, mddev_t, all_mddevs);           \
320                 tmp != &all_mddevs;});                                  \
321              ({ spin_lock(&all_mddevs_lock);                            \
322                 tmp = tmp->next;})                                      \
323                 )
324
325
326 /* Rather than calling directly into the personality make_request function,
327  * IO requests come here first so that we can check if the device is
328  * being suspended pending a reconfiguration.
329  * We hold a refcount over the call to ->make_request.  By the time that
330  * call has finished, the bio has been linked into some internal structure
331  * and so is visible to ->quiesce(), so we don't need the refcount any more.
332  */
333 static int md_make_request(struct request_queue *q, struct bio *bio)
334 {
335         const int rw = bio_data_dir(bio);
336         mddev_t *mddev = q->queuedata;
337         int rv;
338         int cpu;
339         unsigned int sectors;
340
341         if (mddev == NULL || mddev->pers == NULL
342             || !mddev->ready) {
343                 bio_io_error(bio);
344                 return 0;
345         }
346         smp_rmb(); /* Ensure implications of  'active' are visible */
347         rcu_read_lock();
348         if (mddev->suspended) {
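                /* The array is being quiesced: sleep on sb_wait until it is
                 * resumed.  The RCU read lock must be dropped around
                 * schedule() because sleeping is not allowed inside an RCU
                 * read-side critical section; the lock pairs with the
                 * synchronize_rcu() in mddev_suspend().
                 */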
349                 DEFINE_WAIT(__wait);
350                 for (;;) {
351                         prepare_to_wait(&mddev->sb_wait, &__wait,
352                                         TASK_UNINTERRUPTIBLE);
353                         if (!mddev->suspended)
354                                 break;
355                         rcu_read_unlock();
356                         schedule();
357                         rcu_read_lock();
358                 }
359                 finish_wait(&mddev->sb_wait, &__wait);
360         }
361         atomic_inc(&mddev->active_io);
362         rcu_read_unlock();
363
364         /*
365          * save the sectors now since our bio can
366          * go away inside make_request
367          */
368         sectors = bio_sectors(bio);
369         rv = mddev->pers->make_request(mddev, bio);
370
371         cpu = part_stat_lock();
372         part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
373         part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
374         part_stat_unlock();
375
376         if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
377                 wake_up(&mddev->sb_wait);
378
379         return rv;
380 }
381
382 /* mddev_suspend makes sure no new requests are submitted
383  * to the device, and that any requests that have been submitted
384  * are completely handled.
385  * Once ->stop is called and completes, the module will be completely
386  * unused.
387  */
388 void mddev_suspend(mddev_t *mddev)
389 {
390         BUG_ON(mddev->suspended);
391         mddev->suspended = 1;
392         synchronize_rcu();
393         wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
394         mddev->pers->quiesce(mddev, 1);
395 }
396 EXPORT_SYMBOL_GPL(mddev_suspend);
397
398 void mddev_resume(mddev_t *mddev)
399 {
400         mddev->suspended = 0;
401         wake_up(&mddev->sb_wait);
402         mddev->pers->quiesce(mddev, 0);
403
404         md_wakeup_thread(mddev->thread);
405         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
406 }
407 EXPORT_SYMBOL_GPL(mddev_resume);
408
409 int mddev_congested(mddev_t *mddev, int bits)
410 {
411         return mddev->suspended;
412 }
413 EXPORT_SYMBOL(mddev_congested);
414
415 /*
416  * Generic flush handling for md
417  */
418
419 static void md_end_flush(struct bio *bio, int err)
420 {
421         mdk_rdev_t *rdev = bio->bi_private;
422         mddev_t *mddev = rdev->mddev;
423
424         rdev_dec_pending(rdev, mddev);
425
426         if (atomic_dec_and_test(&mddev->flush_pending)) {
427                 /* The pre-request flush has finished */
428                 queue_work(md_wq, &mddev->flush_work);
429         }
430         bio_put(bio);
431 }
432
433 static void md_submit_flush_data(struct work_struct *ws);
434
435 static void submit_flushes(struct work_struct *ws)
436 {
437         mddev_t *mddev = container_of(ws, mddev_t, flush_work);
438         mdk_rdev_t *rdev;
439
440         INIT_WORK(&mddev->flush_work, md_submit_flush_data);
441         atomic_set(&mddev->flush_pending, 1);
442         rcu_read_lock();
443         list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
444                 if (rdev->raid_disk >= 0 &&
445                     !test_bit(Faulty, &rdev->flags)) {
446                         /* Take two references, one is dropped
447                          * when request finishes, one after
448                          * we reclaim rcu_read_lock
449                          */
450                         struct bio *bi;
451                         atomic_inc(&rdev->nr_pending);
452                         atomic_inc(&rdev->nr_pending);
453                         rcu_read_unlock();
454                         bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
455                         bi->bi_end_io = md_end_flush;
456                         bi->bi_private = rdev;
457                         bi->bi_bdev = rdev->bdev;
458                         atomic_inc(&mddev->flush_pending);
459                         submit_bio(WRITE_FLUSH, bi);
460                         rcu_read_lock();
461                         rdev_dec_pending(rdev, mddev);
462                 }
463         rcu_read_unlock();
464         if (atomic_dec_and_test(&mddev->flush_pending))
465                 queue_work(md_wq, &mddev->flush_work);
466 }
467
468 static void md_submit_flush_data(struct work_struct *ws)
469 {
470         mddev_t *mddev = container_of(ws, mddev_t, flush_work);
471         struct bio *bio = mddev->flush_bio;
472
473         if (bio->bi_size == 0)
474                 /* an empty barrier - all done */
475                 bio_endio(bio, 0);
476         else {
477                 bio->bi_rw &= ~REQ_FLUSH;
478                 if (mddev->pers->make_request(mddev, bio))
479                         generic_make_request(bio);
480         }
481
482         mddev->flush_bio = NULL;
483         wake_up(&mddev->sb_wait);
484 }
485
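/* Handle a REQ_FLUSH bio: record it as mddev->flush_bio (only one may be
 * outstanding at a time), queue submit_flushes() to send an empty flush to
 * every in-array, non-faulty rdev, and let md_submit_flush_data() finish the
 * bio (or resubmit it with REQ_FLUSH cleared if it carries data) once all of
 * those flushes complete.
 */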
486 void md_flush_request(mddev_t *mddev, struct bio *bio)
487 {
488         spin_lock_irq(&mddev->write_lock);
489         wait_event_lock_irq(mddev->sb_wait,
490                             !mddev->flush_bio,
491                             mddev->write_lock, /*nothing*/);
492         mddev->flush_bio = bio;
493         spin_unlock_irq(&mddev->write_lock);
494
495         INIT_WORK(&mddev->flush_work, submit_flushes);
496         queue_work(md_wq, &mddev->flush_work);
497 }
498 EXPORT_SYMBOL(md_flush_request);
499
500 /* Support for plugging.
501  * This mirrors the plugging support in request_queue, but does not
502  * require having a whole queue or request structures.
503  * We allocate an md_plug_cb for each md device and each thread it gets
504  * plugged on.  This links to the private plug_handle structure in the
505  * personality data where we keep a count of the number of outstanding
506  * plugs so other code can see if a plug is active.
507  */
508 struct md_plug_cb {
509         struct blk_plug_cb cb;
510         mddev_t *mddev;
511 };
512
513 static void plugger_unplug(struct blk_plug_cb *cb)
514 {
515         struct md_plug_cb *mdcb = container_of(cb, struct md_plug_cb, cb);
516         if (atomic_dec_and_test(&mdcb->mddev->plug_cnt))
517                 md_wakeup_thread(mdcb->mddev->thread);
518         kfree(mdcb);
519 }
520
521 /* Check that an unplug wakeup will come shortly.
522  * If not, wakeup the md thread immediately
523  */
524 int mddev_check_plugged(mddev_t *mddev)
525 {
526         struct blk_plug *plug = current->plug;
527         struct md_plug_cb *mdcb;
528
529         if (!plug)
530                 return 0;
531
532         list_for_each_entry(mdcb, &plug->cb_list, cb.list) {
533                 if (mdcb->cb.callback == plugger_unplug &&
534                     mdcb->mddev == mddev) {
535                         /* Already on the list, move to top */
536                         if (mdcb != list_first_entry(&plug->cb_list,
537                                                     struct md_plug_cb,
538                                                     cb.list))
539                                 list_move(&mdcb->cb.list, &plug->cb_list);
540                         return 1;
541                 }
542         }
543         /* Not currently on the callback list */
544         mdcb = kmalloc(sizeof(*mdcb), GFP_ATOMIC);
545         if (!mdcb)
546                 return 0;
547
548         mdcb->mddev = mddev;
549         mdcb->cb.callback = plugger_unplug;
550         atomic_inc(&mddev->plug_cnt);
551         list_add(&mdcb->cb.list, &plug->cb_list);
552         return 1;
553 }
554 EXPORT_SYMBOL_GPL(mddev_check_plugged);
555
556 static inline mddev_t *mddev_get(mddev_t *mddev)
557 {
558         atomic_inc(&mddev->active);
559         return mddev;
560 }
561
562 static void mddev_delayed_delete(struct work_struct *ws);
563
564 static void mddev_put(mddev_t *mddev)
565 {
566         struct bio_set *bs = NULL;
567
568         if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
569                 return;
570         if (!mddev->raid_disks && list_empty(&mddev->disks) &&
571             mddev->ctime == 0 && !mddev->hold_active) {
572                 /* Array is not configured at all, and not held active,
573                  * so destroy it */
574                 list_del(&mddev->all_mddevs);
575                 bs = mddev->bio_set;
576                 mddev->bio_set = NULL;
577                 if (mddev->gendisk) {
578                         /* We did a probe so need to clean up.  Call
579                          * queue_work inside the spinlock so that
580                          * flush_workqueue() after mddev_find will
581                          * succeed in waiting for the work to be done.
582                          */
583                         INIT_WORK(&mddev->del_work, mddev_delayed_delete);
584                         queue_work(md_misc_wq, &mddev->del_work);
585                 } else
586                         kfree(mddev);
587         }
588         spin_unlock(&all_mddevs_lock);
589         if (bs)
590                 bioset_free(bs);
591 }
592
593 void mddev_init(mddev_t *mddev)
594 {
595         mutex_init(&mddev->open_mutex);
596         mutex_init(&mddev->reconfig_mutex);
597         mutex_init(&mddev->bitmap_info.mutex);
598         INIT_LIST_HEAD(&mddev->disks);
599         INIT_LIST_HEAD(&mddev->all_mddevs);
600         init_timer(&mddev->safemode_timer);
601         atomic_set(&mddev->active, 1);
602         atomic_set(&mddev->openers, 0);
603         atomic_set(&mddev->active_io, 0);
604         atomic_set(&mddev->plug_cnt, 0);
605         spin_lock_init(&mddev->write_lock);
606         atomic_set(&mddev->flush_pending, 0);
607         init_waitqueue_head(&mddev->sb_wait);
608         init_waitqueue_head(&mddev->recovery_wait);
609         mddev->reshape_position = MaxSector;
610         mddev->resync_min = 0;
611         mddev->resync_max = MaxSector;
612         mddev->level = LEVEL_NONE;
613 }
614 EXPORT_SYMBOL_GPL(mddev_init);
615
616 static mddev_t * mddev_find(dev_t unit)
617 {
618         mddev_t *mddev, *new = NULL;
619
620         if (unit && MAJOR(unit) != MD_MAJOR)
621                 unit &= ~((1<<MdpMinorShift)-1);
622
623  retry:
624         spin_lock(&all_mddevs_lock);
625
626         if (unit) {
627                 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
628                         if (mddev->unit == unit) {
629                                 mddev_get(mddev);
630                                 spin_unlock(&all_mddevs_lock);
631                                 kfree(new);
632                                 return mddev;
633                         }
634
635                 if (new) {
636                         list_add(&new->all_mddevs, &all_mddevs);
637                         spin_unlock(&all_mddevs_lock);
638                         new->hold_active = UNTIL_IOCTL;
639                         return new;
640                 }
641         } else if (new) {
642                 /* find an unused unit number */
643                 static int next_minor = 512;
644                 int start = next_minor;
645                 int is_free = 0;
646                 int dev = 0;
647                 while (!is_free) {
648                         dev = MKDEV(MD_MAJOR, next_minor);
649                         next_minor++;
650                         if (next_minor > MINORMASK)
651                                 next_minor = 0;
652                         if (next_minor == start) {
653                                 /* Oh dear, all in use. */
654                                 spin_unlock(&all_mddevs_lock);
655                                 kfree(new);
656                                 return NULL;
657                         }
658                                 
659                         is_free = 1;
660                         list_for_each_entry(mddev, &all_mddevs, all_mddevs)
661                                 if (mddev->unit == dev) {
662                                         is_free = 0;
663                                         break;
664                                 }
665                 }
666                 new->unit = dev;
667                 new->md_minor = MINOR(dev);
668                 new->hold_active = UNTIL_STOP;
669                 list_add(&new->all_mddevs, &all_mddevs);
670                 spin_unlock(&all_mddevs_lock);
671                 return new;
672         }
673         spin_unlock(&all_mddevs_lock);
674
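        /* Nothing matched and no candidate was pre-allocated: allocate one
         * outside the lock, initialise it, and retry the search.  If another
         * caller registers the same unit in the meantime, the retry path
         * above frees this allocation via kfree(new).
         */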
675         new = kzalloc(sizeof(*new), GFP_KERNEL);
676         if (!new)
677                 return NULL;
678
679         new->unit = unit;
680         if (MAJOR(unit) == MD_MAJOR)
681                 new->md_minor = MINOR(unit);
682         else
683                 new->md_minor = MINOR(unit) >> MdpMinorShift;
684
685         mddev_init(new);
686
687         goto retry;
688 }
689
690 static inline int mddev_lock(mddev_t * mddev)
691 {
692         return mutex_lock_interruptible(&mddev->reconfig_mutex);
693 }
694
695 static inline int mddev_is_locked(mddev_t *mddev)
696 {
697         return mutex_is_locked(&mddev->reconfig_mutex);
698 }
699
700 static inline int mddev_trylock(mddev_t * mddev)
701 {
702         return mutex_trylock(&mddev->reconfig_mutex);
703 }
704
705 static struct attribute_group md_redundancy_group;
706
707 static void mddev_unlock(mddev_t * mddev)
708 {
709         if (mddev->to_remove) {
710                 /* These cannot be removed under reconfig_mutex as
711                  * an access to the files will try to take reconfig_mutex
712                  * while holding the file unremovable, which leads to
713                  * a deadlock.
714                  * So we set sysfs_active while the removal is happening,
715                  * and anything else which might set ->to_remove or may
716                  * otherwise change the sysfs namespace will fail with
717                  * -EBUSY if sysfs_active is still set.
718                  * We set sysfs_active under reconfig_mutex and elsewhere
719                  * test it under the same mutex to ensure its correct value
720                  * is seen.
721                  */
722                 struct attribute_group *to_remove = mddev->to_remove;
723                 mddev->to_remove = NULL;
724                 mddev->sysfs_active = 1;
725                 mutex_unlock(&mddev->reconfig_mutex);
726
727                 if (mddev->kobj.sd) {
728                         if (to_remove != &md_redundancy_group)
729                                 sysfs_remove_group(&mddev->kobj, to_remove);
730                         if (mddev->pers == NULL ||
731                             mddev->pers->sync_request == NULL) {
732                                 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
733                                 if (mddev->sysfs_action)
734                                         sysfs_put(mddev->sysfs_action);
735                                 mddev->sysfs_action = NULL;
736                         }
737                 }
738                 mddev->sysfs_active = 0;
739         } else
740                 mutex_unlock(&mddev->reconfig_mutex);
741
742         md_wakeup_thread(mddev->thread);
743 }
744
745 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
746 {
747         mdk_rdev_t *rdev;
748
749         list_for_each_entry(rdev, &mddev->disks, same_set)
750                 if (rdev->desc_nr == nr)
751                         return rdev;
752
753         return NULL;
754 }
755
756 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
757 {
758         mdk_rdev_t *rdev;
759
760         list_for_each_entry(rdev, &mddev->disks, same_set)
761                 if (rdev->bdev->bd_dev == dev)
762                         return rdev;
763
764         return NULL;
765 }
766
767 static struct mdk_personality *find_pers(int level, char *clevel)
768 {
769         struct mdk_personality *pers;
770         list_for_each_entry(pers, &pers_list, list) {
771                 if (level != LEVEL_NONE && pers->level == level)
772                         return pers;
773                 if (strcmp(pers->name, clevel)==0)
774                         return pers;
775         }
776         return NULL;
777 }
778
779 /* return the offset of the super block in 512byte sectors */
780 static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
781 {
782         sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
783         return MD_NEW_SIZE_SECTORS(num_sectors);
784 }
785
786 static int alloc_disk_sb(mdk_rdev_t * rdev)
787 {
788         if (rdev->sb_page)
789                 MD_BUG();
790
791         rdev->sb_page = alloc_page(GFP_KERNEL);
792         if (!rdev->sb_page) {
793                 printk(KERN_ALERT "md: out of memory.\n");
794                 return -ENOMEM;
795         }
796
797         return 0;
798 }
799
800 static void free_disk_sb(mdk_rdev_t * rdev)
801 {
802         if (rdev->sb_page) {
803                 put_page(rdev->sb_page);
804                 rdev->sb_loaded = 0;
805                 rdev->sb_page = NULL;
806                 rdev->sb_start = 0;
807                 rdev->sectors = 0;
808         }
809         if (rdev->bb_page) {
810                 put_page(rdev->bb_page);
811                 rdev->bb_page = NULL;
812         }
813 }
814
815
816 static void super_written(struct bio *bio, int error)
817 {
818         mdk_rdev_t *rdev = bio->bi_private;
819         mddev_t *mddev = rdev->mddev;
820
821         if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
822                 printk("md: super_written gets error=%d, uptodate=%d\n",
823                        error, test_bit(BIO_UPTODATE, &bio->bi_flags));
824                 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
825                 md_error(mddev, rdev);
826         }
827
828         if (atomic_dec_and_test(&mddev->pending_writes))
829                 wake_up(&mddev->sb_wait);
830         bio_put(bio);
831 }
832
833 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
834                    sector_t sector, int size, struct page *page)
835 {
836         /* write first size bytes of page to sector of rdev
837          * Increment mddev->pending_writes before returning
838          * and decrement it on completion, waking up sb_wait
839          * if zero is reached.
840          * If an error occurred, call md_error
841          */
842         struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
843
844         bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
845         bio->bi_sector = sector;
846         bio_add_page(bio, page, size, 0);
847         bio->bi_private = rdev;
848         bio->bi_end_io = super_written;
849
850         atomic_inc(&mddev->pending_writes);
851         submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
852 }
853
854 void md_super_wait(mddev_t *mddev)
855 {
856         /* wait for all superblock writes that were scheduled to complete */
857         DEFINE_WAIT(wq);
858         for(;;) {
859                 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
860                 if (atomic_read(&mddev->pending_writes)==0)
861                         break;
862                 schedule();
863         }
864         finish_wait(&mddev->sb_wait, &wq);
865 }
866
867 static void bi_complete(struct bio *bio, int error)
868 {
869         complete((struct completion*)bio->bi_private);
870 }
871
872 int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
873                  struct page *page, int rw, bool metadata_op)
874 {
875         struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
876         struct completion event;
877         int ret;
878
879         rw |= REQ_SYNC;
880
881         bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
882                 rdev->meta_bdev : rdev->bdev;
883         if (metadata_op)
884                 bio->bi_sector = sector + rdev->sb_start;
885         else
886                 bio->bi_sector = sector + rdev->data_offset;
887         bio_add_page(bio, page, size, 0);
888         init_completion(&event);
889         bio->bi_private = &event;
890         bio->bi_end_io = bi_complete;
891         submit_bio(rw, bio);
892         wait_for_completion(&event);
893
894         ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
895         bio_put(bio);
896         return ret;
897 }
898 EXPORT_SYMBOL_GPL(sync_page_io);
899
900 static int read_disk_sb(mdk_rdev_t * rdev, int size)
901 {
902         char b[BDEVNAME_SIZE];
903         if (!rdev->sb_page) {
904                 MD_BUG();
905                 return -EINVAL;
906         }
907         if (rdev->sb_loaded)
908                 return 0;
909
910
911         if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
912                 goto fail;
913         rdev->sb_loaded = 1;
914         return 0;
915
916 fail:
917         printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
918                 bdevname(rdev->bdev,b));
919         return -EINVAL;
920 }
921
922 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
923 {
924         return  sb1->set_uuid0 == sb2->set_uuid0 &&
925                 sb1->set_uuid1 == sb2->set_uuid1 &&
926                 sb1->set_uuid2 == sb2->set_uuid2 &&
927                 sb1->set_uuid3 == sb2->set_uuid3;
928 }
929
930 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
931 {
932         int ret;
933         mdp_super_t *tmp1, *tmp2;
934
935         tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
936         tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
937
938         if (!tmp1 || !tmp2) {
939                 ret = 0;
940                 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
941                 goto abort;
942         }
943
944         *tmp1 = *sb1;
945         *tmp2 = *sb2;
946
947         /*
948          * nr_disks is not constant
949          */
950         tmp1->nr_disks = 0;
951         tmp2->nr_disks = 0;
952
953         ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
954 abort:
955         kfree(tmp1);
956         kfree(tmp2);
957         return ret;
958 }
959
960
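/* Fold a 32-bit checksum into 16 bits: add the two halves, then add the
 * carry back in, so the result always fits in the low 16 bits.
 */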
961 static u32 md_csum_fold(u32 csum)
962 {
963         csum = (csum & 0xffff) + (csum >> 16);
964         return (csum & 0xffff) + (csum >> 16);
965 }
966
967 static unsigned int calc_sb_csum(mdp_super_t * sb)
968 {
969         u64 newcsum = 0;
970         u32 *sb32 = (u32*)sb;
971         int i;
972         unsigned int disk_csum, csum;
973
974         disk_csum = sb->sb_csum;
975         sb->sb_csum = 0;
976
977         for (i = 0; i < MD_SB_BYTES/4 ; i++)
978                 newcsum += sb32[i];
979         csum = (newcsum & 0xffffffff) + (newcsum>>32);
980
981
982 #ifdef CONFIG_ALPHA
983         /* This used to use csum_partial, which was wrong for several
984          * reasons including that different results are returned on
985          * different architectures.  It isn't critical that we get exactly
986          * the same return value as before (we always csum_fold before
987          * testing, and that removes any differences).  However as we
988          * know that csum_partial always returned a 16bit value on
989          * alphas, do a fold to maximise conformity to previous behaviour.
990          */
991         sb->sb_csum = md_csum_fold(disk_csum);
992 #else
993         sb->sb_csum = disk_csum;
994 #endif
995         return csum;
996 }
997
998
999 /*
1000  * Handle superblock details.
1001  * We want to be able to handle multiple superblock formats
1002  * so we have a common interface to them all, and an array of
1003  * different handlers.
1004  * We rely on user-space to write the initial superblock, and support
1005  * reading and updating of superblocks.
1006  * Interface methods are:
1007  *   int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
1008  *      loads and validates a superblock on dev.
1009  *      if refdev != NULL, compare superblocks on both devices
1010  *    Return:
1011  *      0 - dev has a superblock that is compatible with refdev
1012  *      1 - dev has a superblock that is compatible and newer than refdev
1013  *          so dev should be used as the refdev in future
1014  *     -EINVAL superblock incompatible or invalid
1015  *     -othererror e.g. -EIO
1016  *
1017  *   int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
1018  *      Verify that dev is acceptable into mddev.
1019  *       The first time, mddev->raid_disks will be 0, and data from
1020  *       dev should be merged in.  Subsequent calls check that dev
1021  *       is new enough.  Return 0 or -EINVAL
1022  *
1023  *   void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
1024  *     Update the superblock for rdev with data in mddev
1025  *     This does not write to disc.
1026  *
1027  */
1028
1029 struct super_type  {
1030         char                *name;
1031         struct module       *owner;
1032         int                 (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
1033                                           int minor_version);
1034         int                 (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
1035         void                (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
1036         unsigned long long  (*rdev_size_change)(mdk_rdev_t *rdev,
1037                                                 sector_t num_sectors);
1038 };
1039
1040 /*
1041  * Check that the given mddev has no bitmap.
1042  *
1043  * This function is called from the run method of all personalities that do not
1044  * support bitmaps. It prints an error message and returns non-zero if mddev
1045  * has a bitmap. Otherwise, it returns 0.
1046  *
1047  */
1048 int md_check_no_bitmap(mddev_t *mddev)
1049 {
1050         if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1051                 return 0;
1052         printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
1053                 mdname(mddev), mddev->pers->name);
1054         return 1;
1055 }
1056 EXPORT_SYMBOL(md_check_no_bitmap);
1057
1058 /*
1059  * load_super for 0.90.0 
1060  */
1061 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1062 {
1063         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1064         mdp_super_t *sb;
1065         int ret;
1066
1067         /*
1068          * Calculate the position of the superblock (512byte sectors),
1069          * it's at the end of the disk.
1070          *
1071          * It also happens to be a multiple of 4Kb.
1072          */
1073         rdev->sb_start = calc_dev_sboffset(rdev);
1074
1075         ret = read_disk_sb(rdev, MD_SB_BYTES);
1076         if (ret) return ret;
1077
1078         ret = -EINVAL;
1079
1080         bdevname(rdev->bdev, b);
1081         sb = page_address(rdev->sb_page);
1082
1083         if (sb->md_magic != MD_SB_MAGIC) {
1084                 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
1085                        b);
1086                 goto abort;
1087         }
1088
1089         if (sb->major_version != 0 ||
1090             sb->minor_version < 90 ||
1091             sb->minor_version > 91) {
1092                 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
1093                         sb->major_version, sb->minor_version,
1094                         b);
1095                 goto abort;
1096         }
1097
1098         if (sb->raid_disks <= 0)
1099                 goto abort;
1100
1101         if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1102                 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
1103                         b);
1104                 goto abort;
1105         }
1106
1107         rdev->preferred_minor = sb->md_minor;
1108         rdev->data_offset = 0;
1109         rdev->sb_size = MD_SB_BYTES;
1110         rdev->badblocks.shift = -1;
1111
1112         if (sb->level == LEVEL_MULTIPATH)
1113                 rdev->desc_nr = -1;
1114         else
1115                 rdev->desc_nr = sb->this_disk.number;
1116
1117         if (!refdev) {
1118                 ret = 1;
1119         } else {
1120                 __u64 ev1, ev2;
1121                 mdp_super_t *refsb = page_address(refdev->sb_page);
1122                 if (!uuid_equal(refsb, sb)) {
1123                         printk(KERN_WARNING "md: %s has different UUID to %s\n",
1124                                 b, bdevname(refdev->bdev,b2));
1125                         goto abort;
1126                 }
1127                 if (!sb_equal(refsb, sb)) {
1128                         printk(KERN_WARNING "md: %s has same UUID"
1129                                " but different superblock to %s\n",
1130                                b, bdevname(refdev->bdev, b2));
1131                         goto abort;
1132                 }
1133                 ev1 = md_event(sb);
1134                 ev2 = md_event(refsb);
1135                 if (ev1 > ev2)
1136                         ret = 1;
1137                 else 
1138                         ret = 0;
1139         }
1140         rdev->sectors = rdev->sb_start;
1141
1142         if (rdev->sectors < sb->size * 2 && sb->level > 1)
1143                 /* "this cannot possibly happen" ... */
1144                 ret = -EINVAL;
1145
1146  abort:
1147         return ret;
1148 }
1149
1150 /*
1151  * validate_super for 0.90.0
1152  */
1153 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1154 {
1155         mdp_disk_t *desc;
1156         mdp_super_t *sb = page_address(rdev->sb_page);
1157         __u64 ev1 = md_event(sb);
1158
1159         rdev->raid_disk = -1;
1160         clear_bit(Faulty, &rdev->flags);
1161         clear_bit(In_sync, &rdev->flags);
1162         clear_bit(WriteMostly, &rdev->flags);
1163
1164         if (mddev->raid_disks == 0) {
1165                 mddev->major_version = 0;
1166                 mddev->minor_version = sb->minor_version;
1167                 mddev->patch_version = sb->patch_version;
1168                 mddev->external = 0;
1169                 mddev->chunk_sectors = sb->chunk_size >> 9;
1170                 mddev->ctime = sb->ctime;
1171                 mddev->utime = sb->utime;
1172                 mddev->level = sb->level;
1173                 mddev->clevel[0] = 0;
1174                 mddev->layout = sb->layout;
1175                 mddev->raid_disks = sb->raid_disks;
1176                 mddev->dev_sectors = sb->size * 2;
1177                 mddev->events = ev1;
1178                 mddev->bitmap_info.offset = 0;
1179                 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1180
1181                 if (mddev->minor_version >= 91) {
1182                         mddev->reshape_position = sb->reshape_position;
1183                         mddev->delta_disks = sb->delta_disks;
1184                         mddev->new_level = sb->new_level;
1185                         mddev->new_layout = sb->new_layout;
1186                         mddev->new_chunk_sectors = sb->new_chunk >> 9;
1187                 } else {
1188                         mddev->reshape_position = MaxSector;
1189                         mddev->delta_disks = 0;
1190                         mddev->new_level = mddev->level;
1191                         mddev->new_layout = mddev->layout;
1192                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1193                 }
1194
1195                 if (sb->state & (1<<MD_SB_CLEAN))
1196                         mddev->recovery_cp = MaxSector;
1197                 else {
1198                         if (sb->events_hi == sb->cp_events_hi && 
1199                                 sb->events_lo == sb->cp_events_lo) {
1200                                 mddev->recovery_cp = sb->recovery_cp;
1201                         } else
1202                                 mddev->recovery_cp = 0;
1203                 }
1204
1205                 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1206                 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1207                 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1208                 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1209
1210                 mddev->max_disks = MD_SB_DISKS;
1211
1212                 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1213                     mddev->bitmap_info.file == NULL)
1214                         mddev->bitmap_info.offset =
1215                                 mddev->bitmap_info.default_offset;
1216
1217         } else if (mddev->pers == NULL) {
1218                 /* Insist on good event counter while assembling, except
1219                  * for spares (which don't need an event count) */
1220                 ++ev1;
1221                 if (sb->disks[rdev->desc_nr].state & (
1222                             (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1223                         if (ev1 < mddev->events) 
1224                                 return -EINVAL;
1225         } else if (mddev->bitmap) {
1226                 /* if adding to array with a bitmap, then we can accept an
1227                  * older device ... but not too old.
1228                  */
1229                 if (ev1 < mddev->bitmap->events_cleared)
1230                         return 0;
1231         } else {
1232                 if (ev1 < mddev->events)
1233                         /* just a hot-add of a new device, leave raid_disk at -1 */
1234                         return 0;
1235         }
1236
1237         if (mddev->level != LEVEL_MULTIPATH) {
1238                 desc = sb->disks + rdev->desc_nr;
1239
1240                 if (desc->state & (1<<MD_DISK_FAULTY))
1241                         set_bit(Faulty, &rdev->flags);
1242                 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1243                             desc->raid_disk < mddev->raid_disks */) {
1244                         set_bit(In_sync, &rdev->flags);
1245                         rdev->raid_disk = desc->raid_disk;
1246                 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1247                         /* active but not in sync implies recovery up to
1248                          * reshape position.  We don't know exactly where
1249                          * that is, so set to zero for now */
1250                         if (mddev->minor_version >= 91) {
1251                                 rdev->recovery_offset = 0;
1252                                 rdev->raid_disk = desc->raid_disk;
1253                         }
1254                 }
1255                 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1256                         set_bit(WriteMostly, &rdev->flags);
1257         } else /* MULTIPATH are always insync */
1258                 set_bit(In_sync, &rdev->flags);
1259         return 0;
1260 }
1261
1262 /*
1263  * sync_super for 0.90.0
1264  */
1265 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1266 {
1267         mdp_super_t *sb;
1268         mdk_rdev_t *rdev2;
1269         int next_spare = mddev->raid_disks;
1270
1271
1272         /* make rdev->sb match mddev data..
1273          *
1274          * 1/ zero out disks
1275          * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1276          * 3/ any empty disks < next_spare become removed
1277          *
1278          * disks[0] gets initialised to REMOVED because
1279          * we cannot be sure from other fields if it has
1280          * been initialised or not.
1281          */
1282         int i;
1283         int active=0, working=0,failed=0,spare=0,nr_disks=0;
1284
1285         rdev->sb_size = MD_SB_BYTES;
1286
1287         sb = page_address(rdev->sb_page);
1288
1289         memset(sb, 0, sizeof(*sb));
1290
1291         sb->md_magic = MD_SB_MAGIC;
1292         sb->major_version = mddev->major_version;
1293         sb->patch_version = mddev->patch_version;
1294         sb->gvalid_words  = 0; /* ignored */
1295         memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1296         memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1297         memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1298         memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1299
1300         sb->ctime = mddev->ctime;
1301         sb->level = mddev->level;
1302         sb->size = mddev->dev_sectors / 2;
1303         sb->raid_disks = mddev->raid_disks;
1304         sb->md_minor = mddev->md_minor;
1305         sb->not_persistent = 0;
1306         sb->utime = mddev->utime;
1307         sb->state = 0;
1308         sb->events_hi = (mddev->events>>32);
1309         sb->events_lo = (u32)mddev->events;
1310
1311         if (mddev->reshape_position == MaxSector)
1312                 sb->minor_version = 90;
1313         else {
1314                 sb->minor_version = 91;
1315                 sb->reshape_position = mddev->reshape_position;
1316                 sb->new_level = mddev->new_level;
1317                 sb->delta_disks = mddev->delta_disks;
1318                 sb->new_layout = mddev->new_layout;
1319                 sb->new_chunk = mddev->new_chunk_sectors << 9;
1320         }
1321         mddev->minor_version = sb->minor_version;
1322         if (mddev->in_sync)
1323         {
1324                 sb->recovery_cp = mddev->recovery_cp;
1325                 sb->cp_events_hi = (mddev->events>>32);
1326                 sb->cp_events_lo = (u32)mddev->events;
1327                 if (mddev->recovery_cp == MaxSector)
1328                         sb->state = (1<< MD_SB_CLEAN);
1329         } else
1330                 sb->recovery_cp = 0;
1331
1332         sb->layout = mddev->layout;
1333         sb->chunk_size = mddev->chunk_sectors << 9;
1334
1335         if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1336                 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1337
1338         sb->disks[0].state = (1<<MD_DISK_REMOVED);
1339         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1340                 mdp_disk_t *d;
1341                 int desc_nr;
1342                 int is_active = test_bit(In_sync, &rdev2->flags);
1343
1344                 if (rdev2->raid_disk >= 0 &&
1345                     sb->minor_version >= 91)
1346                         /* we have nowhere to store the recovery_offset,
1347                          * but if it is not below the reshape_position,
1348                          * we can piggy-back on that.
1349                          */
1350                         is_active = 1;
1351                 if (rdev2->raid_disk < 0 ||
1352                     test_bit(Faulty, &rdev2->flags))
1353                         is_active = 0;
1354                 if (is_active)
1355                         desc_nr = rdev2->raid_disk;
1356                 else
1357                         desc_nr = next_spare++;
1358                 rdev2->desc_nr = desc_nr;
1359                 d = &sb->disks[rdev2->desc_nr];
1360                 nr_disks++;
1361                 d->number = rdev2->desc_nr;
1362                 d->major = MAJOR(rdev2->bdev->bd_dev);
1363                 d->minor = MINOR(rdev2->bdev->bd_dev);
1364                 if (is_active)
1365                         d->raid_disk = rdev2->raid_disk;
1366                 else
1367                         d->raid_disk = rdev2->desc_nr; /* compatibility */
1368                 if (test_bit(Faulty, &rdev2->flags))
1369                         d->state = (1<<MD_DISK_FAULTY);
1370                 else if (is_active) {
1371                         d->state = (1<<MD_DISK_ACTIVE);
1372                         if (test_bit(In_sync, &rdev2->flags))
1373                                 d->state |= (1<<MD_DISK_SYNC);
1374                         active++;
1375                         working++;
1376                 } else {
1377                         d->state = 0;
1378                         spare++;
1379                         working++;
1380                 }
1381                 if (test_bit(WriteMostly, &rdev2->flags))
1382                         d->state |= (1<<MD_DISK_WRITEMOSTLY);
1383         }
1384         /* now set the "removed" and "faulty" bits on any missing devices */
1385         for (i=0 ; i < mddev->raid_disks ; i++) {
1386                 mdp_disk_t *d = &sb->disks[i];
1387                 if (d->state == 0 && d->number == 0) {
1388                         d->number = i;
1389                         d->raid_disk = i;
1390                         d->state = (1<<MD_DISK_REMOVED);
1391                         d->state |= (1<<MD_DISK_FAULTY);
1392                         failed++;
1393                 }
1394         }
1395         sb->nr_disks = nr_disks;
1396         sb->active_disks = active;
1397         sb->working_disks = working;
1398         sb->failed_disks = failed;
1399         sb->spare_disks = spare;
1400
1401         sb->this_disk = sb->disks[rdev->desc_nr];
1402         sb->sb_csum = calc_sb_csum(sb);
1403 }
1404
1405 /*
1406  * rdev_size_change for 0.90.0
1407  */
1408 static unsigned long long
1409 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1410 {
1411         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1412                 return 0; /* component must fit device */
1413         if (rdev->mddev->bitmap_info.offset)
1414                 return 0; /* can't move bitmap */
1415         rdev->sb_start = calc_dev_sboffset(rdev);
1416         if (!num_sectors || num_sectors > rdev->sb_start)
1417                 num_sectors = rdev->sb_start;
1418         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1419                        rdev->sb_page);
1420         md_super_wait(rdev->mddev);
1421         return num_sectors;
1422 }
1423
1424
1425 /*
1426  * version 1 superblock
1427  */
1428
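/* Checksum a version-1 superblock: the 256-byte header plus two bytes per
 * device slot, summed as little-endian 32-bit words (with a possible
 * trailing 16-bit word) while sb_csum is temporarily zeroed, then folded
 * from 64 to 32 bits.
 */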
1429 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1430 {
1431         __le32 disk_csum;
1432         u32 csum;
1433         unsigned long long newcsum;
1434         int size = 256 + le32_to_cpu(sb->max_dev)*2;
1435         __le32 *isuper = (__le32*)sb;
1436         int i;
1437
1438         disk_csum = sb->sb_csum;
1439         sb->sb_csum = 0;
1440         newcsum = 0;
1441         for (i=0; size>=4; size -= 4 )
1442                 newcsum += le32_to_cpu(*isuper++);
1443
1444         if (size == 2)
1445                 newcsum += le16_to_cpu(*(__le16*) isuper);
1446
1447         csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1448         sb->sb_csum = disk_csum;
1449         return cpu_to_le32(csum);
1450 }
1451
1452 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1453                             int acknowledged);
1454 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1455 {
1456         struct mdp_superblock_1 *sb;
1457         int ret;
1458         sector_t sb_start;
1459         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1460         int bmask;
1461
1462         /*
1463          * Calculate the position of the superblock in 512byte sectors.
1464          * It is always aligned to a 4K boundary and
1465          * depending on minor_version, it can be:
1466          * 0: At least 8K, but less than 12K, from end of device
1467          * 1: At start of device
1468          * 2: 4K from start of device.
1469          */
1470         switch(minor_version) {
1471         case 0:
1472                 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1473                 sb_start -= 8*2;
1474                 sb_start &= ~(sector_t)(4*2-1);
1475                 break;
1476         case 1:
1477                 sb_start = 0;
1478                 break;
1479         case 2:
1480                 sb_start = 8;
1481                 break;
1482         default:
1483                 return -EINVAL;
1484         }
1485         rdev->sb_start = sb_start;
1486
1487         /* superblock is rarely larger than 1K, but it can be larger,
1488          * and it is safe to read 4k, so we do that
1489          */
1490         ret = read_disk_sb(rdev, 4096);
1491         if (ret) return ret;
1492
1493
1494         sb = page_address(rdev->sb_page);
1495
1496         if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1497             sb->major_version != cpu_to_le32(1) ||
1498             le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1499             le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1500             (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1501                 return -EINVAL;
1502
1503         if (calc_sb_1_csum(sb) != sb->sb_csum) {
1504                 printk("md: invalid superblock checksum on %s\n",
1505                         bdevname(rdev->bdev,b));
1506                 return -EINVAL;
1507         }
1508         if (le64_to_cpu(sb->data_size) < 10) {
1509                 printk("md: data_size too small on %s\n",
1510                        bdevname(rdev->bdev,b));
1511                 return -EINVAL;
1512         }
1513
1514         rdev->preferred_minor = 0xffff;
1515         rdev->data_offset = le64_to_cpu(sb->data_offset);
1516         atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1517
1518         rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1519         bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1520         if (rdev->sb_size & bmask)
1521                 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1522
1523         if (minor_version
1524             && rdev->data_offset < sb_start + (rdev->sb_size/512))
1525                 return -EINVAL;
1526
1527         if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1528                 rdev->desc_nr = -1;
1529         else
1530                 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1531
1532         if (!rdev->bb_page) {
1533                 rdev->bb_page = alloc_page(GFP_KERNEL);
1534                 if (!rdev->bb_page)
1535                         return -ENOMEM;
1536         }
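        /*
         * On-disk bad block log (at most one page): each entry is a
         * little-endian u64 holding the length in the low 10 bits and the
         * start sector in the high bits, both scaled by bblog_shift; an
         * all-ones entry terminates the list.
         */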
1537         if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538             rdev->badblocks.count == 0) {
1539                 /* need to load the bad block list.
1540                  * Currently we limit it to one page.
1541                  */
1542                 s32 offset;
1543                 sector_t bb_sector;
1544                 u64 *bbp;
1545                 int i;
1546                 int sectors = le16_to_cpu(sb->bblog_size);
1547                 if (sectors > (PAGE_SIZE / 512))
1548                         return -EINVAL;
1549                 offset = le32_to_cpu(sb->bblog_offset);
1550                 if (offset == 0)
1551                         return -EINVAL;
1552                 bb_sector = (long long)offset;
1553                 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554                                   rdev->bb_page, READ, true))
1555                         return -EIO;
1556                 bbp = (u64 *)page_address(rdev->bb_page);
1557                 rdev->badblocks.shift = sb->bblog_shift;
1558                 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559                         u64 bb = le64_to_cpu(*bbp);
1560                         int count = bb & (0x3ff);
1561                         u64 sector = bb >> 10;
1562                         sector <<= sb->bblog_shift;
1563                         count <<= sb->bblog_shift;
1564                         if (bb + 1 == 0)
1565                                 break;
1566                         if (md_set_badblocks(&rdev->badblocks,
1567                                              sector, count, 1) == 0)
1568                                 return -EINVAL;
1569                 }
1570         } else if (sb->bblog_offset == 0)
1571                 rdev->badblocks.shift = -1;
1572
1573         if (!refdev) {
1574                 ret = 1;
1575         } else {
1576                 __u64 ev1, ev2;
1577                 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1578
1579                 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1580                     sb->level != refsb->level ||
1581                     sb->layout != refsb->layout ||
1582                     sb->chunksize != refsb->chunksize) {
1583                         printk(KERN_WARNING "md: %s has strangely different"
1584                                 " superblock to %s\n",
1585                                 bdevname(rdev->bdev,b),
1586                                 bdevname(refdev->bdev,b2));
1587                         return -EINVAL;
1588                 }
1589                 ev1 = le64_to_cpu(sb->events);
1590                 ev2 = le64_to_cpu(refsb->events);
1591
1592                 if (ev1 > ev2)
1593                         ret = 1;
1594                 else
1595                         ret = 0;
1596         }
1597         if (minor_version)
1598                 rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
1599                         le64_to_cpu(sb->data_offset);
1600         else
1601                 rdev->sectors = rdev->sb_start;
1602         if (rdev->sectors < le64_to_cpu(sb->data_size))
1603                 return -EINVAL;
1604         rdev->sectors = le64_to_cpu(sb->data_size);
1605         if (le64_to_cpu(sb->size) > rdev->sectors)
1606                 return -EINVAL;
1607         return ret;
1608 }
1609
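/*
 * Validate @rdev against (or into) @mddev.  For a not-yet-configured
 * array (raid_disks == 0) this copies geometry, uuid, bitmap and reshape
 * information out of the superblock; otherwise it checks the event count
 * and then derives the rdev's role and In_sync/Faulty/WriteMostly flags
 * from dev_roles[].
 */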
1610 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1611 {
1612         struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1613         __u64 ev1 = le64_to_cpu(sb->events);
1614
1615         rdev->raid_disk = -1;
1616         clear_bit(Faulty, &rdev->flags);
1617         clear_bit(In_sync, &rdev->flags);
1618         clear_bit(WriteMostly, &rdev->flags);
1619
1620         if (mddev->raid_disks == 0) {
1621                 mddev->major_version = 1;
1622                 mddev->patch_version = 0;
1623                 mddev->external = 0;
1624                 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1625                 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1626                 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1627                 mddev->level = le32_to_cpu(sb->level);
1628                 mddev->clevel[0] = 0;
1629                 mddev->layout = le32_to_cpu(sb->layout);
1630                 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1631                 mddev->dev_sectors = le64_to_cpu(sb->size);
1632                 mddev->events = ev1;
1633                 mddev->bitmap_info.offset = 0;
1634                 mddev->bitmap_info.default_offset = 1024 >> 9;
1635                 
1636                 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1637                 memcpy(mddev->uuid, sb->set_uuid, 16);
1638
1639                 mddev->max_disks =  (4096-256)/2;
1640
1641                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1642                     mddev->bitmap_info.file == NULL )
1643                         mddev->bitmap_info.offset =
1644                                 (__s32)le32_to_cpu(sb->bitmap_offset);
1645
1646                 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1647                         mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1648                         mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1649                         mddev->new_level = le32_to_cpu(sb->new_level);
1650                         mddev->new_layout = le32_to_cpu(sb->new_layout);
1651                         mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1652                 } else {
1653                         mddev->reshape_position = MaxSector;
1654                         mddev->delta_disks = 0;
1655                         mddev->new_level = mddev->level;
1656                         mddev->new_layout = mddev->layout;
1657                         mddev->new_chunk_sectors = mddev->chunk_sectors;
1658                 }
1659
1660         } else if (mddev->pers == NULL) {
1661         /* Insist on a good event counter while assembling, except for
1662          * spares (which don't need an event count) */
1663                 ++ev1;
1664                 if (rdev->desc_nr >= 0 &&
1665                     rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1666                     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1667                         if (ev1 < mddev->events)
1668                                 return -EINVAL;
1669         } else if (mddev->bitmap) {
1670                 /* If adding to array with a bitmap, then we can accept an
1671                  * older device, but not too old.
1672                  */
1673                 if (ev1 < mddev->bitmap->events_cleared)
1674                         return 0;
1675         } else {
1676                 if (ev1 < mddev->events)
1677                         /* just a hot-add of a new device, leave raid_disk at -1 */
1678                         return 0;
1679         }
1680         if (mddev->level != LEVEL_MULTIPATH) {
1681                 int role;
1682                 if (rdev->desc_nr < 0 ||
1683                     rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1684                         role = 0xffff;
1685                         rdev->desc_nr = -1;
1686                 } else
1687                         role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1688                 switch(role) {
1689                 case 0xffff: /* spare */
1690                         break;
1691                 case 0xfffe: /* faulty */
1692                         set_bit(Faulty, &rdev->flags);
1693                         break;
1694                 default:
1695                         if ((le32_to_cpu(sb->feature_map) &
1696                              MD_FEATURE_RECOVERY_OFFSET))
1697                                 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1698                         else
1699                                 set_bit(In_sync, &rdev->flags);
1700                         rdev->raid_disk = role;
1701                         break;
1702                 }
1703                 if (sb->devflags & WriteMostly1)
1704                         set_bit(WriteMostly, &rdev->flags);
1705         } else /* MULTIPATH are always insync */
1706                 set_bit(In_sync, &rdev->flags);
1707
1708         return 0;
1709 }
1710
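/*
 * Rebuild the in-memory v1 superblock image for @rdev from current mddev
 * and rdev state (geometry, event count, bitmap offset, reshape and
 * recovery progress, bad-block log, dev_roles table) and recompute the
 * checksum, ready for md_super_write().
 */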
1711 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1712 {
1713         struct mdp_superblock_1 *sb;
1714         mdk_rdev_t *rdev2;
1715         int max_dev, i;
1716         /* make rdev->sb match mddev and rdev data. */
1717
1718         sb = page_address(rdev->sb_page);
1719
1720         sb->feature_map = 0;
1721         sb->pad0 = 0;
1722         sb->recovery_offset = cpu_to_le64(0);
1723         memset(sb->pad1, 0, sizeof(sb->pad1));
1724         memset(sb->pad3, 0, sizeof(sb->pad3));
1725
1726         sb->utime = cpu_to_le64((__u64)mddev->utime);
1727         sb->events = cpu_to_le64(mddev->events);
1728         if (mddev->in_sync)
1729                 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1730         else
1731                 sb->resync_offset = cpu_to_le64(0);
1732
1733         sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1734
1735         sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1736         sb->size = cpu_to_le64(mddev->dev_sectors);
1737         sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1738         sb->level = cpu_to_le32(mddev->level);
1739         sb->layout = cpu_to_le32(mddev->layout);
1740
1741         if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1742                 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1743                 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1744         }
1745
1746         if (rdev->raid_disk >= 0 &&
1747             !test_bit(In_sync, &rdev->flags)) {
1748                 sb->feature_map |=
1749                         cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1750                 sb->recovery_offset =
1751                         cpu_to_le64(rdev->recovery_offset);
1752         }
1753
1754         if (mddev->reshape_position != MaxSector) {
1755                 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1756                 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1757                 sb->new_layout = cpu_to_le32(mddev->new_layout);
1758                 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1759                 sb->new_level = cpu_to_le32(mddev->new_level);
1760                 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1761         }
1762
1763         if (rdev->badblocks.count == 0)
1764                 /* Nothing to do for bad blocks*/ ;
1765         else if (sb->bblog_offset == 0)
1766                 /* Cannot record bad blocks on this device */
1767                 md_error(mddev, rdev);
1768         else {
1769                 struct badblocks *bb = &rdev->badblocks;
1770                 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1771                 u64 *p = bb->page;
1772                 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1773                 if (bb->changed) {
1774                         unsigned seq;
1775
1776 retry:
1777                         seq = read_seqbegin(&bb->lock);
1778
1779                         memset(bbp, 0xff, PAGE_SIZE);
1780
1781                         for (i = 0 ; i < bb->count ; i++) {
1782                                 u64 internal_bb = *p++;
1783                                 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1784                                                 | BB_LEN(internal_bb));
1785                                 *bbp++ = cpu_to_le64(store_bb);
1786                         }
1787                         if (read_seqretry(&bb->lock, seq))
1788                                 goto retry;
1789
1790                         bb->sector = (rdev->sb_start +
1791                                       (int)le32_to_cpu(sb->bblog_offset));
1792                         bb->size = le16_to_cpu(sb->bblog_size);
1793                         bb->changed = 0;
1794                 }
1795         }
1796
1797         max_dev = 0;
1798         list_for_each_entry(rdev2, &mddev->disks, same_set)
1799                 if (rdev2->desc_nr+1 > max_dev)
1800                         max_dev = rdev2->desc_nr+1;
1801
1802         if (max_dev > le32_to_cpu(sb->max_dev)) {
1803                 int bmask;
1804                 sb->max_dev = cpu_to_le32(max_dev);
1805                 rdev->sb_size = max_dev * 2 + 256;
1806                 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1807                 if (rdev->sb_size & bmask)
1808                         rdev->sb_size = (rdev->sb_size | bmask) + 1;
1809         } else
1810                 max_dev = le32_to_cpu(sb->max_dev);
1811
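        /*
         * Rebuild the role table: every slot defaults to 0xfffe (faulty or
         * unused); each member device then overwrites its own entry with its
         * raid_disk number, or 0xffff if it is a spare.
         */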
1812         for (i=0; i<max_dev;i++)
1813                 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1814         
1815         list_for_each_entry(rdev2, &mddev->disks, same_set) {
1816                 i = rdev2->desc_nr;
1817                 if (test_bit(Faulty, &rdev2->flags))
1818                         sb->dev_roles[i] = cpu_to_le16(0xfffe);
1819                 else if (test_bit(In_sync, &rdev2->flags))
1820                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1821                 else if (rdev2->raid_disk >= 0)
1822                         sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1823                 else
1824                         sb->dev_roles[i] = cpu_to_le16(0xffff);
1825         }
1826
1827         sb->sb_csum = calc_sb_1_csum(sb);
1828 }
1829
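/*
 * rdev_size_change for version-1 superblocks: returns the accepted size
 * in sectors, or 0 if resizing is not possible (new size smaller than the
 * array, or an end-of-device superblock that cannot move because of the
 * bitmap).
 */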
1830 static unsigned long long
1831 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1832 {
1833         struct mdp_superblock_1 *sb;
1834         sector_t max_sectors;
1835         if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1836                 return 0; /* component must fit device */
1837         if (rdev->sb_start < rdev->data_offset) {
1838                 /* minor versions 1 and 2; superblock before data */
1839                 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1840                 max_sectors -= rdev->data_offset;
1841                 if (!num_sectors || num_sectors > max_sectors)
1842                         num_sectors = max_sectors;
1843         } else if (rdev->mddev->bitmap_info.offset) {
1844                 /* minor version 0 with bitmap we can't move */
1845                 return 0;
1846         } else {
1847                 /* minor version 0; superblock after data */
1848                 sector_t sb_start;
1849                 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1850                 sb_start &= ~(sector_t)(4*2 - 1);
1851                 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1852                 if (!num_sectors || num_sectors > max_sectors)
1853                         num_sectors = max_sectors;
1854                 rdev->sb_start = sb_start;
1855         }
1856         sb = page_address(rdev->sb_page);
1857         sb->data_size = cpu_to_le64(num_sectors);
1858         sb->super_offset = rdev->sb_start;
1859         sb->sb_csum = calc_sb_1_csum(sb);
1860         md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1861                        rdev->sb_page);
1862         md_super_wait(rdev->mddev);
1863         return num_sectors;
1864 }
1865
1866 static struct super_type super_types[] = {
1867         [0] = {
1868                 .name   = "0.90.0",
1869                 .owner  = THIS_MODULE,
1870                 .load_super         = super_90_load,
1871                 .validate_super     = super_90_validate,
1872                 .sync_super         = super_90_sync,
1873                 .rdev_size_change   = super_90_rdev_size_change,
1874         },
1875         [1] = {
1876                 .name   = "md-1",
1877                 .owner  = THIS_MODULE,
1878                 .load_super         = super_1_load,
1879                 .validate_super     = super_1_validate,
1880                 .sync_super         = super_1_sync,
1881                 .rdev_size_change   = super_1_rdev_size_change,
1882         },
1883 };
1884
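/*
 * Refresh the superblock image for @rdev: arrays that supply their own
 * ->sync_super (e.g. externally managed metadata) use that, otherwise
 * dispatch through super_types[] by major version.
 */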
1885 static void sync_super(mddev_t *mddev, mdk_rdev_t *rdev)
1886 {
1887         if (mddev->sync_super) {
1888                 mddev->sync_super(mddev, rdev);
1889                 return;
1890         }
1891
1892         BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1893
1894         super_types[mddev->major_version].sync_super(mddev, rdev);
1895 }
1896
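/*
 * Return 1 if the two arrays have component devices that share the same
 * underlying whole disk (bd_contains matches), 0 otherwise.
 */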
1897 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1898 {
1899         mdk_rdev_t *rdev, *rdev2;
1900
1901         rcu_read_lock();
1902         rdev_for_each_rcu(rdev, mddev1)
1903                 rdev_for_each_rcu(rdev2, mddev2)
1904                         if (rdev->bdev->bd_contains ==
1905                             rdev2->bdev->bd_contains) {
1906                                 rcu_read_unlock();
1907                                 return 1;
1908                         }
1909         rcu_read_unlock();
1910         return 0;
1911 }
1912
1913 static LIST_HEAD(pending_raid_disks);
1914
1915 /*
1916  * Try to register data integrity profile for an mddev
1917  *
1918  * This is called when an array is started and after a disk has been kicked
1919  * from the array. It only succeeds if all working and active component devices
1920  * are integrity capable with matching profiles.
1921  */
1922 int md_integrity_register(mddev_t *mddev)
1923 {
1924         mdk_rdev_t *rdev, *reference = NULL;
1925
1926         if (list_empty(&mddev->disks))
1927                 return 0; /* nothing to do */
1928         if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1929                 return 0; /* shouldn't register, or already is */
1930         list_for_each_entry(rdev, &mddev->disks, same_set) {
1931                 /* skip spares and non-functional disks */
1932                 if (test_bit(Faulty, &rdev->flags))
1933                         continue;
1934                 if (rdev->raid_disk < 0)
1935                         continue;
1936                 if (!reference) {
1937                         /* Use the first rdev as the reference */
1938                         reference = rdev;
1939                         continue;
1940                 }
1941                 /* does this rdev's profile match the reference profile? */
1942                 if (blk_integrity_compare(reference->bdev->bd_disk,
1943                                 rdev->bdev->bd_disk) < 0)
1944                         return -EINVAL;
1945         }
1946         if (!reference || !bdev_get_integrity(reference->bdev))
1947                 return 0;
1948         /*
1949          * All component devices are integrity capable and have matching
1950          * profiles, register the common profile for the md device.
1951          */
1952         if (blk_integrity_register(mddev->gendisk,
1953                         bdev_get_integrity(reference->bdev)) != 0) {
1954                 printk(KERN_ERR "md: failed to register integrity for %s\n",
1955                         mdname(mddev));
1956                 return -EINVAL;
1957         }
1958         printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1959         if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1960                 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1961                        mdname(mddev));
1962                 return -EINVAL;
1963         }
1964         return 0;
1965 }
1966 EXPORT_SYMBOL(md_integrity_register);
1967
1968 /* Disable data integrity if non-capable/non-matching disk is being added */
1969 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1970 {
1971         struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1972         struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1973
1974         if (!bi_mddev) /* nothing to do */
1975                 return;
1976         if (rdev->raid_disk < 0) /* skip spares */
1977                 return;
1978         if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1979                                              rdev->bdev->bd_disk) >= 0)
1980                 return;
1981         printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1982         blk_integrity_unregister(mddev->gendisk);
1983 }
1984 EXPORT_SYMBOL(md_integrity_add_rdev);
1985
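/*
 * Attach @rdev to @mddev: reject duplicates, check the device is large
 * enough, pick a free desc_nr if none was assigned, create the
 * "dev-<name>" kobject and its "block" symlink, and add the device to the
 * array's disk list.
 */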
1986 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1987 {
1988         char b[BDEVNAME_SIZE];
1989         struct kobject *ko;
1990         char *s;
1991         int err;
1992
1993         if (rdev->mddev) {
1994                 MD_BUG();
1995                 return -EINVAL;
1996         }
1997
1998         /* prevent duplicates */
1999         if (find_rdev(mddev, rdev->bdev->bd_dev))
2000                 return -EEXIST;
2001
2002         /* make sure rdev->sectors exceeds mddev->dev_sectors */
2003         if (rdev->sectors && (mddev->dev_sectors == 0 ||
2004                         rdev->sectors < mddev->dev_sectors)) {
2005                 if (mddev->pers) {
2006                         /* Cannot change size, so fail
2007                          * If mddev->level <= 0, then we don't care
2008                          * about aligning sizes (e.g. linear)
2009                          */
2010                         if (mddev->level > 0)
2011                                 return -ENOSPC;
2012                 } else
2013                         mddev->dev_sectors = rdev->sectors;
2014         }
2015
2016         /* Verify rdev->desc_nr is unique.
2017          * If it is -1, assign a free number, else
2018          * check that the number is not already in use
2019          */
2020         if (rdev->desc_nr < 0) {
2021                 int choice = 0;
2022                 if (mddev->pers) choice = mddev->raid_disks;
2023                 while (find_rdev_nr(mddev, choice))
2024                         choice++;
2025                 rdev->desc_nr = choice;
2026         } else {
2027                 if (find_rdev_nr(mddev, rdev->desc_nr))
2028                         return -EBUSY;
2029         }
2030         if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2031                 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2032                        mdname(mddev), mddev->max_disks);
2033                 return -EBUSY;
2034         }
2035         bdevname(rdev->bdev,b);
2036         while ( (s=strchr(b, '/')) != NULL)
2037                 *s = '!';
2038
2039         rdev->mddev = mddev;
2040         printk(KERN_INFO "md: bind<%s>\n", b);
2041
2042         if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2043                 goto fail;
2044
2045         ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2046         if (sysfs_create_link(&rdev->kobj, ko, "block"))
2047                 /* failure here is OK */;
2048         rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2049
2050         list_add_rcu(&rdev->same_set, &mddev->disks);
2051         bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2052
2053         /* May as well allow recovery to be retried once */
2054         mddev->recovery_disabled++;
2055
2056         return 0;
2057
2058  fail:
2059         printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2060                b, mdname(mddev));
2061         return err;
2062 }
2063
2064 static void md_delayed_delete(struct work_struct *ws)
2065 {
2066         mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
2067         kobject_del(&rdev->kobj);
2068         kobject_put(&rdev->kobj);
2069 }
2070
2071 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
2072 {
2073         char b[BDEVNAME_SIZE];
2074         if (!rdev->mddev) {
2075                 MD_BUG();
2076                 return;
2077         }
2078         bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2079         list_del_rcu(&rdev->same_set);
2080         printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2081         rdev->mddev = NULL;
2082         sysfs_remove_link(&rdev->kobj, "block");
2083         sysfs_put(rdev->sysfs_state);
2084         rdev->sysfs_state = NULL;
2085         kfree(rdev->badblocks.page);
2086         rdev->badblocks.count = 0;
2087         rdev->badblocks.page = NULL;
2088         /* We need to delay this, otherwise we can deadlock when
2089          * writing 'remove' to "dev/state".  We also need
2090          * to delay it due to rcu usage.
2091          */
2092         synchronize_rcu();
2093         INIT_WORK(&rdev->del_work, md_delayed_delete);
2094         kobject_get(&rdev->kobj);
2095         queue_work(md_misc_wq, &rdev->del_work);
2096 }
2097
2098 /*
2099  * prevent the device from being mounted, repartitioned or
2100  * otherwise reused by a RAID array (or any other kernel
2101  * subsystem), by bd_claiming the device.
2102  */
2103 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
2104 {
2105         int err = 0;
2106         struct block_device *bdev;
2107         char b[BDEVNAME_SIZE];
2108
2109         bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2110                                  shared ? (mdk_rdev_t *)lock_rdev : rdev);
2111         if (IS_ERR(bdev)) {
2112                 printk(KERN_ERR "md: could not open %s.\n",
2113                         __bdevname(dev, b));
2114                 return PTR_ERR(bdev);
2115         }
2116         rdev->bdev = bdev;
2117         return err;
2118 }
2119
2120 static void unlock_rdev(mdk_rdev_t *rdev)
2121 {
2122         struct block_device *bdev = rdev->bdev;
2123         rdev->bdev = NULL;
2124         if (!bdev)
2125                 MD_BUG();
2126         blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2127 }
2128
2129 void md_autodetect_dev(dev_t dev);
2130
2131 static void export_rdev(mdk_rdev_t * rdev)
2132 {
2133         char b[BDEVNAME_SIZE];
2134         printk(KERN_INFO "md: export_rdev(%s)\n",
2135                 bdevname(rdev->bdev,b));
2136         if (rdev->mddev)
2137                 MD_BUG();
2138         free_disk_sb(rdev);
2139 #ifndef MODULE
2140         if (test_bit(AutoDetected, &rdev->flags))
2141                 md_autodetect_dev(rdev->bdev->bd_dev);
2142 #endif
2143         unlock_rdev(rdev);
2144         kobject_put(&rdev->kobj);
2145 }
2146
2147 static void kick_rdev_from_array(mdk_rdev_t * rdev)
2148 {
2149         unbind_rdev_from_array(rdev);
2150         export_rdev(rdev);
2151 }
2152
2153 static void export_array(mddev_t *mddev)
2154 {
2155         mdk_rdev_t *rdev, *tmp;
2156
2157         rdev_for_each(rdev, tmp, mddev) {
2158                 if (!rdev->mddev) {
2159                         MD_BUG();
2160                         continue;
2161                 }
2162                 kick_rdev_from_array(rdev);
2163         }
2164         if (!list_empty(&mddev->disks))
2165                 MD_BUG();
2166         mddev->raid_disks = 0;
2167         mddev->major_version = 0;
2168 }
2169
2170 static void print_desc(mdp_disk_t *desc)
2171 {
2172         printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
2173                 desc->major,desc->minor,desc->raid_disk,desc->state);
2174 }
2175
2176 static void print_sb_90(mdp_super_t *sb)
2177 {
2178         int i;
2179
2180         printk(KERN_INFO 
2181                 "md:  SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2182                 sb->major_version, sb->minor_version, sb->patch_version,
2183                 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2184                 sb->ctime);
2185         printk(KERN_INFO "md:     L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
2186                 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
2187                 sb->md_minor, sb->layout, sb->chunk_size);
2188         printk(KERN_INFO "md:     UT:%08x ST:%d AD:%d WD:%d"
2189                 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
2190                 sb->utime, sb->state, sb->active_disks, sb->working_disks,
2191                 sb->failed_disks, sb->spare_disks,
2192                 sb->sb_csum, (unsigned long)sb->events_lo);
2193
2194         printk(KERN_INFO);
2195         for (i = 0; i < MD_SB_DISKS; i++) {
2196                 mdp_disk_t *desc;
2197
2198                 desc = sb->disks + i;
2199                 if (desc->number || desc->major || desc->minor ||
2200                     desc->raid_disk || (desc->state && (desc->state != 4))) {
2201                         printk("     D %2d: ", i);
2202                         print_desc(desc);
2203                 }
2204         }
2205         printk(KERN_INFO "md:     THIS: ");
2206         print_desc(&sb->this_disk);
2207 }
2208
2209 static void print_sb_1(struct mdp_superblock_1 *sb)
2210 {
2211         __u8 *uuid;
2212
2213         uuid = sb->set_uuid;
2214         printk(KERN_INFO
2215                "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
2216                "md:    Name: \"%s\" CT:%llu\n",
2217                 le32_to_cpu(sb->major_version),
2218                 le32_to_cpu(sb->feature_map),
2219                 uuid,
2220                 sb->set_name,
2221                 (unsigned long long)le64_to_cpu(sb->ctime)
2222                        & MD_SUPERBLOCK_1_TIME_SEC_MASK);
2223
2224         uuid = sb->device_uuid;
2225         printk(KERN_INFO
2226                "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
2227                         " RO:%llu\n"
2228                "md:     Dev:%08x UUID: %pU\n"
2229                "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
2230                "md:         (MaxDev:%u) \n",
2231                 le32_to_cpu(sb->level),
2232                 (unsigned long long)le64_to_cpu(sb->size),
2233                 le32_to_cpu(sb->raid_disks),
2234                 le32_to_cpu(sb->layout),
2235                 le32_to_cpu(sb->chunksize),
2236                 (unsigned long long)le64_to_cpu(sb->data_offset),
2237                 (unsigned long long)le64_to_cpu(sb->data_size),
2238                 (unsigned long long)le64_to_cpu(sb->super_offset),
2239                 (unsigned long long)le64_to_cpu(sb->recovery_offset),
2240                 le32_to_cpu(sb->dev_number),
2241                 uuid,
2242                 sb->devflags,
2243                 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
2244                 (unsigned long long)le64_to_cpu(sb->events),
2245                 (unsigned long long)le64_to_cpu(sb->resync_offset),
2246                 le32_to_cpu(sb->sb_csum),
2247                 le32_to_cpu(sb->max_dev)
2248                 );
2249 }
2250
2251 static void print_rdev(mdk_rdev_t *rdev, int major_version)
2252 {
2253         char b[BDEVNAME_SIZE];
2254         printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
2255                 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
2256                 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
2257                 rdev->desc_nr);
2258         if (rdev->sb_loaded) {
2259                 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2260                 switch (major_version) {
2261                 case 0:
2262                         print_sb_90(page_address(rdev->sb_page));
2263                         break;
2264                 case 1:
2265                         print_sb_1(page_address(rdev->sb_page));
2266                         break;
2267                 }
2268         } else
2269                 printk(KERN_INFO "md: no rdev superblock!\n");
2270 }
2271
2272 static void md_print_devices(void)
2273 {
2274         struct list_head *tmp;
2275         mdk_rdev_t *rdev;
2276         mddev_t *mddev;
2277         char b[BDEVNAME_SIZE];
2278
2279         printk("\n");
2280         printk("md:     **********************************\n");
2281         printk("md:     * <COMPLETE RAID STATE PRINTOUT> *\n");
2282         printk("md:     **********************************\n");
2283         for_each_mddev(mddev, tmp) {
2284
2285                 if (mddev->bitmap)
2286                         bitmap_print_sb(mddev->bitmap);
2287                 else
2288                         printk("%s: ", mdname(mddev));
2289                 list_for_each_entry(rdev, &mddev->disks, same_set)
2290                         printk("<%s>", bdevname(rdev->bdev,b));
2291                 printk("\n");
2292
2293                 list_for_each_entry(rdev, &mddev->disks, same_set)
2294                         print_rdev(rdev, mddev->major_version);
2295         }
2296         printk("md:     **********************************\n");
2297         printk("\n");
2298 }
2299
2300
2301 static void sync_sbs(mddev_t * mddev, int nospares)
2302 {
2303         /* Update each superblock (in-memory image), but
2304          * if we are allowed to, skip spares which already
2305          * have the right event counter, or have one earlier
2306          * (which would mean they aren't being marked as dirty
2307          * with the rest of the array)
2308          */
2309         mdk_rdev_t *rdev;
2310         list_for_each_entry(rdev, &mddev->disks, same_set) {
2311                 if (rdev->sb_events == mddev->events ||
2312                     (nospares &&
2313                      rdev->raid_disk < 0 &&
2314                      rdev->sb_events+1 == mddev->events)) {
2315                         /* Don't update this superblock */
2316                         rdev->sb_loaded = 2;
2317                 } else {
2318                         sync_super(mddev, rdev);
2319                         rdev->sb_loaded = 1;
2320                 }
2321         }
2322 }
2323
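/*
 * Write the superblocks out to the members of the array: bump (or, for a
 * pure clean<->dirty transition, possibly roll back) the event count,
 * refresh the in-memory images via sync_sbs(), then write each non-faulty
 * device's superblock and any changed bad-block log.  If the array state
 * changed while writing, the whole sequence is repeated.
 */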
2324 static void md_update_sb(mddev_t * mddev, int force_change)
2325 {
2326         mdk_rdev_t *rdev;
2327         int sync_req;
2328         int nospares = 0;
2329         int any_badblocks_changed = 0;
2330
2331 repeat:
2332         /* First make sure individual recovery_offsets are correct */
2333         list_for_each_entry(rdev, &mddev->disks, same_set) {
2334                 if (rdev->raid_disk >= 0 &&
2335                     mddev->delta_disks >= 0 &&
2336                     !test_bit(In_sync, &rdev->flags) &&
2337                     mddev->curr_resync_completed > rdev->recovery_offset)
2338                                 rdev->recovery_offset = mddev->curr_resync_completed;
2339
2340         }       
2341         if (!mddev->persistent) {
2342                 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2343                 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2344                 if (!mddev->external) {
2345                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346                         list_for_each_entry(rdev, &mddev->disks, same_set) {
2347                                 if (rdev->badblocks.changed) {
2348                                         md_ack_all_badblocks(&rdev->badblocks);
2349                                         md_error(mddev, rdev);
2350                                 }
2351                                 clear_bit(Blocked, &rdev->flags);
2352                                 clear_bit(BlockedBadBlocks, &rdev->flags);
2353                                 wake_up(&rdev->blocked_wait);
2354                         }
2355                 }
2356                 wake_up(&mddev->sb_wait);
2357                 return;
2358         }
2359
2360         spin_lock_irq(&mddev->write_lock);
2361
2362         mddev->utime = get_seconds();
2363
2364         if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2365                 force_change = 1;
2366         if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2367                 /* just a clean <-> dirty transition; possibly leave spares alone,
2368                  * though if 'events' isn't the right even/odd value, we will have
2369                  * to update the spares after all
2370                  */
2371                 nospares = 1;
2372         if (force_change)
2373                 nospares = 0;
2374         if (mddev->degraded)
2375                 /* If the array is degraded, then skipping spares is both
2376                  * dangerous and fairly pointless.
2377                  * Dangerous because a device that was removed from the array
2378                  * might have an event_count that still looks up-to-date,
2379                  * so it can be re-added without a resync.
2380                  * Pointless because if there are any spares to skip,
2381                  * then a recovery will happen and soon that array won't
2382                  * be degraded any more and the spare can go back to sleep then.
2383                  */
2384                 nospares = 0;
2385
2386         sync_req = mddev->in_sync;
2387
2388         /* If this is just a dirty<->clean transition, and the array is clean
2389          * and 'events' is odd, we can roll back to the previous clean state */
2390         if (nospares
2391             && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2392             && mddev->can_decrease_events
2393             && mddev->events != 1) {
2394                 mddev->events--;
2395                 mddev->can_decrease_events = 0;
2396         } else {
2397                 /* otherwise we have to go forward and ... */
2398                 mddev->events ++;
2399                 mddev->can_decrease_events = nospares;
2400         }
2401
2402         if (!mddev->events) {
2403                 /*
2404                  * oops, this 64-bit counter should never wrap.
2405                  * Either we are in around ~1 trillion A.C., assuming
2406                  * 1 reboot per second, or we have a bug:
2407                  */
2408                 MD_BUG();
2409                 mddev->events --;
2410         }
2411
2412         list_for_each_entry(rdev, &mddev->disks, same_set) {
2413                 if (rdev->badblocks.changed)
2414                         any_badblocks_changed++;
2415                 if (test_bit(Faulty, &rdev->flags))
2416                         set_bit(FaultRecorded, &rdev->flags);
2417         }
2418
2419         sync_sbs(mddev, nospares);
2420         spin_unlock_irq(&mddev->write_lock);
2421
2422         dprintk(KERN_INFO 
2423                 "md: updating %s RAID superblock on device (in sync %d)\n",
2424                 mdname(mddev),mddev->in_sync);
2425
2426         bitmap_update_sb(mddev->bitmap);
2427         list_for_each_entry(rdev, &mddev->disks, same_set) {
2428                 char b[BDEVNAME_SIZE];
2429                 dprintk(KERN_INFO "md: ");
2430                 if (rdev->sb_loaded != 1)
2431                         continue; /* no noise on spare devices */
2432                 if (test_bit(Faulty, &rdev->flags))
2433                         dprintk("(skipping faulty ");
2434
2435                 dprintk("%s ", bdevname(rdev->bdev,b));
2436                 if (!test_bit(Faulty, &rdev->flags)) {
2437                         md_super_write(mddev,rdev,
2438                                        rdev->sb_start, rdev->sb_size,
2439                                        rdev->sb_page);
2440                         dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2441                                 bdevname(rdev->bdev,b),
2442                                 (unsigned long long)rdev->sb_start);
2443                         rdev->sb_events = mddev->events;
2444                         if (rdev->badblocks.size) {
2445                                 md_super_write(mddev, rdev,
2446                                                rdev->badblocks.sector,
2447                                                rdev->badblocks.size << 9,
2448                                                rdev->bb_page);
2449                                 rdev->badblocks.size = 0;
2450                         }
2451
2452                 } else
2453                         dprintk(")\n");
2454                 if (mddev->level == LEVEL_MULTIPATH)
2455                         /* only need to write one superblock... */
2456                         break;
2457         }
2458         md_super_wait(mddev);
2459         /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2460
2461         spin_lock_irq(&mddev->write_lock);
2462         if (mddev->in_sync != sync_req ||
2463             test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2464                 /* have to write it out again */
2465                 spin_unlock_irq(&mddev->write_lock);
2466                 goto repeat;
2467         }
2468         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2469         spin_unlock_irq(&mddev->write_lock);
2470         wake_up(&mddev->sb_wait);
2471         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2472                 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2473
2474         list_for_each_entry(rdev, &mddev->disks, same_set) {
2475                 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476                         clear_bit(Blocked, &rdev->flags);
2477
2478                 if (any_badblocks_changed)
2479                         md_ack_all_badblocks(&rdev->badblocks);
2480                 clear_bit(BlockedBadBlocks, &rdev->flags);
2481                 wake_up(&rdev->blocked_wait);
2482         }
2483 }
2484
2485 /* words written to sysfs files may, or may not, be \n terminated.
2486  * We want to accept either case. For this we use cmd_match.
2487  */
2488 static int cmd_match(const char *cmd, const char *str)
2489 {
2490         /* See if cmd, written into a sysfs file, matches
2491          * str.  They must either be the same, or cmd can
2492          * have a trailing newline
2493          */
2494         while (*cmd && *str && *cmd == *str) {
2495                 cmd++;
2496                 str++;
2497         }
2498         if (*cmd == '\n')
2499                 cmd++;
2500         if (*str || *cmd)
2501                 return 0;
2502         return 1;
2503 }
2504
2505 struct rdev_sysfs_entry {
2506         struct attribute attr;
2507         ssize_t (*show)(mdk_rdev_t *, char *);
2508         ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2509 };
2510
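/*
 * Show the rdev state as a comma-separated list of flag names
 * (faulty, in_sync, write_mostly, blocked, spare, write_error).
 */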
2511 static ssize_t
2512 state_show(mdk_rdev_t *rdev, char *page)
2513 {
2514         char *sep = "";
2515         size_t len = 0;
2516
2517         if (test_bit(Faulty, &rdev->flags) ||
2518             rdev->badblocks.unacked_exist) {
2519                 len+= sprintf(page+len, "%sfaulty",sep);
2520                 sep = ",";
2521         }
2522         if (test_bit(In_sync, &rdev->flags)) {
2523                 len += sprintf(page+len, "%sin_sync",sep);
2524                 sep = ",";
2525         }
2526         if (test_bit(WriteMostly, &rdev->flags)) {
2527                 len += sprintf(page+len, "%swrite_mostly",sep);
2528                 sep = ",";
2529         }
2530         if (test_bit(Blocked, &rdev->flags) ||
2531             rdev->badblocks.unacked_exist) {
2532                 len += sprintf(page+len, "%sblocked", sep);
2533                 sep = ",";
2534         }
2535         if (!test_bit(Faulty, &rdev->flags) &&
2536             !test_bit(In_sync, &rdev->flags)) {
2537                 len += sprintf(page+len, "%sspare", sep);
2538                 sep = ",";
2539         }
2540         if (test_bit(WriteErrorSeen, &rdev->flags)) {
2541                 len += sprintf(page+len, "%swrite_error", sep);
2542                 sep = ",";
2543         }
2544         return len+sprintf(page+len, "\n");
2545 }
2546
2547 static ssize_t
2548 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2549 {
2550         /* can write
2551          *  faulty  - simulates an error
2552          *  remove  - disconnects the device
2553          *  writemostly - sets write_mostly
2554          *  -writemostly - clears write_mostly
2555          *  blocked - sets the Blocked flag
2556          *  -blocked - clears the Blocked flag and possibly simulates an error
2557          *  insync - sets In_sync provided the device isn't active
2558          *  write_error - sets WriteErrorSeen
2559          *  -write_error - clears WriteErrorSeen
2560          */
2561         int err = -EINVAL;
2562         if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2563                 md_error(rdev->mddev, rdev);
2564                 if (test_bit(Faulty, &rdev->flags))
2565                         err = 0;
2566                 else
2567                         err = -EBUSY;
2568         } else if (cmd_match(buf, "remove")) {
2569                 if (rdev->raid_disk >= 0)
2570                         err = -EBUSY;
2571                 else {
2572                         mddev_t *mddev = rdev->mddev;
2573                         kick_rdev_from_array(rdev);
2574                         if (mddev->pers)
2575                                 md_update_sb(mddev, 1);
2576                         md_new_event(mddev);
2577                         err = 0;
2578                 }
2579         } else if (cmd_match(buf, "writemostly")) {
2580                 set_bit(WriteMostly, &rdev->flags);
2581                 err = 0;
2582         } else if (cmd_match(buf, "-writemostly")) {
2583                 clear_bit(WriteMostly, &rdev->flags);
2584                 err = 0;
2585         } else if (cmd_match(buf, "blocked")) {
2586                 set_bit(Blocked, &rdev->flags);
2587                 err = 0;
2588         } else if (cmd_match(buf, "-blocked")) {
2589                 if (!test_bit(Faulty, &rdev->flags) &&
2590                     test_bit(BlockedBadBlocks, &rdev->flags)) {
2591                         /* metadata handler doesn't understand badblocks,
2592                          * so we need to fail the device
2593                          */
2594                         md_error(rdev->mddev, rdev);
2595                 }
2596                 clear_bit(Blocked, &rdev->flags);
2597                 clear_bit(BlockedBadBlocks, &rdev->flags);
2598                 wake_up(&rdev->blocked_wait);
2599                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2600                 md_wakeup_thread(rdev->mddev->thread);
2601
2602                 err = 0;
2603         } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2604                 set_bit(In_sync, &rdev->flags);
2605                 err = 0;
2606         } else if (cmd_match(buf, "write_error")) {
2607                 set_bit(WriteErrorSeen, &rdev->flags);
2608                 err = 0;
2609         } else if (cmd_match(buf, "-write_error")) {
2610                 clear_bit(WriteErrorSeen, &rdev->flags);
2611                 err = 0;
2612         }
2613         if (!err)
2614                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2615         return err ? err : len;
2616 }
2617 static struct rdev_sysfs_entry rdev_state =
2618 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2619
2620 static ssize_t
2621 errors_show(mdk_rdev_t *rdev, char *page)
2622 {
2623         return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2624 }
2625
2626 static ssize_t
2627 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2628 {
2629         char *e;
2630         unsigned long n = simple_strtoul(buf, &e, 10);
2631         if (*buf && (*e == 0 || *e == '\n')) {
2632                 atomic_set(&rdev->corrected_errors, n);
2633                 return len;
2634         }
2635         return -EINVAL;
2636 }
2637 static struct rdev_sysfs_entry rdev_errors =
2638 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2639
2640 static ssize_t
2641 slot_show(mdk_rdev_t *rdev, char *page)
2642 {
2643         if (rdev->raid_disk < 0)
2644                 return sprintf(page, "none\n");
2645         else
2646                 return sprintf(page, "%d\n", rdev->raid_disk);
2647 }
2648
2649 static ssize_t
2650 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2651 {
2652         char *e;
2653         int err;
2654         int slot = simple_strtoul(buf, &e, 10);
2655         if (strncmp(buf, "none", 4)==0)
2656                 slot = -1;
2657         else if (e==buf || (*e && *e!= '\n'))
2658                 return -EINVAL;
2659         if (rdev->mddev->pers && slot == -1) {
2660                 /* Setting 'slot' on an active array requires also
2661                  * updating the 'rd%d' link, and communicating
2662                  * with the personality with ->hot_*_disk.
2663                  * For now we only support removing
2664                  * failed/spare devices.  This normally happens automatically,
2665                  * but not when the metadata is externally managed.
2666                  */
2667                 if (rdev->raid_disk == -1)
2668                         return -EEXIST;
2669                 /* personality does all needed checks */
2670                 if (rdev->mddev->pers->hot_remove_disk == NULL)
2671                         return -EINVAL;
2672                 err = rdev->mddev->pers->
2673                         hot_remove_disk(rdev->mddev, rdev->raid_disk);
2674                 if (err)
2675                         return err;
2676                 sysfs_unlink_rdev(rdev->mddev, rdev);
2677                 rdev->raid_disk = -1;
2678                 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2679                 md_wakeup_thread(rdev->mddev->thread);
2680         } else if (rdev->mddev->pers) {
2681                 mdk_rdev_t *rdev2;
2682                 /* Activating a spare .. or possibly reactivating
2683                  * if we ever get bitmaps working here.
2684                  */
2685
2686                 if (rdev->raid_disk != -1)
2687                         return -EBUSY;
2688
2689                 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2690                         return -EBUSY;
2691
2692                 if (rdev->mddev->pers->hot_add_disk == NULL)
2693                         return -EINVAL;
2694
2695                 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2696                         if (rdev2->raid_disk == slot)
2697                                 return -EEXIST;
2698
2699                 if (slot >= rdev->mddev->raid_disks &&
2700                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2701                         return -ENOSPC;
2702
2703                 rdev->raid_disk = slot;
2704                 if (test_bit(In_sync, &rdev->flags))
2705                         rdev->saved_raid_disk = slot;
2706                 else
2707                         rdev->saved_raid_disk = -1;
2708                 err = rdev->mddev->pers->
2709                         hot_add_disk(rdev->mddev, rdev);
2710                 if (err) {
2711                         rdev->raid_disk = -1;
2712                         return err;
2713                 } else
2714                         sysfs_notify_dirent_safe(rdev->sysfs_state);
2715                 if (sysfs_link_rdev(rdev->mddev, rdev))
2716                         /* failure here is OK */;
2717                 /* don't wakeup anyone, leave that to userspace. */
2718         } else {
2719                 if (slot >= rdev->mddev->raid_disks &&
2720                     slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2721                         return -ENOSPC;
2722                 rdev->raid_disk = slot;
2723                 /* assume it is working */
2724                 clear_bit(Faulty, &rdev->flags);
2725                 clear_bit(WriteMostly, &rdev->flags);
2726                 set_bit(In_sync, &rdev->flags);
2727                 sysfs_notify_dirent_safe(rdev->sysfs_state);
2728         }
2729         return len;
2730 }
2731
2732
2733 static struct rdev_sysfs_entry rdev_slot =
2734 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2735
2736 static ssize_t
2737 offset_show(mdk_rdev_t *rdev, char *page)
2738 {
2739         return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2740 }
2741
2742 static ssize_t
2743 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2744 {
2745         char *e;
2746         unsigned long long offset = simple_strtoull(buf, &e, 10);
2747         if (e==buf || (*e && *e != '\n'))
2748                 return -EINVAL;
2749         if (rdev->mddev->pers && rdev->raid_disk >= 0)
2750                 return -EBUSY;
2751         if (rdev->sectors && rdev->mddev->external)
2752                 /* Must set offset before size, so overlap checks
2753                  * can be sane */
2754                 return -EBUSY;
2755         rdev->data_offset = offset;
2756         return len;
2757 }
2758
2759 static struct rdev_sysfs_entry rdev_offset =
2760 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2761
2762 static ssize_t
2763 rdev_size_show(mdk_rdev_t *rdev, char *page)
2764 {
2765         return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2766 }
2767
2768 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2769 {
2770         /* check if two start/length pairs overlap */
2771         if (s1+l1 <= s2)
2772                 return 0;
2773         if (s2+l2 <= s1)
2774                 return 0;
2775         return 1;
2776 }
2777
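/*
 * Convert a size in 1K blocks (as written to sysfs) into 512-byte
 * sectors, returning -EINVAL if the value cannot be parsed or would
 * overflow sector_t.
 */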
2778 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2779 {
2780         unsigned long long blocks;
2781         sector_t new;
2782
2783         if (strict_strtoull(buf, 10, &blocks) < 0)
2784                 return -EINVAL;
2785
2786         if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2787                 return -EINVAL; /* sector conversion overflow */
2788
2789         new = blocks * 2;
2790         if (new != blocks * 2)
2791                 return -EINVAL; /* unsigned long long to sector_t overflow */
2792
2793         *sectors = new;
2794         return 0;
2795 }
2796
2797 static ssize_t
2798 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2799 {
2800         mddev_t *my_mddev = rdev->mddev;
2801         sector_t oldsectors = rdev->sectors;
2802         sector_t sectors;
2803
2804         if (strict_blocks_to_sectors(buf, &sectors) < 0)
2805                 return -EINVAL;
2806         if (my_mddev->pers && rdev->raid_disk >= 0) {
2807                 if (my_mddev->persistent) {
2808                         sectors = super_types[my_mddev->major_version].
2809                                 rdev_size_change(rdev, sectors);
2810                         if (!sectors)
2811                                 return -EBUSY;
2812                 } else if (!sectors)
2813                         sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2814                                 rdev->data_offset;
2815         }
2816         if (sectors < my_mddev->dev_sectors)
2817                 return -EINVAL; /* component must fit device */
2818
2819         rdev->sectors = sectors;
2820         if (sectors > oldsectors && my_mddev->external) {
2821                 /* need to check that all other rdevs with the same ->bdev
2822                  * do not overlap.  We need to unlock the mddev to avoid
2823                  * a deadlock.  We have already changed rdev->sectors, and if
2824                  * we have to change it back, we will have the lock again.
2825                  */
2826                 mddev_t *mddev;
2827                 int overlap = 0;
2828                 struct list_head *tmp;
2829
2830                 mddev_unlock(my_mddev);
2831                 for_each_mddev(mddev, tmp) {
2832                         mdk_rdev_t *rdev2;
2833
2834                         mddev_lock(mddev);
2835                         list_for_each_entry(rdev2, &mddev->disks, same_set)
2836                                 if (rdev->bdev == rdev2->bdev &&
2837                                     rdev != rdev2 &&
2838                                     overlaps(rdev->data_offset, rdev->sectors,
2839                                              rdev2->data_offset,
2840                                              rdev2->sectors)) {
2841                                         overlap = 1;
2842                                         break;
2843                                 }
2844                         mddev_unlock(mddev);
2845                         if (overlap) {
2846                                 mddev_put(mddev);
2847                                 break;
2848                         }
2849                 }
2850                 mddev_lock(my_mddev);
2851                 if (overlap) {
2852                         /* Someone else could have slipped in a size
2853                          * change here, but doing so is just silly.
2854                          * We put oldsectors back because we *know* it is
2855                          * safe, and trust userspace not to race with
2856                          * itself
2857                          */
2858                         rdev->sectors = oldsectors;
2859                         return -EBUSY;
2860                 }
2861         }
2862         return len;
2863 }
2864
2865 static struct rdev_sysfs_entry rdev_size =
2866 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
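/*
 * Example usage (illustrative; the md0 and dev-sdb1 names are only examples):
 * the per-rdev "size" attribute is expressed in 1K blocks, so
 *
 *   echo 1048576 > /sys/block/md0/md/dev-sdb1/size
 *
 * asks for 1GB of that component to be used, subject to the overlap and
 * size checks in rdev_size_store() above.
 */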
2867
2868
2869 static ssize_t recovery_start_show(mdk_rdev_t *rdev, char *page)
2870 {
2871         unsigned long long recovery_start = rdev->recovery_offset;
2872
2873         if (test_bit(In_sync, &rdev->flags) ||
2874             recovery_start == MaxSector)
2875                 return sprintf(page, "none\n");
2876
2877         return sprintf(page, "%llu\n", recovery_start);
2878 }
2879
2880 static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2881 {
2882         unsigned long long recovery_start;
2883
2884         if (cmd_match(buf, "none"))
2885                 recovery_start = MaxSector;
2886         else if (strict_strtoull(buf, 10, &recovery_start))
2887                 return -EINVAL;
2888
2889         if (rdev->mddev->pers &&
2890             rdev->raid_disk >= 0)
2891                 return -EBUSY;
2892
2893         rdev->recovery_offset = recovery_start;
2894         if (recovery_start == MaxSector)
2895                 set_bit(In_sync, &rdev->flags);
2896         else
2897                 clear_bit(In_sync, &rdev->flags);
2898         return len;
2899 }
2900
2901 static struct rdev_sysfs_entry rdev_recovery_start =
2902 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2903
2904
2905 static ssize_t
2906 badblocks_show(struct badblocks *bb, char *page, int unack);
2907 static ssize_t
2908 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2909
2910 static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2911 {
2912         return badblocks_show(&rdev->badblocks, page, 0);
2913 }
2914 static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2915 {
2916         int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2917         /* Maybe that ack was all we needed */
2918         if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2919                 wake_up(&rdev->blocked_wait);
2920         return rv;
2921 }
2922 static struct rdev_sysfs_entry rdev_bad_blocks =
2923 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2924
2925
2926 static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
2927 {
2928         return badblocks_show(&rdev->badblocks, page, 1);
2929 }
2930 static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2931 {
2932         return badblocks_store(&rdev->badblocks, page, len, 1);
2933 }
2934 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2935 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2936
2937 static struct attribute *rdev_default_attrs[] = {
2938         &rdev_state.attr,
2939         &rdev_errors.attr,
2940         &rdev_slot.attr,
2941         &rdev_offset.attr,
2942         &rdev_size.attr,
2943         &rdev_recovery_start.attr,
2944         &rdev_bad_blocks.attr,
2945         &rdev_unack_bad_blocks.attr,
2946         NULL,
2947 };
2948 static ssize_t
2949 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2950 {
2951         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2952         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2953         mddev_t *mddev = rdev->mddev;
2954         ssize_t rv;
2955
2956         if (!entry->show)
2957                 return -EIO;
2958
2959         rv = mddev ? mddev_lock(mddev) : -EBUSY;
2960         if (!rv) {
2961                 if (rdev->mddev == NULL)
2962                         rv = -EBUSY;
2963                 else
2964                         rv = entry->show(rdev, page);
2965                 mddev_unlock(mddev);
2966         }
2967         return rv;
2968 }
2969
2970 static ssize_t
2971 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2972               const char *page, size_t length)
2973 {
2974         struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2975         mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2976         ssize_t rv;
2977         mddev_t *mddev = rdev->mddev;
2978
2979         if (!entry->store)
2980                 return -EIO;
2981         if (!capable(CAP_SYS_ADMIN))
2982                 return -EACCES;
2983         rv = mddev ? mddev_lock(mddev): -EBUSY;
2984         if (!rv) {
2985                 if (rdev->mddev == NULL)
2986                         rv = -EBUSY;
2987                 else
2988                         rv = entry->store(rdev, page, length);
2989                 mddev_unlock(mddev);
2990         }
2991         return rv;
2992 }
2993
2994 static void rdev_free(struct kobject *ko)
2995 {
2996         mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2997         kfree(rdev);
2998 }
2999 static const struct sysfs_ops rdev_sysfs_ops = {
3000         .show           = rdev_attr_show,
3001         .store          = rdev_attr_store,
3002 };
3003 static struct kobj_type rdev_ktype = {
3004         .release        = rdev_free,
3005         .sysfs_ops      = &rdev_sysfs_ops,
3006         .default_attrs  = rdev_default_attrs,
3007 };
3008
3009 int md_rdev_init(mdk_rdev_t *rdev)
3010 {
3011         rdev->desc_nr = -1;
3012         rdev->saved_raid_disk = -1;
3013         rdev->raid_disk = -1;
3014         rdev->flags = 0;
3015         rdev->data_offset = 0;
3016         rdev->sb_events = 0;
3017         rdev->last_read_error.tv_sec  = 0;
3018         rdev->last_read_error.tv_nsec = 0;
3019         rdev->sb_loaded = 0;
3020         rdev->bb_page = NULL;
3021         atomic_set(&rdev->nr_pending, 0);
3022         atomic_set(&rdev->read_errors, 0);
3023         atomic_set(&rdev->corrected_errors, 0);
3024
3025         INIT_LIST_HEAD(&rdev->same_set);
3026         init_waitqueue_head(&rdev->blocked_wait);
3027
3028         /* Add space to store bad block list.
3029          * This reserves the space even on arrays where it cannot
3030          * be used - I wonder if that matters
3031          */
3032         rdev->badblocks.count = 0;
3033         rdev->badblocks.shift = 0;
3034         rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3035         seqlock_init(&rdev->badblocks.lock);
3036         if (rdev->badblocks.page == NULL)
3037                 return -ENOMEM;
3038
3039         return 0;
3040 }
3041 EXPORT_SYMBOL_GPL(md_rdev_init);
3042 /*
3043  * Import a device. If 'super_format' >= 0, then sanity check the superblock.
3044  *
3045  * Mark the device faulty if:
3046  *
3047  *   - the device is nonexistent (zero size)
3048  *   - the device has no valid superblock
3049  *
3050  * A faulty rdev _never_ has rdev->sb set.
3051  */
3052 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
3053 {
3054         char b[BDEVNAME_SIZE];
3055         int err;
3056         mdk_rdev_t *rdev;
3057         sector_t size;
3058
3059         rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3060         if (!rdev) {
3061                 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3062                 return ERR_PTR(-ENOMEM);
3063         }
3064
3065         err = md_rdev_init(rdev);
3066         if (err)
3067                 goto abort_free;
3068         err = alloc_disk_sb(rdev);
3069         if (err)
3070                 goto abort_free;
3071
3072         err = lock_rdev(rdev, newdev, super_format == -2);
3073         if (err)
3074                 goto abort_free;
3075
3076         kobject_init(&rdev->kobj, &rdev_ktype);
3077
3078         size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3079         if (!size) {
3080                 printk(KERN_WARNING 
3081                         "md: %s has zero or unknown size, marking faulty!\n",
3082                         bdevname(rdev->bdev,b));
3083                 err = -EINVAL;
3084                 goto abort_free;
3085         }
3086
3087         if (super_format >= 0) {
3088                 err = super_types[super_format].
3089                         load_super(rdev, NULL, super_minor);
3090                 if (err == -EINVAL) {
3091                         printk(KERN_WARNING
3092                                 "md: %s does not have a valid v%d.%d "
3093                                "superblock, not importing!\n",
3094                                 bdevname(rdev->bdev,b),
3095                                super_format, super_minor);
3096                         goto abort_free;
3097                 }
3098                 if (err < 0) {
3099                         printk(KERN_WARNING 
3100                                 "md: could not read %s's sb, not importing!\n",
3101                                 bdevname(rdev->bdev,b));
3102                         goto abort_free;
3103                 }
3104         }
3105         if (super_format == -1)
3106                 /* hot-add for 0.90, or non-persistent: so no badblocks */
3107                 rdev->badblocks.shift = -1;
3108
3109         return rdev;
3110
3111 abort_free:
3112         if (rdev->bdev)
3113                 unlock_rdev(rdev);
3114         free_disk_sb(rdev);
3115         kfree(rdev->badblocks.page);
3116         kfree(rdev);
3117         return ERR_PTR(err);
3118 }
3119
3120 /*
3121  * Check a full RAID array for plausibility
3122  */
3123
3124
3125 static void analyze_sbs(mddev_t * mddev)
3126 {
3127         int i;
3128         mdk_rdev_t *rdev, *freshest, *tmp;
3129         char b[BDEVNAME_SIZE];
3130
3131         freshest = NULL;
3132         rdev_for_each(rdev, tmp, mddev)
3133                 switch (super_types[mddev->major_version].
3134                         load_super(rdev, freshest, mddev->minor_version)) {
3135                 case 1:
3136                         freshest = rdev;
3137                         break;
3138                 case 0:
3139                         break;
3140                 default:
3141                         printk(KERN_ERR
3142                                 "md: fatal superblock inconsistency in %s"
3143                                 " -- removing from array\n",
3144                                 bdevname(rdev->bdev,b));
3145                         kick_rdev_from_array(rdev);
3146                 }
3147
3148
3149         super_types[mddev->major_version].
3150                 validate_super(mddev, freshest);
3151
3152         i = 0;
3153         rdev_for_each(rdev, tmp, mddev) {
3154                 if (mddev->max_disks &&
3155                     (rdev->desc_nr >= mddev->max_disks ||
3156                      i > mddev->max_disks)) {
3157                         printk(KERN_WARNING
3158                                "md: %s: %s: only %d devices permitted\n",
3159                                mdname(mddev), bdevname(rdev->bdev, b),
3160                                mddev->max_disks);
3161                         kick_rdev_from_array(rdev);
3162                         continue;
3163                 }
3164                 if (rdev != freshest)
3165                         if (super_types[mddev->major_version].
3166                             validate_super(mddev, rdev)) {
3167                                 printk(KERN_WARNING "md: kicking non-fresh %s"
3168                                         " from array!\n",
3169                                         bdevname(rdev->bdev,b));
3170                                 kick_rdev_from_array(rdev);
3171                                 continue;
3172                         }
3173                 if (mddev->level == LEVEL_MULTIPATH) {
3174                         rdev->desc_nr = i++;
3175                         rdev->raid_disk = rdev->desc_nr;
3176                         set_bit(In_sync, &rdev->flags);
3177                 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3178                         rdev->raid_disk = -1;
3179                         clear_bit(In_sync, &rdev->flags);
3180                 }
3181         }
3182 }
3183
3184 /* Read a fixed-point number.
3185  * Numbers in sysfs attributes should be in "standard" units where
3186  * possible, so time should be in seconds.
3187  * However we internally use a much smaller unit such as
3188  * milliseconds or jiffies.
3189  * This function takes a decimal number with a possible fractional
3190  * component, and produces an integer which is the result of
3191  * multiplying that number by 10^'scale',
3192  * all without any floating-point arithmetic.
3193  */
3194 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3195 {
3196         unsigned long result = 0;
3197         long decimals = -1;
3198         while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3199                 if (*cp == '.')
3200                         decimals = 0;
3201                 else if (decimals < scale) {
3202                         unsigned int value;
3203                         value = *cp - '0';
3204                         result = result * 10 + value;
3205                         if (decimals >= 0)
3206                                 decimals++;
3207                 }
3208                 cp++;
3209         }
3210         if (*cp == '\n')
3211                 cp++;
3212         if (*cp)
3213                 return -EINVAL;
3214         if (decimals < 0)
3215                 decimals = 0;
3216         while (decimals < scale) {
3217                 result *= 10;
3218                 decimals ++;
3219         }
3220         *res = result;
3221         return 0;
3222 }
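/*
 * Worked examples (illustrative, derived from the code above), with scale == 3
 * (seconds expressed in milliseconds):
 *
 *   strict_strtoul_scaled("1.5",    &res, 3) -> res == 1500
 *   strict_strtoul_scaled("2",      &res, 3) -> res == 2000
 *   strict_strtoul_scaled("0.2514", &res, 3) -> res == 251  (extra digits dropped)
 *
 * A single trailing newline is accepted; any other trailing character is -EINVAL.
 */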
3223
3224
3225 static void md_safemode_timeout(unsigned long data);
3226
3227 static ssize_t
3228 safe_delay_show(mddev_t *mddev, char *page)
3229 {
3230         int msec = (mddev->safemode_delay*1000)/HZ;
3231         return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3232 }
3233 static ssize_t
3234 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
3235 {
3236         unsigned long msec;
3237
3238         if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3239                 return -EINVAL;
3240         if (msec == 0)
3241                 mddev->safemode_delay = 0;
3242         else {
3243                 unsigned long old_delay = mddev->safemode_delay;
3244                 mddev->safemode_delay = (msec*HZ)/1000;
3245                 if (mddev->safemode_delay == 0)
3246                         mddev->safemode_delay = 1;
3247                 if (mddev->safemode_delay < old_delay)
3248                         md_safemode_timeout((unsigned long)mddev);
3249         }
3250         return len;
3251 }
3252 static struct md_sysfs_entry md_safe_delay =
3253 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
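/*
 * Example usage (illustrative; the md0 path is only an example):
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay
 *
 * requests a 200 millisecond safe-mode delay; the value is parsed with
 * strict_strtoul_scaled(..., 3), converted to jiffies and rounded up to at
 * least one jiffy, while writing 0 clears the delay entirely.
 */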
3254
3255 static ssize_t
3256 level_show(mddev_t *mddev, char *page)
3257 {
3258         struct mdk_personality *p = mddev->pers;
3259         if (p)
3260                 return sprintf(page, "%s\n", p->name);
3261         else if (mddev->clevel[0])
3262                 return sprintf(page, "%s\n", mddev->clevel);
3263         else if (mddev->level != LEVEL_NONE)
3264                 return sprintf(page, "%d\n", mddev->level);
3265         else
3266                 return 0;
3267 }
3268
3269 static ssize_t
3270 level_store(mddev_t *mddev, const char *buf, size_t len)
3271 {
3272         char clevel[16];
3273         ssize_t rv = len;
3274         struct mdk_personality *pers;
3275         long level;
3276         void *priv;
3277         mdk_rdev_t *rdev;
3278
3279         if (mddev->pers == NULL) {
3280                 if (len == 0)
3281                         return 0;
3282                 if (len >= sizeof(mddev->clevel))
3283                         return -ENOSPC;
3284                 strncpy(mddev->clevel, buf, len);
3285                 if (mddev->clevel[len-1] == '\n')
3286                         len--;
3287                 mddev->clevel[len] = 0;
3288                 mddev->level = LEVEL_NONE;
3289                 return rv;
3290         }
3291
3292         /* Request to change the personality.  Need to ensure:
3293          *  - array is not engaged in resync/recovery/reshape
3294          *  - old personality can be suspended
3295          *  - new personality can take over the existing array.
3296          */
3297
3298         if (mddev->sync_thread ||
3299             mddev->reshape_position != MaxSector ||
3300             mddev->sysfs_active)
3301                 return -EBUSY;
3302
3303         if (!mddev->pers->quiesce) {
3304                 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3305                        mdname(mddev), mddev->pers->name);
3306                 return -EINVAL;
3307         }
3308
3309         /* Now find the new personality */
3310         if (len == 0 || len >= sizeof(clevel))
3311                 return -EINVAL;
3312         strncpy(clevel, buf, len);
3313         if (clevel[len-1] == '\n')
3314                 len--;
3315         clevel[len] = 0;
3316         if (strict_strtol(clevel, 10, &level))
3317                 level = LEVEL_NONE;
3318
3319         if (request_module("md-%s", clevel) != 0)
3320                 request_module("md-level-%s", clevel);
3321         spin_lock(&pers_lock);
3322         pers = find_pers(level, clevel);
3323         if (!pers || !try_module_get(pers->owner)) {
3324                 spin_unlock(&pers_lock);
3325                 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3326                 return -EINVAL;
3327         }
3328         spin_unlock(&pers_lock);
3329
3330         if (pers == mddev->pers) {
3331                 /* Nothing to do! */
3332                 module_put(pers->owner);
3333                 return rv;
3334         }
3335         if (!pers->takeover) {
3336                 module_put(pers->owner);
3337                 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3338                        mdname(mddev), clevel);
3339                 return -EINVAL;
3340         }
3341
3342         list_for_each_entry(rdev, &mddev->disks, same_set)
3343                 rdev->new_raid_disk = rdev->raid_disk;
3344
3345         /* ->takeover must set new_* and/or delta_disks
3346          * if it succeeds, and may set them when it fails.
3347          */
3348         priv = pers->takeover(mddev);
3349         if (IS_ERR(priv)) {
3350                 mddev->new_level = mddev->level;
3351                 mddev->new_layout = mddev->layout;
3352                 mddev->new_chunk_sectors = mddev->chunk_sectors;
3353                 mddev->raid_disks -= mddev->delta_disks;
3354                 mddev->delta_disks = 0;
3355                 module_put(pers->owner);
3356                 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3357                        mdname(mddev), clevel);
3358                 return PTR_ERR(priv);
3359         }
3360
3361         /* Looks like we have a winner */
3362         mddev_suspend(mddev);
3363         mddev->pers->stop(mddev);
3364         
3365         if (mddev->pers->sync_request == NULL &&
3366             pers->sync_request != NULL) {
3367                 /* need to add the md_redundancy_group */
3368                 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3369                         printk(KERN_WARNING
3370                                "md: cannot register extra attributes for %s\n",
3371                                mdname(mddev));
3372                 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
3373         }               
3374         if (mddev->pers->sync_request != NULL &&
3375             pers->sync_request == NULL) {
3376                 /* need to remove the md_redundancy_group */
3377                 if (mddev->to_remove == NULL)
3378                         mddev->to_remove = &md_redundancy_group;
3379         }
3380
3381         if (mddev->pers->sync_request == NULL &&
3382             mddev->external) {
3383                 /* We are converting from a no-redundancy array
3384                  * to a redundancy array and metadata is managed
3385                  * externally so we need to be sure that writes
3386                  * won't block due to a need to transition
3387                  *      clean->dirty
3388                  * until external management is started.
3389                  */
3390                 mddev->in_sync = 0;
3391                 mddev->safemode_delay = 0;
3392                 mddev->safemode = 0;
3393         }
3394
3395         list_for_each_entry(rdev, &mddev->disks, same_set) {
3396                 if (rdev->raid_disk < 0)
3397                         continue;
3398                 if (rdev->new_raid_disk >= mddev->raid_disks)
3399                         rdev->new_raid_disk = -1;
3400                 if (rdev->new_raid_disk == rdev->raid_disk)
3401                         continue;
3402                 sysfs_unlink_rdev(mddev, rdev);
3403         }
3404         list_for_each_entry(rdev, &mddev->disks, same_set) {
3405                 if (rdev->raid_disk < 0)
3406                         continue;
3407                 if (rdev->new_raid_disk == rdev->raid_disk)
3408                         continue;
3409                 rdev->raid_disk = rdev->new_raid_disk;
3410                 if (rdev->raid_disk < 0)
3411                         clear_bit(In_sync, &rdev->flags);
3412                 else {
3413                         if (sysfs_link_rdev(mddev, rdev))
3414                                 printk(KERN_WARNING "md: cannot register rd%d"
3415                                        " for %s after level change\n",
3416                                        rdev->raid_disk, mdname(mddev));
3417                 }
3418         }
3419
3420         module_put(mddev->pers->owner);
3421         mddev->pers = pers;
3422         mddev->private = priv;
3423         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3424         mddev->level = mddev->new_level;
3425         mddev->layout = mddev->new_layout;
3426         mddev->chunk_sectors = mddev->new_chunk_sectors;
3427         mddev->delta_disks = 0;
3428         mddev->degraded = 0;
3429         if (mddev->pers->sync_request == NULL) {
3430                 /* this is now an array without redundancy, so
3431                  * it must always be in_sync
3432                  */
3433                 mddev->in_sync = 1;
3434                 del_timer_sync(&mddev->safemode_timer);
3435         }
3436         pers->run(mddev);
3437         mddev_resume(mddev);
3438         set_bit(MD_CHANGE_DEVS, &mddev->flags);
3439         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3440         md_wakeup_thread(mddev->thread);
3441         sysfs_notify(&mddev->kobj, NULL, "level");
3442         md_new_event(mddev);
3443         return rv;
3444 }
3445
3446 static struct md_sysfs_entry md_level =
3447 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
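/*
 * Example usage (illustrative; the md0 path is only an example):
 *
 *   echo raid5 > /sys/block/md0/md/level
 *
 * On an inactive array this merely records the requested level; on a running
 * array it attempts an online takeover to the new personality and fails with
 * -EINVAL or -EBUSY if either personality cannot support the conversion.
 */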
3448
3449
3450 static ssize_t
3451 layout_show(mddev_t *mddev, char *page)
3452 {
3453         /* just a number, not meaningful for all levels */
3454         if (mddev->reshape_position != MaxSector &&
3455             mddev->layout != mddev->new_layout)
3456                 return sprintf(page, "%d (%d)\n",
3457                                mddev->new_layout, mddev->layout);
3458         return sprintf(page, "%d\n", mddev->layout);
3459 }
3460
3461 static ssize_t
3462 layout_store(mddev_t *mddev, const char *buf, size_t len)
3463 {
3464         char *e;
3465         unsigned long n = simple_strtoul(buf, &e, 10);
3466
3467         if (!*buf || (*e && *e != '\n'))
3468                 return -EINVAL;
3469
3470         if (mddev->pers) {
3471                 int err;
3472                 if (mddev->pers->check_reshape == NULL)
3473                         return -EBUSY;
3474                 mddev->new_layout = n;
3475                 err = mddev->pers->check_reshape(mddev);
3476                 if (err) {
3477                         mddev->new_layout = mddev->layout;
3478                         return err;
3479                 }
3480         } else {
3481                 mddev->new_layout = n;
3482                 if (mddev->reshape_position == MaxSector)
3483                         mddev->layout = n;
3484         }
3485         return len;
3486 }
3487 static struct md_sysfs_entry md_layout =
3488 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3489
3490
3491 static ssize_t
3492 raid_disks_show(mddev_t *mddev, char *page)
3493 {
3494         if (mddev->raid_disks == 0)
3495                 return 0;
3496         if (mddev->reshape_position != MaxSector &&
3497             mddev->delta_disks != 0)
3498                 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3499                                mddev->raid_disks - mddev->delta_disks);
3500         return sprintf(page, "%d\n", mddev->raid_disks);
3501 }
3502
3503 static int update_raid_disks(mddev_t *mddev, int raid_disks);
3504
3505 static ssize_t
3506 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
3507 {
3508         char *e;
3509         int rv = 0;
3510         unsigned long n = simple_strtoul(buf, &e, 10);
3511
3512         if (!*buf || (*e && *e != '\n'))
3513                 return -EINVAL;
3514
3515         if (mddev->pers)
3516                 rv = update_raid_disks(mddev, n);
3517         else if (mddev->reshape_position != MaxSector) {
3518                 int olddisks = mddev->raid_disks - mddev->delta_disks;
3519                 mddev->delta_disks = n - olddisks;
3520                 mddev->raid_disks = n;
3521         } else
3522                 mddev->raid_disks = n;
3523         return rv ? rv : len;
3524 }
3525 static struct md_sysfs_entry md_raid_disks =
3526 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3527
3528 static ssize_t
3529 chunk_size_show(mddev_t *mddev, char *page)
3530 {
3531         if (mddev->reshape_position != MaxSector &&
3532             mddev->chunk_sectors != mddev->new_chunk_sectors)
3533                 return sprintf(page, "%d (%d)\n",
3534                                mddev->new_chunk_sectors << 9,
3535                                mddev->chunk_sectors << 9);
3536         return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3537 }
3538
3539 static ssize_t
3540 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
3541 {
3542         char *e;
3543         unsigned long n = simple_strtoul(buf, &e, 10);
3544
3545         if (!*buf || (*e && *e != '\n'))
3546                 return -EINVAL;
3547
3548         if (mddev->pers) {
3549                 int err;
3550                 if (mddev->pers->check_reshape == NULL)
3551                         return -EBUSY;
3552                 mddev->new_chunk_sectors = n >> 9;
3553                 err = mddev->pers->check_reshape(mddev);
3554                 if (err) {
3555                         mddev->new_chunk_sectors = mddev->chunk_sectors;
3556                         return err;
3557                 }
3558         } else {
3559                 mddev->new_chunk_sectors = n >> 9;
3560                 if (mddev->reshape_position == MaxSector)
3561                         mddev->chunk_sectors = n >> 9;
3562         }
3563         return len;
3564 }
3565 static struct md_sysfs_entry md_chunk_size =
3566 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3567
3568 static ssize_t
3569 resync_start_show(mddev_t *mddev, char *page)
3570 {
3571         if (mddev->recovery_cp == MaxSector)
3572                 return sprintf(page, "none\n");
3573         return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3574 }
3575
3576 static ssize_t
3577 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
3578 {
3579         char *e;
3580         unsigned long long n = simple_strtoull(buf, &e, 10);
3581
3582         if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3583                 return -EBUSY;
3584         if (cmd_match(buf, "none"))
3585                 n = MaxSector;
3586         else if (!*buf || (*e && *e != '\n'))
3587                 return -EINVAL;
3588
3589         mddev->recovery_cp = n;
3590         return len;
3591 }
3592 static struct md_sysfs_entry md_resync_start =
3593 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3594
3595 /*
3596  * The array state can be:
3597  *
3598  * clear
3599  *     No devices, no size, no level
3600  *     Equivalent to STOP_ARRAY ioctl
3601  * inactive
3602  *     May have some settings, but array is not active
3603  *        all IO results in error
3604  *     When written, doesn't tear down array, but just stops it
3605  * suspended (not supported yet)
3606  *     All IO requests will block. The array can be reconfigured.
3607  *     Writing this, if accepted, will block until array is quiescent
3608  * readonly
3609  *     no resync can happen.  no superblocks get written.
3610  *     write requests fail
3611  * read-auto
3612  *     like readonly, but behaves like 'clean' on a write request.
3613  *
3614  * clean - no pending writes, but otherwise active.
3615  *     When written to inactive array, starts without resync
3616  *     If a write request arrives then
3617  *       if metadata is known, mark 'dirty' and switch to 'active'.
3618  *       if not known, block and switch to write-pending
3619  *     If written to an active array that has pending writes, then fails.
3620  * active
3621  *     fully active: IO and resync can be happening.
3622  *     When written to inactive array, starts with resync
3623  *
3624  * write-pending
3625  *     clean, but writes are blocked waiting for 'active' to be written.
3626  *
3627  * active-idle
3628  *     like active, but no writes have been seen for a while (100msec).
3629  *
3630  */
3631 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3632                    write_pending, active_idle, bad_word};
3633 static char *array_states[] = {
3634         "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3635         "write-pending", "active-idle", NULL };
3636
3637 static int match_word(const char *word, char **list)
3638 {
3639         int n;
3640         for (n=0; list[n]; n++)
3641                 if (cmd_match(word, list[n]))
3642                         break;
3643         return n;
3644 }
3645
3646 static ssize_t
3647 array_state_show(mddev_t *mddev, char *page)
3648 {
3649         enum array_state st = inactive;
3650
3651         if (mddev->pers)
3652                 switch(mddev->ro) {
3653                 case 1:
3654                         st = readonly;
3655                         break;
3656                 case 2:
3657                         st = read_auto;
3658                         break;
3659                 case 0:
3660                         if (mddev->in_sync)
3661                                 st = clean;
3662                         else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3663                                 st = write_pending;
3664                         else if (mddev->safemode)
3665                                 st = active_idle;
3666                         else
3667                                 st = active;
3668                 }
3669         else {
3670                 if (list_empty(&mddev->disks) &&
3671                     mddev->raid_disks == 0 &&
3672                     mddev->dev_sectors == 0)
3673                         st = clear;
3674                 else
3675                         st = inactive;
3676         }
3677         return sprintf(page, "%s\n", array_states[st]);
3678 }
3679
3680 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3681 static int md_set_readonly(mddev_t * mddev, int is_open);
3682 static int do_md_run(mddev_t * mddev);
3683 static int restart_array(mddev_t *mddev);
3684
3685 static ssize_t
3686 array_state_store(mddev_t *mddev, const char *buf, size_t len)
3687 {
3688         int err = -EINVAL;
3689         enum array_state st = match_word(buf, array_states);
3690         switch(st) {
3691         case bad_word:
3692                 break;
3693         case clear:
3694                 /* stopping an active array */
3695                 if (atomic_read(&mddev->openers) > 0)
3696                         return -EBUSY;
3697                 err = do_md_stop(mddev, 0, 0);
3698                 break;
3699         case inactive:
3700                 /* stopping an active array */
3701                 if (mddev->pers) {
3702                         if (atomic_read(&mddev->openers) > 0)
3703                                 return -EBUSY;
3704                         err = do_md_stop(mddev, 2, 0);
3705                 } else
3706                         err = 0; /* already inactive */
3707                 break;
3708         case suspended:
3709                 break; /* not supported yet */
3710         case readonly:
3711                 if (mddev->pers)
3712                         err = md_set_readonly(mddev, 0);
3713                 else {
3714                         mddev->ro = 1;
3715                         set_disk_ro(mddev->gendisk, 1);
3716                         err = do_md_run(mddev);
3717                 }
3718                 break;
3719         case read_auto:
3720                 if (mddev->pers) {
3721                         if (mddev->ro == 0)
3722                                 err = md_set_readonly(mddev, 0);
3723                         else if (mddev->ro == 1)
3724                                 err = restart_array(mddev);
3725                         if (err == 0) {
3726                                 mddev->ro = 2;
3727                                 set_disk_ro(mddev->gendisk, 0);
3728                         }
3729                 } else {
3730                         mddev->ro = 2;
3731                         err = do_md_run(mddev);
3732                 }
3733                 break;
3734         case clean:
3735                 if (mddev->pers) {
3736                         restart_array(mddev);
3737                         spin_lock_irq(&mddev->write_lock);
3738                         if (atomic_read(&mddev->writes_pending) == 0) {
3739                                 if (mddev->in_sync == 0) {
3740                                         mddev->in_sync = 1;
3741                                         if (mddev->safemode == 1)
3742                                                 mddev->safemode = 0;
3743                                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3744                                 }
3745                                 err = 0;
3746                         } else
3747                                 err = -EBUSY;
3748                         spin_unlock_irq(&mddev->write_lock);
3749                 } else
3750                         err = -EINVAL;
3751                 break;
3752         case active:
3753                 if (mddev->pers) {
3754                         restart_array(mddev);
3755                         clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3756                         wake_up(&mddev->sb_wait);
3757                         err = 0;
3758                 } else {
3759                         mddev->ro = 0;
3760                         set_disk_ro(mddev->gendisk, 0);
3761                         err = do_md_run(mddev);
3762                 }
3763                 break;
3764         case write_pending:
3765         case active_idle:
3766                 /* these cannot be set */
3767                 break;
3768         }
3769         if (err)
3770                 return err;
3771         else {
3772                 sysfs_notify_dirent_safe(mddev->sysfs_state);
3773                 return len;
3774         }
3775 }
3776 static struct md_sysfs_entry md_array_state =
3777 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
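/*
 * Example usage (illustrative; the md0 path is only an example):
 *
 *   cat /sys/block/md0/md/array_state             # e.g. "clean" or "active"
 *   echo readonly > /sys/block/md0/md/array_state
 *   echo clean    > /sys/block/md0/md/array_state
 *
 * The last write only succeeds on a running array with no writes pending, as
 * implemented in the 'clean' case of array_state_store() above.
 */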
3778
3779 static ssize_t
3780 max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3781         return sprintf(page, "%d\n",
3782                        atomic_read(&mddev->max_corr_read_errors));
3783 }
3784
3785 static ssize_t
3786 max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3787 {
3788         char *e;
3789         unsigned long n = simple_strtoul(buf, &e, 10);
3790
3791         if (*buf && (*e == 0 || *e == '\n')) {
3792                 atomic_set(&mddev->max_corr_read_errors, n);
3793                 return len;
3794         }
3795         return -EINVAL;
3796 }
3797
3798 static struct md_sysfs_entry max_corr_read_errors =
3799 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3800         max_corrected_read_errors_store);
3801
3802 static ssize_t
3803 null_show(mddev_t *mddev, char *page)
3804 {
3805         return -EINVAL;
3806 }
3807
3808 static ssize_t
3809 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3810 {
3811         /* buf must be of the form %d:%d, optionally newline-terminated, giving major and minor numbers */
3812         /* The new device is added to the array.
3813          * If the array has a persistent superblock, we read the
3814          * superblock to initialise info and check validity.
3815          * Otherwise, only checking done is that in bind_rdev_to_array,
3816          * which mainly checks size.
3817          */
3818         char *e;
3819         int major = simple_strtoul(buf, &e, 10);
3820         int minor;
3821         dev_t dev;
3822         mdk_rdev_t *rdev;
3823         int err;
3824
3825         if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3826                 return -EINVAL;
3827         minor = simple_strtoul(e+1, &e, 10);
3828         if (*e && *e != '\n')
3829                 return -EINVAL;
3830         dev = MKDEV(major, minor);
3831         if (major != MAJOR(dev) ||
3832             minor != MINOR(dev))
3833                 return -EOVERFLOW;
3834
3835
3836         if (mddev->persistent) {
3837                 rdev = md_import_device(dev, mddev->major_version,
3838                                         mddev->minor_version);
3839                 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3840                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3841                                                        mdk_rdev_t, same_set);
3842                         err = super_types[mddev->major_version]
3843                                 .load_super(rdev, rdev0, mddev->minor_version);
3844                         if (err < 0)
3845                                 goto out;
3846                 }
3847         } else if (mddev->external)
3848                 rdev = md_import_device(dev, -2, -1);
3849         else
3850                 rdev = md_import_device(dev, -1, -1);
3851
3852         if (IS_ERR(rdev))
3853                 return PTR_ERR(rdev);
3854         err = bind_rdev_to_array(rdev, mddev);
3855  out:
3856         if (err)
3857                 export_rdev(rdev);
3858         return err ? err : len;
3859 }
3860
3861 static struct md_sysfs_entry md_new_device =
3862 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
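/*
 * Example usage (illustrative; the md0 path and device numbers are only
 * examples):
 *
 *   echo 8:16 > /sys/block/md0/md/new_dev
 *
 * adds the block device with major 8, minor 16 (conventionally /dev/sdb) to
 * the array, loading its superblock first if the array uses persistent
 * metadata.
 */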
3863
3864 static ssize_t
3865 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3866 {
3867         char *end;
3868         unsigned long chunk, end_chunk;
3869
3870         if (!mddev->bitmap)
3871                 goto out;
3872         /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3873         while (*buf) {
3874                 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3875                 if (buf == end) break;
3876                 if (*end == '-') { /* range */
3877                         buf = end + 1;
3878                         end_chunk = simple_strtoul(buf, &end, 0);
3879                         if (buf == end) break;
3880                 }
3881                 if (*end && !isspace(*end)) break;
3882                 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3883                 buf = skip_spaces(end);
3884         }
3885         bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3886 out:
3887         return len;
3888 }
3889
3890 static struct md_sysfs_entry md_bitmap =
3891 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
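/*
 * Example usage (illustrative; the md0 path is only an example):
 *
 *   echo "0-100 500" > /sys/block/md0/md/bitmap_set_bits
 *
 * marks bitmap chunks 0 through 100 and chunk 500 dirty so that those regions
 * will be resynced; the write is silently accepted but ignored when the array
 * has no bitmap.
 */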
3892
3893 static ssize_t
3894 size_show(mddev_t *mddev, char *page)
3895 {
3896         return sprintf(page, "%llu\n",
3897                 (unsigned long long)mddev->dev_sectors / 2);
3898 }
3899
3900 static int update_size(mddev_t *mddev, sector_t num_sectors);
3901
3902 static ssize_t
3903 size_store(mddev_t *mddev, const char *buf, size_t len)
3904 {
3905         /* If array is inactive, we can reduce the component size, but
3906          * not increase it (except from 0).
3907          * If array is active, we can try an on-line resize
3908          */
3909         sector_t sectors;
3910         int err = strict_blocks_to_sectors(buf, &sectors);
3911
3912         if (err < 0)
3913                 return err;
3914         if (mddev->pers) {
3915                 err = update_size(mddev, sectors);
3916                 md_update_sb(mddev, 1);
3917         } else {
3918                 if (mddev->dev_sectors == 0 ||
3919                     mddev->dev_sectors > sectors)
3920                         mddev->dev_sectors = sectors;
3921                 else
3922                         err = -ENOSPC;
3923         }
3924         return err ? err : len;
3925 }
3926
3927 static struct md_sysfs_entry md_size =
3928 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3929
3930
3931 /* Metadata version.
3932  * This is one of
3933  *   'none' for arrays with no metadata (good luck...)
3934  *   'external' for arrays with externally managed metadata,
3935  * or N.M for internally known formats
3936  */
3937 static ssize_t
3938 metadata_show(mddev_t *mddev, char *page)
3939 {
3940         if (mddev->persistent)
3941                 return sprintf(page, "%d.%d\n",
3942                                mddev->major_version, mddev->minor_version);
3943         else if (mddev->external)
3944                 return sprintf(page, "external:%s\n", mddev->metadata_type);
3945         else
3946                 return sprintf(page, "none\n");
3947 }
3948
3949 static ssize_t
3950 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3951 {
3952         int major, minor;
3953         char *e;
3954         /* Changing the details of 'external' metadata is
3955          * always permitted.  Otherwise there must be
3956          * no devices attached to the array.
3957          */
3958         if (mddev->external && strncmp(buf, "external:", 9) == 0)
3959                 ;
3960         else if (!list_empty(&mddev->disks))
3961                 return -EBUSY;
3962
3963         if (cmd_match(buf, "none")) {
3964                 mddev->persistent = 0;
3965                 mddev->external = 0;
3966                 mddev->major_version = 0;
3967                 mddev->minor_version = 90;
3968                 return len;
3969         }
3970         if (strncmp(buf, "external:", 9) == 0) {
3971                 size_t namelen = len-9;
3972                 if (namelen >= sizeof(mddev->metadata_type))
3973                         namelen = sizeof(mddev->metadata_type)-1;
3974                 strncpy(mddev->metadata_type, buf+9, namelen);
3975                 mddev->metadata_type[namelen] = 0;
3976                 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3977                         mddev->metadata_type[--namelen] = 0;
3978                 mddev->persistent = 0;
3979                 mddev->external = 1;
3980                 mddev->major_version = 0;
3981                 mddev->minor_version = 90;
3982                 return len;
3983         }
3984         major = simple_strtoul(buf, &e, 10);
3985         if (e==buf || *e != '.')
3986                 return -EINVAL;
3987         buf = e+1;
3988         minor = simple_strtoul(buf, &e, 10);
3989         if (e==buf || (*e && *e != '\n') )
3990                 return -EINVAL;
3991         if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3992                 return -ENOENT;
3993         mddev->major_version = major;
3994         mddev->minor_version = minor;
3995         mddev->persistent = 1;
3996         mddev->external = 0;
3997         return len;
3998 }
3999
4000 static struct md_sysfs_entry md_metadata =
4001 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
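/*
 * Example usage (illustrative; the md0 path and the "imsm" metadata name are
 * only examples):
 *
 *   echo 1.2           > /sys/block/md0/md/metadata_version   # v1.2 superblocks
 *   echo external:imsm > /sys/block/md0/md/metadata_version   # externally managed
 *   echo none          > /sys/block/md0/md/metadata_version
 *
 * Switching between these is only allowed while no devices are attached; the
 * "external:" description itself may always be updated.
 */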
4002
4003 static ssize_t
4004 action_show(mddev_t *mddev, char *page)
4005 {
4006         char *type = "idle";
4007         if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4008                 type = "frozen";
4009         else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4010             (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
4011                 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4012                         type = "reshape";
4013                 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4014                         if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4015                                 type = "resync";
4016                         else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4017                                 type = "check";
4018                         else
4019                                 type = "repair";
4020                 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4021                         type = "recover";
4022         }
4023         return sprintf(page, "%s\n", type);
4024 }
4025
4026 static void reap_sync_thread(mddev_t *mddev);
4027
4028 static ssize_t
4029 action_store(mddev_t *mddev, const char *page, size_t len)
4030 {
4031         if (!mddev->pers || !mddev->pers->sync_request)
4032                 return -EINVAL;
4033
4034         if (cmd_match(page, "frozen"))
4035                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4036         else
4037                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4038
4039         if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4040                 if (mddev->sync_thread) {
4041                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4042                         reap_sync_thread(mddev);
4043                 }
4044         } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4045                    test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4046                 return -EBUSY;
4047         else if (cmd_match(page, "resync"))
4048                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4049         else if (cmd_match(page, "recover")) {
4050                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4051                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4052         } else if (cmd_match(page, "reshape")) {
4053                 int err;
4054                 if (mddev->pers->start_reshape == NULL)
4055                         return -EINVAL;
4056                 err = mddev->pers->start_reshape(mddev);
4057                 if (err)
4058                         return err;
4059                 sysfs_notify(&mddev->kobj, NULL, "degraded");
4060         } else {
4061                 if (cmd_match(page, "check"))
4062                         set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4063                 else if (!cmd_match(page, "repair"))
4064                         return -EINVAL;
4065                 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4066                 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4067         }
4068         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4069         md_wakeup_thread(mddev->thread);
4070         sysfs_notify_dirent_safe(mddev->sysfs_action);
4071         return len;
4072 }
4073
4074 static ssize_t
4075 mismatch_cnt_show(mddev_t *mddev, char *page)
4076 {
4077         return sprintf(page, "%llu\n",
4078                        (unsigned long long) mddev->resync_mismatches);
4079 }
4080
4081 static struct md_sysfs_entry md_scan_mode =
4082 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
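/*
 * Example usage (illustrative; the md0 path is only an example, and the
 * check/repair behaviour is implemented by the individual personalities):
 *
 *   echo check  > /sys/block/md0/md/sync_action   # scan and count mismatches
 *   echo repair > /sys/block/md0/md/sync_action   # scan and rewrite mismatches
 *   echo idle   > /sys/block/md0/md/sync_action   # interrupt the current action
 *
 * Both "check" and "repair" set MD_RECOVERY_REQUESTED and MD_RECOVERY_SYNC;
 * mismatches found are reported via the mismatch_cnt attribute below.
 */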
4083
4084
4085 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4086
4087 static ssize_t
4088 sync_min_show(mddev_t *mddev, char *page)
4089 {
4090         return sprintf(page, "%d (%s)\n", speed_min(mddev),
4091                        mddev->sync_speed_min ? "local": "system");
4092 }
4093
4094 static ssize_t
4095 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
4096 {
4097         int min;
4098         char *e;
4099         if (strncmp(buf, "system", 6)==0) {
4100                 mddev->sync_speed_min = 0;
4101                 return len;
4102         }
4103         min = simple_strtoul(buf, &e, 10);
4104         if (buf == e || (*e && *e != '\n') || min <= 0)
4105                 return -EINVAL;
4106         mddev->sync_speed_min = min;
4107         return len;
4108 }
4109
4110 static struct md_sysfs_entry md_sync_min =
4111 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4112
4113 static ssize_t
4114 sync_max_show(mddev_t *mddev, char *page)
4115 {
4116         return sprintf(page, "%d (%s)\n", speed_max(mddev),
4117                        mddev->sync_speed_max ? "local": "system");
4118 }
4119
4120 static ssize_t
4121 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
4122 {
4123         int max;
4124         char *e;
4125         if (strncmp(buf, "system", 6)==0) {
4126                 mddev->sync_speed_max = 0;
4127                 return len;
4128         }
4129         max = simple_strtoul(buf, &e, 10);
4130         if (buf == e || (*e && *e != '\n') || max <= 0)
4131                 return -EINVAL;
4132         mddev->sync_speed_max = max;
4133         return len;
4134 }
4135
4136 static struct md_sysfs_entry md_sync_max =
4137 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4138
4139 static ssize_t
4140 degraded_show(mddev_t *mddev, char *page)
4141 {
4142         return sprintf(page, "%d\n", mddev->degraded);
4143 }
4144 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4145
4146 static ssize_t
4147 sync_force_parallel_show(mddev_t *mddev, char *page)
4148 {
4149         return sprintf(page, "%d\n", mddev->parallel_resync);
4150 }
4151
4152 static ssize_t
4153 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
4154 {
4155         long n;
4156
4157         if (strict_strtol(buf, 10, &n))
4158                 return -EINVAL;
4159
4160         if (n != 0 && n != 1)
4161                 return -EINVAL;
4162
4163         mddev->parallel_resync = n;
4164
4165         if (mddev->sync_thread)
4166                 wake_up(&resync_wait);
4167
4168         return len;
4169 }
4170
4171 /* force parallel resync, even with shared block devices */
4172 static struct md_sysfs_entry md_sync_force_parallel =
4173 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4174        sync_force_parallel_show, sync_force_parallel_store);
4175
4176 static ssize_t
4177 sync_speed_show(mddev_t *mddev, char *page)
4178 {
4179         unsigned long resync, dt, db;
4180         if (mddev->curr_resync == 0)
4181                 return sprintf(page, "none\n");
4182         resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4183         dt = (jiffies - mddev->resync_mark) / HZ;
4184         if (!dt) dt++;
4185         db = resync - mddev->resync_mark_cnt;
4186         return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4187 }
4188
4189 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4190
4191 static ssize_t
4192 sync_completed_show(mddev_t *mddev, char *page)
4193 {
4194         unsigned long long max_sectors, resync;
4195
4196         if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4197                 return sprintf(page, "none\n");
4198
4199         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
4200                 max_sectors = mddev->resync_max_sectors;
4201         else
4202                 max_sectors = mddev->dev_sectors;
4203
4204         resync = mddev->curr_resync_completed;
4205         return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4206 }
4207
4208 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4209
4210 static ssize_t
4211 min_sync_show(mddev_t *mddev, char *page)
4212 {
4213         return sprintf(page, "%llu\n",
4214                        (unsigned long long)mddev->resync_min);
4215 }
4216 static ssize_t
4217 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
4218 {
4219         unsigned long long min;
4220         if (strict_strtoull(buf, 10, &min))
4221                 return -EINVAL;
4222         if (min > mddev->resync_max)
4223                 return -EINVAL;
4224         if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4225                 return -EBUSY;
4226
4227         /* Must be a multiple of chunk_size */
4228         if (mddev->chunk_sectors) {
4229                 sector_t temp = min;
4230                 if (sector_div(temp, mddev->chunk_sectors))
4231                         return -EINVAL;
4232         }
4233         mddev->resync_min = min;
4234
4235         return len;
4236 }
4237
4238 static struct md_sysfs_entry md_min_sync =
4239 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4240
4241 static ssize_t
4242 max_sync_show(mddev_t *mddev, char *page)
4243 {
4244         if (mddev->resync_max == MaxSector)
4245                 return sprintf(page, "max\n");
4246         else
4247                 return sprintf(page, "%llu\n",
4248                                (unsigned long long)mddev->resync_max);
4249 }
4250 static ssize_t
4251 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
4252 {
4253         if (strncmp(buf, "max", 3) == 0)
4254                 mddev->resync_max = MaxSector;
4255         else {
4256                 unsigned long long max;
4257                 if (strict_strtoull(buf, 10, &max))
4258                         return -EINVAL;
4259                 if (max < mddev->resync_min)
4260                         return -EINVAL;
4261                 if (max < mddev->resync_max &&
4262                     mddev->ro == 0 &&
4263                     test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4264                         return -EBUSY;
4265
4266                 /* Must be a multiple of chunk_size */
4267                 if (mddev->chunk_sectors) {
4268                         sector_t temp = max;
4269                         if (sector_div(temp, mddev->chunk_sectors))
4270                                 return -EINVAL;
4271                 }
4272                 mddev->resync_max = max;
4273         }
4274         wake_up(&mddev->recovery_wait);
4275         return len;
4276 }
4277
4278 static struct md_sysfs_entry md_max_sync =
4279 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4280
4281 static ssize_t
4282 suspend_lo_show(mddev_t *mddev, char *page)
4283 {
4284         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4285 }
4286
4287 static ssize_t
4288 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
4289 {
4290         char *e;
4291         unsigned long long new = simple_strtoull(buf, &e, 10);
4292         unsigned long long old = mddev->suspend_lo;
4293
4294         if (mddev->pers == NULL || 
4295             mddev->pers->quiesce == NULL)
4296                 return -EINVAL;
4297         if (buf == e || (*e && *e != '\n'))
4298                 return -EINVAL;
4299
4300         mddev->suspend_lo = new;
4301         if (new >= old)
4302                 /* Shrinking suspended region */
4303                 mddev->pers->quiesce(mddev, 2);
4304         else {
4305                 /* Expanding suspended region - need to wait */
4306                 mddev->pers->quiesce(mddev, 1);
4307                 mddev->pers->quiesce(mddev, 0);
4308         }
4309         return len;
4310 }
4311 static struct md_sysfs_entry md_suspend_lo =
4312 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4313
4314
4315 static ssize_t
4316 suspend_hi_show(mddev_t *mddev, char *page)
4317 {
4318         return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4319 }
4320
4321 static ssize_t
4322 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
4323 {
4324         char *e;
4325         unsigned long long new = simple_strtoull(buf, &e, 10);
4326         unsigned long long old = mddev->suspend_hi;
4327
4328         if (mddev->pers == NULL ||
4329             mddev->pers->quiesce == NULL)
4330                 return -EINVAL;
4331         if (buf == e || (*e && *e != '\n'))
4332                 return -EINVAL;
4333
4334         mddev->suspend_hi = new;
4335         if (new <= old)
4336                 /* Shrinking suspended region */
4337                 mddev->pers->quiesce(mddev, 2);
4338         else {
4339                 /* Expanding suspended region - need to wait */
4340                 mddev->pers->quiesce(mddev, 1);
4341                 mddev->pers->quiesce(mddev, 0);
4342         }
4343         return len;
4344 }
4345 static struct md_sysfs_entry md_suspend_hi =
4346 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4347
4348 static ssize_t
4349 reshape_position_show(mddev_t *mddev, char *page)
4350 {
4351         if (mddev->reshape_position != MaxSector)
4352                 return sprintf(page, "%llu\n",
4353                                (unsigned long long)mddev->reshape_position);
4354         strcpy(page, "none\n");
4355         return 5;
4356 }
4357
4358 static ssize_t
4359 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
4360 {
4361         char *e;
4362         unsigned long long new = simple_strtoull(buf, &e, 10);
4363         if (mddev->pers)
4364                 return -EBUSY;
4365         if (buf == e || (*e && *e != '\n'))
4366                 return -EINVAL;
4367         mddev->reshape_position = new;
4368         mddev->delta_disks = 0;
4369         mddev->new_level = mddev->level;
4370         mddev->new_layout = mddev->layout;
4371         mddev->new_chunk_sectors = mddev->chunk_sectors;
4372         return len;
4373 }
4374
4375 static struct md_sysfs_entry md_reshape_position =
4376 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4377        reshape_position_store);
4378
4379 static ssize_t
4380 array_size_show(mddev_t *mddev, char *page)
4381 {
4382         if (mddev->external_size)
4383                 return sprintf(page, "%llu\n",
4384                                (unsigned long long)mddev->array_sectors/2);
4385         else
4386                 return sprintf(page, "default\n");
4387 }
4388
4389 static ssize_t
4390 array_size_store(mddev_t *mddev, const char *buf, size_t len)
4391 {
4392         sector_t sectors;
4393
4394         if (strncmp(buf, "default", 7) == 0) {
4395                 if (mddev->pers)
4396                         sectors = mddev->pers->size(mddev, 0, 0);
4397                 else
4398                         sectors = mddev->array_sectors;
4399
4400                 mddev->external_size = 0;
4401         } else {
4402                 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4403                         return -EINVAL;
4404                 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4405                         return -E2BIG;
4406
4407                 mddev->external_size = 1;
4408         }
4409
4410         mddev->array_sectors = sectors;
4411         if (mddev->pers) {
4412                 set_capacity(mddev->gendisk, mddev->array_sectors);
4413                 revalidate_disk(mddev->gendisk);
4414         }
4415         return len;
4416 }
4417
4418 static struct md_sysfs_entry md_array_size =
4419 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4420        array_size_store);
4421
4422 static struct attribute *md_default_attrs[] = {
4423         &md_level.attr,
4424         &md_layout.attr,
4425         &md_raid_disks.attr,
4426         &md_chunk_size.attr,
4427         &md_size.attr,
4428         &md_resync_start.attr,
4429         &md_metadata.attr,
4430         &md_new_device.attr,
4431         &md_safe_delay.attr,
4432         &md_array_state.attr,
4433         &md_reshape_position.attr,
4434         &md_array_size.attr,
4435         &max_corr_read_errors.attr,
4436         NULL,
4437 };
4438
4439 static struct attribute *md_redundancy_attrs[] = {
4440         &md_scan_mode.attr,
4441         &md_mismatches.attr,
4442         &md_sync_min.attr,
4443         &md_sync_max.attr,
4444         &md_sync_speed.attr,
4445         &md_sync_force_parallel.attr,
4446         &md_sync_completed.attr,
4447         &md_min_sync.attr,
4448         &md_max_sync.attr,
4449         &md_suspend_lo.attr,
4450         &md_suspend_hi.attr,
4451         &md_bitmap.attr,
4452         &md_degraded.attr,
4453         NULL,
4454 };
4455 static struct attribute_group md_redundancy_group = {
4456         .name = NULL,
4457         .attrs = md_redundancy_attrs,
4458 };
4459
4460
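/*
 * A purely descriptive note on the two dispatchers below: they resolve the
 * md_sysfs_entry embedded in the attribute and call its show/store handler
 * with the mddev lock held.
 */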
4461 static ssize_t
4462 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4463 {
4464         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4465         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4466         ssize_t rv;
4467
4468         if (!entry->show)
4469                 return -EIO;
4470         rv = mddev_lock(mddev);
4471         if (!rv) {
4472                 rv = entry->show(mddev, page);
4473                 mddev_unlock(mddev);
4474         }
4475         return rv;
4476 }
4477
4478 static ssize_t
4479 md_attr_store(struct kobject *kobj, struct attribute *attr,
4480               const char *page, size_t length)
4481 {
4482         struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4483         mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
4484         ssize_t rv;
4485
4486         if (!entry->store)
4487                 return -EIO;
4488         if (!capable(CAP_SYS_ADMIN))
4489                 return -EACCES;
4490         rv = mddev_lock(mddev);
4491         if (mddev->hold_active == UNTIL_IOCTL)
4492                 mddev->hold_active = 0;
4493         if (!rv) {
4494                 rv = entry->store(mddev, page, length);
4495                 mddev_unlock(mddev);
4496         }
4497         return rv;
4498 }
4499
4500 static void md_free(struct kobject *ko)
4501 {
4502         mddev_t *mddev = container_of(ko, mddev_t, kobj);
4503
4504         if (mddev->sysfs_state)
4505                 sysfs_put(mddev->sysfs_state);
4506
4507         if (mddev->gendisk) {
4508                 del_gendisk(mddev->gendisk);
4509                 put_disk(mddev->gendisk);
4510         }
4511         if (mddev->queue)
4512                 blk_cleanup_queue(mddev->queue);
4513
4514         kfree(mddev);
4515 }
4516
4517 static const struct sysfs_ops md_sysfs_ops = {
4518         .show   = md_attr_show,
4519         .store  = md_attr_store,
4520 };
4521 static struct kobj_type md_ktype = {
4522         .release        = md_free,
4523         .sysfs_ops      = &md_sysfs_ops,
4524         .default_attrs  = md_default_attrs,
4525 };
4526
4527 int mdp_major = 0;
4528
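/*
 * Deferred removal of an mddev's sysfs presence, run from md_misc_wq:
 * drop the bitmap attribute group, then delete and release the kobject.
 */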
4529 static void mddev_delayed_delete(struct work_struct *ws)
4530 {
4531         mddev_t *mddev = container_of(ws, mddev_t, del_work);
4532
4533         sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4534         kobject_del(&mddev->kobj);
4535         kobject_put(&mddev->kobj);
4536 }
4537
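/*
 * Allocate the request queue, gendisk and sysfs kobject for a new md
 * device.  'name' is non-NULL when the array is created by name (see
 * add_named_array() below), in which case the name must not already be
 * in use by another array.
 */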
4538 static int md_alloc(dev_t dev, char *name)
4539 {
4540         static DEFINE_MUTEX(disks_mutex);
4541         mddev_t *mddev = mddev_find(dev);
4542         struct gendisk *disk;
4543         int partitioned;
4544         int shift;
4545         int unit;
4546         int error;
4547
4548         if (!mddev)
4549                 return -ENODEV;
4550
4551         partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4552         shift = partitioned ? MdpMinorShift : 0;
4553         unit = MINOR(mddev->unit) >> shift;
4554
4555         /* wait for any previous instance of this device to be
4556          * completely removed (mddev_delayed_delete).
4557          */
4558         flush_workqueue(md_misc_wq);
4559
4560         mutex_lock(&disks_mutex);
4561         error = -EEXIST;
4562         if (mddev->gendisk)
4563                 goto abort;
4564
4565         if (name) {
4566                 /* Need to ensure that 'name' is not a duplicate.
4567                  */
4568                 mddev_t *mddev2;
4569                 spin_lock(&all_mddevs_lock);
4570
4571                 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4572                         if (mddev2->gendisk &&
4573                             strcmp(mddev2->gendisk->disk_name, name) == 0) {
4574                                 spin_unlock(&all_mddevs_lock);
4575                                 goto abort;
4576                         }
4577                 spin_unlock(&all_mddevs_lock);
4578         }
4579
4580         error = -ENOMEM;
4581         mddev->queue = blk_alloc_queue(GFP_KERNEL);
4582         if (!mddev->queue)
4583                 goto abort;
4584         mddev->queue->queuedata = mddev;
4585
4586         blk_queue_make_request(mddev->queue, md_make_request);
4587
4588         disk = alloc_disk(1 << shift);
4589         if (!disk) {
4590                 blk_cleanup_queue(mddev->queue);
4591                 mddev->queue = NULL;
4592                 goto abort;
4593         }
4594         disk->major = MAJOR(mddev->unit);
4595         disk->first_minor = unit << shift;
4596         if (name)
4597                 strcpy(disk->disk_name, name);
4598         else if (partitioned)
4599                 sprintf(disk->disk_name, "md_d%d", unit);
4600         else
4601                 sprintf(disk->disk_name, "md%d", unit);
4602         disk->fops = &md_fops;
4603         disk->private_data = mddev;
4604         disk->queue = mddev->queue;
4605         blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4606         /* Allow extended partitions.  This makes the
4607          * 'mdp' device redundant, but we can't really
4608          * remove it now.
4609          */
4610         disk->flags |= GENHD_FL_EXT_DEVT;
4611         mddev->gendisk = disk;
4612         /* As soon as we call add_disk(), another thread could get
4613          * through to md_open, so make sure it doesn't get too far
4614          */
4615         mutex_lock(&mddev->open_mutex);
4616         add_disk(disk);
4617
4618         error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4619                                      &disk_to_dev(disk)->kobj, "%s", "md");
4620         if (error) {
4621                 /* This isn't possible, but as kobject_init_and_add is marked
4622                  * __must_check, we must do something with the result
4623                  */
4624                 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4625                        disk->disk_name);
4626                 error = 0;
4627         }
4628         if (mddev->kobj.sd &&
4629             sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4630                 printk(KERN_DEBUG "pointless warning\n");
4631         mutex_unlock(&mddev->open_mutex);
4632  abort:
4633         mutex_unlock(&disks_mutex);
4634         if (!error && mddev->kobj.sd) {
4635                 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4636                 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4637         }
4638         mddev_put(mddev);
4639         return error;
4640 }
4641
4642 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4643 {
4644         md_alloc(dev, NULL);
4645         return NULL;
4646 }
4647
4648 static int add_named_array(const char *val, struct kernel_param *kp)
4649 {
4650         /* val must be "md_*" where * is not all digits.
4651          * We allocate an array with a large free minor number, and
4652          * set the name to val.  val must not already be an active name.
4653          */
4654         int len = strlen(val);
4655         char buf[DISK_NAME_LEN];
4656
4657         while (len && val[len-1] == '\n')
4658                 len--;
4659         if (len >= DISK_NAME_LEN)
4660                 return -E2BIG;
4661         strlcpy(buf, val, len+1);
4662         if (strncmp(buf, "md_", 3) != 0)
4663                 return -EINVAL;
4664         return md_alloc(0, buf);
4665 }
4666
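/*
 * Safemode timer callback: if no writes are pending, mark the array as
 * safe and wake the md thread so the superblock can be marked clean.
 */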
4667 static void md_safemode_timeout(unsigned long data)
4668 {
4669         mddev_t *mddev = (mddev_t *) data;
4670
4671         if (!atomic_read(&mddev->writes_pending)) {
4672                 mddev->safemode = 1;
4673                 if (mddev->external)
4674                         sysfs_notify_dirent_safe(mddev->sysfs_state);
4675         }
4676         md_wakeup_thread(mddev->thread);
4677 }
4678
4679 static int start_dirty_degraded;
4680
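/*
 * Start an assembled array: analyse the superblocks if needed, load and
 * bind the personality, sanity-check the member devices, create the
 * bitmap and register the redundancy sysfs attributes.
 */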
4681 int md_run(mddev_t *mddev)
4682 {
4683         int err;
4684         mdk_rdev_t *rdev;
4685         struct mdk_personality *pers;
4686
4687         if (list_empty(&mddev->disks))
4688                 /* cannot run an array with no devices.. */
4689                 return -EINVAL;
4690
4691         if (mddev->pers)
4692                 return -EBUSY;
4693         /* Cannot run until previous stop completes properly */
4694         if (mddev->sysfs_active)
4695                 return -EBUSY;
4696
4697         /*
4698          * Analyze all RAID superblock(s)
4699          */
4700         if (!mddev->raid_disks) {
4701                 if (!mddev->persistent)
4702                         return -EINVAL;
4703                 analyze_sbs(mddev);
4704         }
4705
4706         if (mddev->level != LEVEL_NONE)
4707                 request_module("md-level-%d", mddev->level);
4708         else if (mddev->clevel[0])
4709                 request_module("md-%s", mddev->clevel);
4710
4711         /*
4712          * Drop all container device buffers, from now on
4713          * the only valid external interface is through the md
4714          * device.
4715          */
4716         list_for_each_entry(rdev, &mddev->disks, same_set) {
4717                 if (test_bit(Faulty, &rdev->flags))
4718                         continue;
4719                 sync_blockdev(rdev->bdev);
4720                 invalidate_bdev(rdev->bdev);
4721
4722                 /* perform some consistency tests on the device.
4723                  * We don't want the data to overlap the metadata.
4724                  * Internal bitmap issues have been handled elsewhere.
4725                  */
4726                 if (rdev->meta_bdev) {
4727                         /* Nothing to check */;
4728                 } else if (rdev->data_offset < rdev->sb_start) {
4729                         if (mddev->dev_sectors &&
4730                             rdev->data_offset + mddev->dev_sectors
4731                             > rdev->sb_start) {
4732                                 printk("md: %s: data overlaps metadata\n",
4733                                        mdname(mddev));
4734                                 return -EINVAL;
4735                         }
4736                 } else {
4737                         if (rdev->sb_start + rdev->sb_size/512
4738                             > rdev->data_offset) {
4739                                 printk("md: %s: metadata overlaps data\n",
4740                                        mdname(mddev));
4741                                 return -EINVAL;
4742                         }
4743                 }
4744                 sysfs_notify_dirent_safe(rdev->sysfs_state);
4745         }
4746
4747         if (mddev->bio_set == NULL)
4748                 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4749                                                sizeof(mddev_t *));
4750
4751         spin_lock(&pers_lock);
4752         pers = find_pers(mddev->level, mddev->clevel);
4753         if (!pers || !try_module_get(pers->owner)) {
4754                 spin_unlock(&pers_lock);
4755                 if (mddev->level != LEVEL_NONE)
4756                         printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4757                                mddev->level);
4758                 else
4759                         printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4760                                mddev->clevel);
4761                 return -EINVAL;
4762         }
4763         mddev->pers = pers;
4764         spin_unlock(&pers_lock);
4765         if (mddev->level != pers->level) {
4766                 mddev->level = pers->level;
4767                 mddev->new_level = pers->level;
4768         }
4769         strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4770
4771         if (mddev->reshape_position != MaxSector &&
4772             pers->start_reshape == NULL) {
4773                 /* This personality cannot handle reshaping... */
4774                 mddev->pers = NULL;
4775                 module_put(pers->owner);
4776                 return -EINVAL;
4777         }
4778
4779         if (pers->sync_request) {
4780                 /* Warn if this is a potentially silly
4781                  * configuration.
4782                  */
4783                 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4784                 mdk_rdev_t *rdev2;
4785                 int warned = 0;
4786
4787                 list_for_each_entry(rdev, &mddev->disks, same_set)
4788                         list_for_each_entry(rdev2, &mddev->disks, same_set) {
4789                                 if (rdev < rdev2 &&
4790                                     rdev->bdev->bd_contains ==
4791                                     rdev2->bdev->bd_contains) {
4792                                         printk(KERN_WARNING
4793                                                "%s: WARNING: %s appears to be"
4794                                                " on the same physical disk as"
4795                                                " %s.\n",
4796                                                mdname(mddev),
4797                                                bdevname(rdev->bdev,b),
4798                                                bdevname(rdev2->bdev,b2));
4799                                         warned = 1;
4800                                 }
4801                         }
4802
4803                 if (warned)
4804                         printk(KERN_WARNING
4805                                "True protection against single-disk"
4806                                " failure might be compromised.\n");
4807         }
4808
4809         mddev->recovery = 0;
4810         /* may be overridden by personality */
4811         mddev->resync_max_sectors = mddev->dev_sectors;
4812
4813         mddev->ok_start_degraded = start_dirty_degraded;
4814
4815         if (start_readonly && mddev->ro == 0)
4816                 mddev->ro = 2; /* read-only, but switch on first write */
4817
4818         err = mddev->pers->run(mddev);
4819         if (err)
4820                 printk(KERN_ERR "md: pers->run() failed ...\n");
4821         else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4822                 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4823                           " but 'external_size' not in effect?\n", __func__);
4824                 printk(KERN_ERR
4825                        "md: invalid array_size %llu > default size %llu\n",
4826                        (unsigned long long)mddev->array_sectors / 2,
4827                        (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4828                 err = -EINVAL;
4829                 mddev->pers->stop(mddev);
4830         }
4831         if (err == 0 && mddev->pers->sync_request) {
4832                 err = bitmap_create(mddev);
4833                 if (err) {
4834                         printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4835                                mdname(mddev), err);
4836                         mddev->pers->stop(mddev);
4837                 }
4838         }
4839         if (err) {
4840                 module_put(mddev->pers->owner);
4841                 mddev->pers = NULL;
4842                 bitmap_destroy(mddev);
4843                 return err;
4844         }
4845         if (mddev->pers->sync_request) {
4846                 if (mddev->kobj.sd &&
4847                     sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4848                         printk(KERN_WARNING
4849                                "md: cannot register extra attributes for %s\n",
4850                                mdname(mddev));
4851                 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
4852         } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4853                 mddev->ro = 0;
4854
4855         atomic_set(&mddev->writes_pending,0);
4856         atomic_set(&mddev->max_corr_read_errors,
4857                    MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4858         mddev->safemode = 0;
4859         mddev->safemode_timer.function = md_safemode_timeout;
4860         mddev->safemode_timer.data = (unsigned long) mddev;
4861         mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4862         mddev->in_sync = 1;
4863         smp_wmb();
4864         mddev->ready = 1;
4865         list_for_each_entry(rdev, &mddev->disks, same_set)
4866                 if (rdev->raid_disk >= 0)
4867                         if (sysfs_link_rdev(mddev, rdev))
4868                                 /* failure here is OK */;
4869         
4870         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4871         
4872         if (mddev->flags)
4873                 md_update_sb(mddev, 0);
4874
4875         md_new_event(mddev);
4876         sysfs_notify_dirent_safe(mddev->sysfs_state);
4877         sysfs_notify_dirent_safe(mddev->sysfs_action);
4878         sysfs_notify(&mddev->kobj, NULL, "degraded");
4879         return 0;
4880 }
4881 EXPORT_SYMBOL_GPL(md_run);
4882
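/*
 * Wrapper around md_run() that also loads the bitmap contents, wakes the
 * array's threads and publishes the new capacity of the gendisk.
 */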
4883 static int do_md_run(mddev_t *mddev)
4884 {
4885         int err;
4886
4887         err = md_run(mddev);
4888         if (err)
4889                 goto out;
4890         err = bitmap_load(mddev);
4891         if (err) {
4892                 bitmap_destroy(mddev);
4893                 goto out;
4894         }
4895
4896         md_wakeup_thread(mddev->thread);
4897         md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4898
4899         set_capacity(mddev->gendisk, mddev->array_sectors);
4900         revalidate_disk(mddev->gendisk);
4901         mddev->changed = 1;
4902         kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4903 out:
4904         return err;
4905 }
4906
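/*
 * Switch a read-only array back to read-write and kick off any pending
 * recovery or resync.
 */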
4907 static int restart_array(mddev_t *mddev)
4908 {
4909         struct gendisk *disk = mddev->gendisk;
4910
4911         /* Complain if it has no devices */
4912         if (list_empty(&mddev->disks))
4913                 return -ENXIO;
4914         if (!mddev->pers)
4915                 return -EINVAL;
4916         if (!mddev->ro)
4917                 return -EBUSY;
4918         mddev->safemode = 0;
4919         mddev->ro = 0;
4920         set_disk_ro(disk, 0);
4921         printk(KERN_INFO "md: %s switched to read-write mode.\n",
4922                 mdname(mddev));
4923         /* Kick recovery or resync if necessary */
4924         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4925         md_wakeup_thread(mddev->thread);
4926         md_wakeup_thread(mddev->sync_thread);
4927         sysfs_notify_dirent_safe(mddev->sysfs_state);
4928         return 0;
4929 }
4930
4931 /* similar to deny_write_access, but accounts for our holding a reference
4932  * to the file ourselves */
4933 static int deny_bitmap_write_access(struct file * file)
4934 {
4935         struct inode *inode = file->f_mapping->host;
4936
4937         spin_lock(&inode->i_lock);
4938         if (atomic_read(&inode->i_writecount) > 1) {
4939                 spin_unlock(&inode->i_lock);
4940                 return -ETXTBSY;
4941         }
4942         atomic_set(&inode->i_writecount, -1);
4943         spin_unlock(&inode->i_lock);
4944
4945         return 0;
4946 }
4947
4948 void restore_bitmap_write_access(struct file *file)
4949 {
4950         struct inode *inode = file->f_mapping->host;
4951
4952         spin_lock(&inode->i_lock);
4953         atomic_set(&inode->i_writecount, 1);
4954         spin_unlock(&inode->i_lock);
4955 }
4956
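/*
 * Reset every configuration field of the mddev to its default value,
 * ready for the device to be reused for a different array.
 */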
4957 static void md_clean(mddev_t *mddev)
4958 {
4959         mddev->array_sectors = 0;
4960         mddev->external_size = 0;
4961         mddev->dev_sectors = 0;
4962         mddev->raid_disks = 0;
4963         mddev->recovery_cp = 0;
4964         mddev->resync_min = 0;
4965         mddev->resync_max = MaxSector;
4966         mddev->reshape_position = MaxSector;
4967         mddev->external = 0;
4968         mddev->persistent = 0;
4969         mddev->level = LEVEL_NONE;
4970         mddev->clevel[0] = 0;
4971         mddev->flags = 0;
4972         mddev->ro = 0;
4973         mddev->metadata_type[0] = 0;
4974         mddev->chunk_sectors = 0;
4975         mddev->ctime = mddev->utime = 0;
4976         mddev->layout = 0;
4977         mddev->max_disks = 0;
4978         mddev->events = 0;
4979         mddev->can_decrease_events = 0;
4980         mddev->delta_disks = 0;
4981         mddev->new_level = LEVEL_NONE;
4982         mddev->new_layout = 0;
4983         mddev->new_chunk_sectors = 0;
4984         mddev->curr_resync = 0;
4985         mddev->resync_mismatches = 0;
4986         mddev->suspend_lo = mddev->suspend_hi = 0;
4987         mddev->sync_speed_min = mddev->sync_speed_max = 0;
4988         mddev->recovery = 0;
4989         mddev->in_sync = 0;
4990         mddev->changed = 0;
4991         mddev->degraded = 0;
4992         mddev->safemode = 0;
4993         mddev->bitmap_info.offset = 0;
4994         mddev->bitmap_info.default_offset = 0;
4995         mddev->bitmap_info.chunksize = 0;
4996         mddev->bitmap_info.daemon_sleep = 0;
4997         mddev->bitmap_info.max_write_behind = 0;
4998 }
4999
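/*
 * Stop any resync/recovery thread, flush the bitmap and, if necessary,
 * mark the array clean in the superblock.  Normally called with the
 * mddev lock held (md_stop_writes() below takes it).
 */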
5000 static void __md_stop_writes(mddev_t *mddev)
5001 {
5002         if (mddev->sync_thread) {
5003                 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5004                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5005                 reap_sync_thread(mddev);
5006         }
5007
5008         del_timer_sync(&mddev->safemode_timer);
5009
5010         bitmap_flush(mddev);
5011         md_super_wait(mddev);
5012
5013         if (!mddev->in_sync || mddev->flags) {
5014                 /* mark array as shutdown cleanly */
5015                 mddev->in_sync = 1;
5016                 md_update_sb(mddev, 1);
5017         }
5018 }
5019
5020 void md_stop_writes(mddev_t *mddev)
5021 {
5022         mddev_lock(mddev);
5023         __md_stop_writes(mddev);
5024         mddev_unlock(mddev);
5025 }
5026 EXPORT_SYMBOL_GPL(md_stop_writes);
5027
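/*
 * Tear down the personality: mark the array not ready, call pers->stop()
 * and drop the module reference.
 */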
5028 void md_stop(mddev_t *mddev)
5029 {
5030         mddev->ready = 0;
5031         mddev->pers->stop(mddev);
5032         if (mddev->pers->sync_request && mddev->to_remove == NULL)
5033                 mddev->to_remove = &md_redundancy_group;
5034         module_put(mddev->pers->owner);
5035         mddev->pers = NULL;
5036         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5037 }
5038 EXPORT_SYMBOL_GPL(md_stop);
5039
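/*
 * Stop writes and switch the array to read-only, failing with -EBUSY if
 * anyone other than the caller still holds it open.
 */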
5040 static int md_set_readonly(mddev_t *mddev, int is_open)
5041 {
5042         int err = 0;
5043         mutex_lock(&mddev->open_mutex);
5044         if (atomic_read(&mddev->openers) > is_open) {
5045                 printk("md: %s still in use.\n",mdname(mddev));
5046                 err = -EBUSY;
5047                 goto out;
5048         }
5049         if (mddev->pers) {
5050                 __md_stop_writes(mddev);
5051
5052                 err  = -ENXIO;
5053                 if (mddev->ro==1)
5054                         goto out;
5055                 mddev->ro = 1;
5056                 set_disk_ro(mddev->gendisk, 1);
5057                 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5058                 sysfs_notify_dirent_safe(mddev->sysfs_state);
5059                 err = 0;        
5060         }
5061 out:
5062         mutex_unlock(&mddev->open_mutex);
5063         return err;
5064 }
5065
5066 /* mode:
5067  *   0 - completely stop and disassemble array
5068  *   2 - stop but do not disassemble array
5069  */
5070 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
5071 {
5072         struct gendisk *disk = mddev->gendisk;
5073         mdk_rdev_t *rdev;
5074
5075         mutex_lock(&mddev->open_mutex);
5076         if (atomic_read(&mddev->openers) > is_open ||
5077             mddev->sysfs_active) {
5078                 printk("md: %s still in use.\n",mdname(mddev));
5079                 mutex_unlock(&mddev->open_mutex);
5080                 return -EBUSY;
5081         }
5082
5083         if (mddev->pers) {
5084                 if (mddev->ro)
5085                         set_disk_ro(disk, 0);
5086
5087                 __md_stop_writes(mddev);
5088                 md_stop(mddev);
5089                 mddev->queue->merge_bvec_fn = NULL;
5090                 mddev->queue->backing_dev_info.congested_fn = NULL;
5091
5092                 /* tell userspace to handle 'inactive' */
5093                 sysfs_notify_dirent_safe(mddev->sysfs_state);
5094
5095                 list_for_each_entry(rdev, &mddev->disks, same_set)
5096                         if (rdev->raid_disk >= 0)
5097                                 sysfs_unlink_rdev(mddev, rdev);
5098
5099                 set_capacity(disk, 0);
5100                 mutex_unlock(&mddev->open_mutex);
5101                 mddev->changed = 1;
5102                 revalidate_disk(disk);
5103
5104                 if (mddev->ro)
5105                         mddev->ro = 0;
5106         } else
5107                 mutex_unlock(&mddev->open_mutex);
5108         /*
5109          * Free resources if final stop
5110          */
5111         if (mode == 0) {
5112                 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5113
5114                 bitmap_destroy(mddev);
5115                 if (mddev->bitmap_info.file) {
5116                         restore_bitmap_write_access(mddev->bitmap_info.file);
5117                         fput(mddev->bitmap_info.file);
5118                         mddev->bitmap_info.file = NULL;
5119                 }
5120                 mddev->bitmap_info.offset = 0;
5121
5122                 export_array(mddev);
5123
5124                 md_clean(mddev);
5125                 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5126                 if (mddev->hold_active == UNTIL_STOP)
5127                         mddev->hold_active = 0;
5128         }
5129         blk_integrity_unregister(disk);
5130         md_new_event(mddev);
5131         sysfs_notify_dirent_safe(mddev->sysfs_state);
5132         return 0;
5133 }
5134
5135 #ifndef MODULE
5136 static void autorun_array(mddev_t *mddev)
5137 {
5138         mdk_rdev_t *rdev;
5139         int err;
5140
5141         if (list_empty(&mddev->disks))
5142                 return;
5143
5144         printk(KERN_INFO "md: running: ");
5145
5146         list_for_each_entry(rdev, &mddev->disks, same_set) {
5147                 char b[BDEVNAME_SIZE];
5148                 printk("<%s>", bdevname(rdev->bdev,b));
5149         }
5150         printk("\n");
5151
5152         err = do_md_run(mddev);
5153         if (err) {
5154                 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5155                 do_md_stop(mddev, 0, 0);
5156         }
5157 }
5158
5159 /*
5160  * let's try to run arrays based on all disks that have arrived
5161  * until now. (those are in pending_raid_disks)
5162  *
5163  * the method: pick the first pending disk, collect all disks with
5164  * the same UUID, remove all from the pending list and put them into
5165  * the 'same_array' list. Then order this list based on superblock
5166  * update time (freshest comes first), kick out 'old' disks and
5167  * compare superblocks. If everything's fine then run it.
5168  *
5169  * If "unit" is allocated, then bump its reference count
5170  */
5171 static void autorun_devices(int part)
5172 {
5173         mdk_rdev_t *rdev0, *rdev, *tmp;
5174         mddev_t *mddev;
5175         char b[BDEVNAME_SIZE];
5176
5177         printk(KERN_INFO "md: autorun ...\n");
5178         while (!list_empty(&pending_raid_disks)) {
5179                 int unit;
5180                 dev_t dev;
5181                 LIST_HEAD(candidates);
5182                 rdev0 = list_entry(pending_raid_disks.next,
5183                                          mdk_rdev_t, same_set);
5184
5185                 printk(KERN_INFO "md: considering %s ...\n",
5186                         bdevname(rdev0->bdev,b));
5187                 INIT_LIST_HEAD(&candidates);
5188                 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5189                         if (super_90_load(rdev, rdev0, 0) >= 0) {
5190                                 printk(KERN_INFO "md:  adding %s ...\n",
5191                                         bdevname(rdev->bdev,b));
5192                                 list_move(&rdev->same_set, &candidates);
5193                         }
5194                 /*
5195                  * now we have a set of devices, with all of them having
5196                  * mostly sane superblocks. It's time to allocate the
5197                  * mddev.
5198                  */
5199                 if (part) {
5200                         dev = MKDEV(mdp_major,
5201                                     rdev0->preferred_minor << MdpMinorShift);
5202                         unit = MINOR(dev) >> MdpMinorShift;
5203                 } else {
5204                         dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5205                         unit = MINOR(dev);
5206                 }
5207                 if (rdev0->preferred_minor != unit) {
5208                         printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5209                                bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5210                         break;
5211                 }
5212
5213                 md_probe(dev, NULL, NULL);
5214                 mddev = mddev_find(dev);
5215                 if (!mddev || !mddev->gendisk) {
5216                         if (mddev)
5217                                 mddev_put(mddev);
5218                         printk(KERN_ERR
5219                                 "md: cannot allocate memory for md drive.\n");
5220                         break;
5221                 }
5222                 if (mddev_lock(mddev)) 
5223                         printk(KERN_WARNING "md: %s locked, cannot run\n",
5224                                mdname(mddev));
5225                 else if (mddev->raid_disks || mddev->major_version
5226                          || !list_empty(&mddev->disks)) {
5227                         printk(KERN_WARNING 
5228                                 "md: %s already running, cannot run %s\n",
5229                                 mdname(mddev), bdevname(rdev0->bdev,b));
5230                         mddev_unlock(mddev);
5231                 } else {
5232                         printk(KERN_INFO "md: created %s\n", mdname(mddev));
5233                         mddev->persistent = 1;
5234                         rdev_for_each_list(rdev, tmp, &candidates) {
5235                                 list_del_init(&rdev->same_set);
5236                                 if (bind_rdev_to_array(rdev, mddev))
5237                                         export_rdev(rdev);
5238                         }
5239                         autorun_array(mddev);
5240                         mddev_unlock(mddev);
5241                 }
5242                 /* on success, candidates will be empty; on error
5243                  * it won't be...
5244                  */
5245                 rdev_for_each_list(rdev, tmp, &candidates) {
5246                         list_del_init(&rdev->same_set);
5247                         export_rdev(rdev);
5248                 }
5249                 mddev_put(mddev);
5250         }
5251         printk(KERN_INFO "md: ... autorun DONE.\n");
5252 }
5253 #endif /* !MODULE */
5254
5255 static int get_version(void __user * arg)
5256 {
5257         mdu_version_t ver;
5258
5259         ver.major = MD_MAJOR_VERSION;
5260         ver.minor = MD_MINOR_VERSION;
5261         ver.patchlevel = MD_PATCHLEVEL_VERSION;
5262
5263         if (copy_to_user(arg, &ver, sizeof(ver)))
5264                 return -EFAULT;
5265
5266         return 0;
5267 }
5268
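/*
 * GET_ARRAY_INFO ioctl: count the member devices and copy an
 * mdu_array_info_t describing the array out to userspace.
 */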
5269 static int get_array_info(mddev_t * mddev, void __user * arg)
5270 {
5271         mdu_array_info_t info;
5272         int nr,working,insync,failed,spare;
5273         mdk_rdev_t *rdev;
5274
5275         nr=working=insync=failed=spare=0;
5276         list_for_each_entry(rdev, &mddev->disks, same_set) {
5277                 nr++;
5278                 if (test_bit(Faulty, &rdev->flags))
5279                         failed++;
5280                 else {
5281                         working++;
5282                         if (test_bit(In_sync, &rdev->flags))
5283                                 insync++;       
5284                         else
5285                                 spare++;
5286                 }
5287         }
5288
5289         info.major_version = mddev->major_version;
5290         info.minor_version = mddev->minor_version;
5291         info.patch_version = MD_PATCHLEVEL_VERSION;
5292         info.ctime         = mddev->ctime;
5293         info.level         = mddev->level;
5294         info.size          = mddev->dev_sectors / 2;
5295         if (info.size != mddev->dev_sectors / 2) /* overflow */
5296                 info.size = -1;
5297         info.nr_disks      = nr;
5298         info.raid_disks    = mddev->raid_disks;
5299         info.md_minor      = mddev->md_minor;
5300         info.not_persistent= !mddev->persistent;
5301
5302         info.utime         = mddev->utime;
5303         info.state         = 0;
5304         if (mddev->in_sync)
5305                 info.state = (1<<MD_SB_CLEAN);
5306         if (mddev->bitmap && mddev->bitmap_info.offset)
5307                 info.state |= (1<<MD_SB_BITMAP_PRESENT);
5308         info.active_disks  = insync;
5309         info.working_disks = working;
5310         info.failed_disks  = failed;
5311         info.spare_disks   = spare;
5312
5313         info.layout        = mddev->layout;
5314         info.chunk_size    = mddev->chunk_sectors << 9;
5315
5316         if (copy_to_user(arg, &info, sizeof(info)))
5317                 return -EFAULT;
5318
5319         return 0;
5320 }
5321
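/*
 * GET_BITMAP_FILE ioctl: copy the pathname of the external bitmap file
 * (or an empty string if there is none) out to userspace.
 */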
5322 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
5323 {
5324         mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5325         char *ptr, *buf = NULL;
5326         int err = -ENOMEM;
5327
5328         if (md_allow_write(mddev))
5329                 file = kmalloc(sizeof(*file), GFP_NOIO);
5330         else
5331                 file = kmalloc(sizeof(*file), GFP_KERNEL);
5332
5333         if (!file)
5334                 goto out;
5335
5336         /* bitmap disabled, zero the first byte and copy out */
5337         if (!mddev->bitmap || !mddev->bitmap->file) {
5338                 file->pathname[0] = '\0';
5339                 goto copy_out;
5340         }
5341
5342         buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5343         if (!buf)
5344                 goto out;
5345
5346         ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
5347         if (IS_ERR(ptr))
5348                 goto out;
5349
5350         strcpy(file->pathname, ptr);
5351
5352 copy_out:
5353         err = 0;
5354         if (copy_to_user(arg, file, sizeof(*file)))
5355                 err = -EFAULT;
5356 out:
5357         kfree(buf);
5358         kfree(file);
5359         return err;
5360 }
5361
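/*
 * GET_DISK_INFO ioctl: report the major/minor numbers, raid role and
 * state flags of the member device identified by info.number.
 */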
5362 static int get_disk_info(mddev_t * mddev, void __user * arg)
5363 {
5364         mdu_disk_info_t info;
5365         mdk_rdev_t *rdev;
5366
5367         if (copy_from_user(&info, arg, sizeof(info)))
5368                 return -EFAULT;
5369
5370         rdev = find_rdev_nr(mddev, info.number);
5371         if (rdev) {
5372                 info.major = MAJOR(rdev->bdev->bd_dev);
5373                 info.minor = MINOR(rdev->bdev->bd_dev);
5374                 info.raid_disk = rdev->raid_disk;
5375                 info.state = 0;
5376                 if (test_bit(Faulty, &rdev->flags))
5377                         info.state |= (1<<MD_DISK_FAULTY);
5378                 else if (test_bit(In_sync, &rdev->flags)) {
5379                         info.state |= (1<<MD_DISK_ACTIVE);
5380                         info.state |= (1<<MD_DISK_SYNC);
5381                 }
5382                 if (test_bit(WriteMostly, &rdev->flags))
5383                         info.state |= (1<<MD_DISK_WRITEMOSTLY);
5384         } else {
5385                 info.major = info.minor = 0;
5386                 info.raid_disk = -1;
5387                 info.state = (1<<MD_DISK_REMOVED);
5388         }
5389
5390         if (copy_to_user(arg, &info, sizeof(info)))
5391                 return -EFAULT;
5392
5393         return 0;
5394 }
5395
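/*
 * ADD_NEW_DISK ioctl.  Three cases are handled: the array is still being
 * assembled (raid_disks == 0), so just load the superblock and bind the
 * device; the array is active, so hot-add the device; or the array is
 * being created without pre-existing superblocks, which is only allowed
 * for 0.90 metadata.
 */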
5396 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
5397 {
5398         char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5399         mdk_rdev_t *rdev;
5400         dev_t dev = MKDEV(info->major,info->minor);
5401
5402         if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5403                 return -EOVERFLOW;
5404
5405         if (!mddev->raid_disks) {
5406                 int err;
5407                 /* expecting a device which has a superblock */
5408                 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5409                 if (IS_ERR(rdev)) {
5410                         printk(KERN_WARNING 
5411                                 "md: md_import_device returned %ld\n",
5412                                 PTR_ERR(rdev));
5413                         return PTR_ERR(rdev);
5414                 }
5415                 if (!list_empty(&mddev->disks)) {
5416                         mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
5417                                                         mdk_rdev_t, same_set);
5418                         err = super_types[mddev->major_version]
5419                                 .load_super(rdev, rdev0, mddev->minor_version);
5420                         if (err < 0) {
5421                                 printk(KERN_WARNING 
5422                                         "md: %s has different UUID to %s\n",
5423                                         bdevname(rdev->bdev,b), 
5424                                         bdevname(rdev0->bdev,b2));
5425                                 export_rdev(rdev);
5426                                 return -EINVAL;
5427                         }
5428                 }
5429                 err = bind_rdev_to_array(rdev, mddev);
5430                 if (err)
5431                         export_rdev(rdev);
5432                 return err;
5433         }
5434
5435         /*
5436          * add_new_disk can be used once the array is assembled
5437          * to add "hot spares".  They must already have a superblock
5438          * written
5439          */
5440         if (mddev->pers) {
5441                 int err;
5442                 if (!mddev->pers->hot_add_disk) {
5443                         printk(KERN_WARNING 
5444                                 "%s: personality does not support diskops!\n",
5445                                mdname(mddev));
5446                         return -EINVAL;
5447                 }
5448                 if (mddev->persistent)
5449                         rdev = md_import_device(dev, mddev->major_version,
5450                                                 mddev->minor_version);
5451                 else
5452                         rdev = md_import_device(dev, -1, -1);
5453                 if (IS_ERR(rdev)) {
5454                         printk(KERN_WARNING 
5455                                 "md: md_import_device returned %ld\n",
5456                                 PTR_ERR(rdev));
5457                         return PTR_ERR(rdev);
5458                 }
5459                 /* set saved_raid_disk if appropriate */
5460                 if (!mddev->persistent) {
5461                         if (info->state & (1<<MD_DISK_SYNC)  &&
5462                             info->raid_disk < mddev->raid_disks) {
5463                                 rdev->raid_disk = info->raid_disk;
5464                                 set_bit(In_sync, &rdev->flags);
5465                         } else
5466                                 rdev->raid_disk = -1;
5467                 } else
5468                         super_types[mddev->major_version].
5469                                 validate_super(mddev, rdev);
5470                 if ((info->state & (1<<MD_DISK_SYNC)) &&
5471                     (!test_bit(In_sync, &rdev->flags) ||
5472                      rdev->raid_disk != info->raid_disk)) {
5473                         /* This was a hot-add request, but the events don't
5474                          * match, so reject it.
5475                          */
5476                         export_rdev(rdev);
5477                         return -EINVAL;
5478                 }
5479
5480                 if (test_bit(In_sync, &rdev->flags))
5481                         rdev->saved_raid_disk = rdev->raid_disk;
5482                 else
5483                         rdev->saved_raid_disk = -1;
5484
5485                 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5486                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5487                         set_bit(WriteMostly, &rdev->flags);
5488                 else
5489                         clear_bit(WriteMostly, &rdev->flags);
5490
5491                 rdev->raid_disk = -1;
5492                 err = bind_rdev_to_array(rdev, mddev);
5493                 if (!err && !mddev->pers->hot_remove_disk) {
5494                         /* If there is hot_add_disk but no hot_remove_disk,
5495                          * then added disks are for geometry changes
5496                          * and should be added immediately.
5497                          */
5498                         super_types[mddev->major_version].
5499                                 validate_super(mddev, rdev);
5500                         err = mddev->pers->hot_add_disk(mddev, rdev);
5501                         if (err)
5502                                 unbind_rdev_from_array(rdev);
5503                 }
5504                 if (err)
5505                         export_rdev(rdev);
5506                 else
5507                         sysfs_notify_dirent_safe(rdev->sysfs_state);
5508
5509                 md_update_sb(mddev, 1);
5510                 if (mddev->degraded)
5511                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5512                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5513                 if (!err)
5514                         md_new_event(mddev);
5515                 md_wakeup_thread(mddev->thread);
5516                 return err;
5517         }
5518
5519         /* otherwise, add_new_disk is only allowed
5520          * for major_version==0 superblocks
5521          */
5522         if (mddev->major_version != 0) {
5523                 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5524                        mdname(mddev));
5525                 return -EINVAL;
5526         }
5527
5528         if (!(info->state & (1<<MD_DISK_FAULTY))) {
5529                 int err;
5530                 rdev = md_import_device(dev, -1, 0);
5531                 if (IS_ERR(rdev)) {
5532                         printk(KERN_WARNING 
5533                                 "md: error, md_import_device() returned %ld\n",
5534                                 PTR_ERR(rdev));
5535                         return PTR_ERR(rdev);
5536                 }
5537                 rdev->desc_nr = info->number;
5538                 if (info->raid_disk < mddev->raid_disks)
5539                         rdev->raid_disk = info->raid_disk;
5540                 else
5541                         rdev->raid_disk = -1;
5542
5543                 if (rdev->raid_disk < mddev->raid_disks)
5544                         if (info->state & (1<<MD_DISK_SYNC))
5545                                 set_bit(In_sync, &rdev->flags);
5546
5547                 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5548                         set_bit(WriteMostly, &rdev->flags);
5549
5550                 if (!mddev->persistent) {
5551                         printk(KERN_INFO "md: nonpersistent superblock ...\n");
5552                         rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5553                 } else
5554                         rdev->sb_start = calc_dev_sboffset(rdev);
5555                 rdev->sectors = rdev->sb_start;
5556
5557                 err = bind_rdev_to_array(rdev, mddev);
5558                 if (err) {
5559                         export_rdev(rdev);
5560                         return err;
5561                 }
5562         }
5563
5564         return 0;
5565 }
5566
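/*
 * HOT_REMOVE_DISK ioctl: detach a device that is no longer an active
 * member of the array and update the superblocks.
 */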
5567 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
5568 {
5569         char b[BDEVNAME_SIZE];
5570         mdk_rdev_t *rdev;
5571
5572         rdev = find_rdev(mddev, dev);
5573         if (!rdev)
5574                 return -ENXIO;
5575
5576         if (rdev->raid_disk >= 0)
5577                 goto busy;
5578
5579         kick_rdev_from_array(rdev);
5580         md_update_sb(mddev, 1);
5581         md_new_event(mddev);
5582
5583         return 0;
5584 busy:
5585         printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5586                 bdevname(rdev->bdev,b), mdname(mddev));
5587         return -EBUSY;
5588 }
5589
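/*
 * HOT_ADD_DISK ioctl: add a new device as a spare to a running 0.90
 * array and kick recovery so it can be brought into service if needed.
 */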
5590 static int hot_add_disk(mddev_t * mddev, dev_t dev)
5591 {
5592         char b[BDEVNAME_SIZE];
5593         int err;
5594         mdk_rdev_t *rdev;
5595
5596         if (!mddev->pers)
5597                 return -ENODEV;
5598
5599         if (mddev->major_version != 0) {
5600                 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5601                         " version-0 superblocks.\n",
5602                         mdname(mddev));
5603                 return -EINVAL;
5604         }
5605         if (!mddev->pers->hot_add_disk) {
5606                 printk(KERN_WARNING 
5607                         "%s: personality does not support diskops!\n",
5608                         mdname(mddev));
5609                 return -EINVAL;
5610         }
5611
5612         rdev = md_import_device(dev, -1, 0);
5613         if (IS_ERR(rdev)) {
5614                 printk(KERN_WARNING 
5615                         "md: error, md_import_device() returned %ld\n",
5616                         PTR_ERR(rdev));
5617                 return -EINVAL;
5618         }
5619
5620         if (mddev->persistent)
5621                 rdev->sb_start = calc_dev_sboffset(rdev);
5622         else
5623                 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5624
5625         rdev->sectors = rdev->sb_start;
5626
5627         if (test_bit(Faulty, &rdev->flags)) {
5628                 printk(KERN_WARNING 
5629                         "md: can not hot-add faulty %s disk to %s!\n",
5630                         bdevname(rdev->bdev,b), mdname(mddev));
5631                 err = -EINVAL;
5632                 goto abort_export;
5633         }
5634         clear_bit(In_sync, &rdev->flags);
5635         rdev->desc_nr = -1;
5636         rdev->saved_raid_disk = -1;
5637         err = bind_rdev_to_array(rdev, mddev);
5638         if (err)
5639                 goto abort_export;
5640
5641         /*
5642          * The rest had better be atomic; we can have disk failures
5643          * noticed in interrupt contexts ...
5644          */
5645
5646         rdev->raid_disk = -1;
5647
5648         md_update_sb(mddev, 1);
5649
5650         /*
5651          * Kick recovery, maybe this spare has to be added to the
5652          * array immediately.
5653          */
5654         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5655         md_wakeup_thread(mddev->thread);
5656         md_new_event(mddev);
5657         return 0;
5658
5659 abort_export:
5660         export_rdev(rdev);
5661         return err;
5662 }
5663
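/*
 * SET_BITMAP_FILE ioctl: attach (fd >= 0) or remove (fd < 0) a
 * file-backed write-intent bitmap, quiescing the array while the
 * change is made.
 */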
5664 static int set_bitmap_file(mddev_t *mddev, int fd)
5665 {
5666         int err;
5667
5668         if (mddev->pers) {
5669                 if (!mddev->pers->quiesce)
5670                         return -EBUSY;
5671                 if (mddev->recovery || mddev->sync_thread)
5672                         return -EBUSY;
5673                 /* we should be able to change the bitmap.. */
5674         }
5675
5676
5677         if (fd >= 0) {
5678                 if (mddev->bitmap)
5679                         return -EEXIST; /* cannot add when bitmap is present */
5680                 mddev->bitmap_info.file = fget(fd);
5681
5682                 if (mddev->bitmap_info.file == NULL) {
5683                         printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5684                                mdname(mddev));
5685                         return -EBADF;
5686                 }
5687
5688                 err = deny_bitmap_write_access(mddev->bitmap_info.file);
5689                 if (err) {
5690                         printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5691                                mdname(mddev));
5692                         fput(mddev->bitmap_info.file);
5693                         mddev->bitmap_info.file = NULL;
5694                         return err;
5695                 }
5696                 mddev->bitmap_info.offset = 0; /* file overrides offset */
5697         } else if (mddev->bitmap == NULL)
5698                 return -ENOENT; /* cannot remove what isn't there */
5699         err = 0;
5700         if (mddev->pers) {
5701                 mddev->pers->quiesce(mddev, 1);
5702                 if (fd >= 0) {
5703                         err = bitmap_create(mddev);
5704                         if (!err)
5705                                 err = bitmap_load(mddev);
5706                 }
5707                 if (fd < 0 || err) {
5708                         bitmap_destroy(mddev);
5709                         fd = -1; /* make sure to put the file */
5710                 }
5711                 mddev->pers->quiesce(mddev, 0);
5712         }
5713         if (fd < 0) {
5714                 if (mddev->bitmap_info.file) {
5715                         restore_bitmap_write_access(mddev->bitmap_info.file);
5716                         fput(mddev->bitmap_info.file);
5717                 }
5718                 mddev->bitmap_info.file = NULL;
5719         }
5720
5721         return err;
5722 }
5723
5724 /*
5725  * set_array_info is used in two different ways.
5726  * The original usage is when creating a new array.
5727  * In this usage, raid_disks is > 0 and it together with
5728  *  level, size, not_persistent,layout,chunksize determine the
5729  *  shape of the array.
5730  *  This will always create an array with a type-0.90.0 superblock.
5731  * The newer usage is when assembling an array.
5732  *  In this case raid_disks will be 0, and the major_version field is
5733  *  use to determine which style super-blocks are to be found on the devices.
5734  *  The minor and patch _version numbers are also kept incase the
5735  *  super_block handler wishes to interpret them.
5736  */
5737 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5738 {
5739
5740         if (info->raid_disks == 0) {
5741                 /* just setting version number for superblock loading */
5742                 if (info->major_version < 0 ||
5743                     info->major_version >= ARRAY_SIZE(super_types) ||
5744                     super_types[info->major_version].name == NULL) {
5745                         /* maybe try to auto-load a module? */
5746                         printk(KERN_INFO 
5747                                 "md: superblock version %d not known\n",
5748                                 info->major_version);
5749                         return -EINVAL;
5750                 }
5751                 mddev->major_version = info->major_version;
5752                 mddev->minor_version = info->minor_version;
5753                 mddev->patch_version = info->patch_version;
5754                 mddev->persistent = !info->not_persistent;
5755                 /* ensure mddev_put doesn't delete this now that there
5756                  * is some minimal configuration.
5757                  */
5758                 mddev->ctime         = get_seconds();
5759                 return 0;
5760         }
5761         mddev->major_version = MD_MAJOR_VERSION;
5762         mddev->minor_version = MD_MINOR_VERSION;
5763         mddev->patch_version = MD_PATCHLEVEL_VERSION;
5764         mddev->ctime         = get_seconds();
5765
5766         mddev->level         = info->level;
5767         mddev->clevel[0]     = 0;
5768         mddev->dev_sectors   = 2 * (sector_t)info->size;
5769         mddev->raid_disks    = info->raid_disks;
5770         /* don't set md_minor, it is determined by which /dev/md* was
5771          * opened
5772          */
5773         if (info->state & (1<<MD_SB_CLEAN))
5774                 mddev->recovery_cp = MaxSector;
5775         else
5776                 mddev->recovery_cp = 0;
5777         mddev->persistent    = ! info->not_persistent;
5778         mddev->external      = 0;
5779
5780         mddev->layout        = info->layout;
5781         mddev->chunk_sectors = info->chunk_size >> 9;
5782
5783         mddev->max_disks     = MD_SB_DISKS;
5784
5785         if (mddev->persistent)
5786                 mddev->flags         = 0;
5787         set_bit(MD_CHANGE_DEVS, &mddev->flags);
5788
5789         mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5790         mddev->bitmap_info.offset = 0;
5791
5792         mddev->reshape_position = MaxSector;
5793
5794         /*
5795          * Generate a 128 bit UUID
5796          */
5797         get_random_bytes(mddev->uuid, 16);
5798
5799         mddev->new_level = mddev->level;
5800         mddev->new_chunk_sectors = mddev->chunk_sectors;
5801         mddev->new_layout = mddev->layout;
5802         mddev->delta_disks = 0;
5803
5804         return 0;
5805 }
5806
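/*
 * md_set_array_sectors() - record the exported size of the array.
 * If userspace has pinned an explicit size via 'external_size', that
 * value is left untouched; otherwise array_sectors is updated to the
 * size computed by the personality.
 */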
5807 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5808 {
5809         WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5810
5811         if (mddev->external_size)
5812                 return;
5813
5814         mddev->array_sectors = array_sectors;
5815 }
5816 EXPORT_SYMBOL(md_set_array_sectors);
5817
5818 static int update_size(mddev_t *mddev, sector_t num_sectors)
5819 {
5820         mdk_rdev_t *rdev;
5821         int rv;
5822         int fit = (num_sectors == 0);
5823
5824         if (mddev->pers->resize == NULL)
5825                 return -EINVAL;
5826         /* The "num_sectors" is the number of sectors of each device that
5827          * is used.  This can only make sense for arrays with redundancy.
5828          * linear and raid0 always use whatever space is available. We can only
5829          * consider changing this number if no resync or reconstruction is
5830          * happening, and if the new size is acceptable. It must fit before the
5831          * sb_start or, if that is <data_offset, it must fit before the size
5832          * of each device.  If num_sectors is zero, we find the largest size
5833          * that fits.
5834          */
5835         if (mddev->sync_thread)
5836                 return -EBUSY;
5837         if (mddev->bitmap)
5838                 /* Sorry, cannot grow a bitmap yet, just remove it,
5839                  * grow, and re-add.
5840                  */
5841                 return -EBUSY;
5842         list_for_each_entry(rdev, &mddev->disks, same_set) {
5843                 sector_t avail = rdev->sectors;
5844
5845                 if (fit && (num_sectors == 0 || num_sectors > avail))
5846                         num_sectors = avail;
5847                 if (avail < num_sectors)
5848                         return -ENOSPC;
5849         }
5850         rv = mddev->pers->resize(mddev, num_sectors);
5851         if (!rv)
5852                 revalidate_disk(mddev->gendisk);
5853         return rv;
5854 }
5855
5856 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5857 {
5858         int rv;
5859         /* change the number of raid disks */
5860         if (mddev->pers->check_reshape == NULL)
5861                 return -EINVAL;
5862         if (raid_disks <= 0 ||
5863             (mddev->max_disks && raid_disks >= mddev->max_disks))
5864                 return -EINVAL;
5865         if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5866                 return -EBUSY;
5867         mddev->delta_disks = raid_disks - mddev->raid_disks;
5868
5869         rv = mddev->pers->check_reshape(mddev);
5870         if (rv < 0)
5871                 mddev->delta_disks = 0;
5872         return rv;
5873 }
5874
5875
5876 /*
5877  * update_array_info is used to change the configuration of an
5878  * on-line array.
5879  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
5880  * fields in the info are checked against the array.
5881  * Any differences that cannot be handled will cause an error.
5882  * Normally, only one change can be managed at a time.
5883  */
5884 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5885 {
5886         int rv = 0;
5887         int cnt = 0;
5888         int state = 0;
5889
5890         /* calculate expected state, ignoring low bits */
5891         if (mddev->bitmap && mddev->bitmap_info.offset)
5892                 state |= (1 << MD_SB_BITMAP_PRESENT);
5893
5894         if (mddev->major_version != info->major_version ||
5895             mddev->minor_version != info->minor_version ||
5896 /*          mddev->patch_version != info->patch_version || */
5897             mddev->ctime         != info->ctime         ||
5898             mddev->level         != info->level         ||
5899 /*          mddev->layout        != info->layout        || */
5900             !mddev->persistent   != info->not_persistent||
5901             mddev->chunk_sectors != info->chunk_size >> 9 ||
5902             /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5903             ((state^info->state) & 0xfffffe00)
5904                 )
5905                 return -EINVAL;
5906         /* Check there is only one change */
5907         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5908                 cnt++;
5909         if (mddev->raid_disks != info->raid_disks)
5910                 cnt++;
5911         if (mddev->layout != info->layout)
5912                 cnt++;
5913         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5914                 cnt++;
5915         if (cnt == 0)
5916                 return 0;
5917         if (cnt > 1)
5918                 return -EINVAL;
5919
5920         if (mddev->layout != info->layout) {
5921                 /* Change layout
5922                  * we don't need to do anything at the md level, the
5923                  * personality will take care of it all.
5924                  */
5925                 if (mddev->pers->check_reshape == NULL)
5926                         return -EINVAL;
5927                 else {
5928                         mddev->new_layout = info->layout;
5929                         rv = mddev->pers->check_reshape(mddev);
5930                         if (rv)
5931                                 mddev->new_layout = mddev->layout;
5932                         return rv;
5933                 }
5934         }
5935         if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5936                 rv = update_size(mddev, (sector_t)info->size * 2);
5937
5938         if (mddev->raid_disks    != info->raid_disks)
5939                 rv = update_raid_disks(mddev, info->raid_disks);
5940
5941         if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5942                 if (mddev->pers->quiesce == NULL)
5943                         return -EINVAL;
5944                 if (mddev->recovery || mddev->sync_thread)
5945                         return -EBUSY;
5946                 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5947                         /* add the bitmap */
5948                         if (mddev->bitmap)
5949                                 return -EEXIST;
5950                         if (mddev->bitmap_info.default_offset == 0)
5951                                 return -EINVAL;
5952                         mddev->bitmap_info.offset =
5953                                 mddev->bitmap_info.default_offset;
5954                         mddev->pers->quiesce(mddev, 1);
5955                         rv = bitmap_create(mddev);
5956                         if (!rv)
5957                                 rv = bitmap_load(mddev);
5958                         if (rv)
5959                                 bitmap_destroy(mddev);
5960                         mddev->pers->quiesce(mddev, 0);
5961                 } else {
5962                         /* remove the bitmap */
5963                         if (!mddev->bitmap)
5964                                 return -ENOENT;
5965                         if (mddev->bitmap->file)
5966                                 return -EINVAL;
5967                         mddev->pers->quiesce(mddev, 1);
5968                         bitmap_destroy(mddev);
5969                         mddev->pers->quiesce(mddev, 0);
5970                         mddev->bitmap_info.offset = 0;
5971                 }
5972         }
5973         md_update_sb(mddev, 1);
5974         return rv;
5975 }
5976
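/*
 * set_disk_faulty() - handle the SET_DISK_FAULTY ioctl.
 * md_error() is asked to fail the device; if that did not actually
 * result in the Faulty flag being set, -EBUSY is returned so the
 * caller can see that the request had no effect.
 */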
5977 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5978 {
5979         mdk_rdev_t *rdev;
5980
5981         if (mddev->pers == NULL)
5982                 return -ENODEV;
5983
5984         rdev = find_rdev(mddev, dev);
5985         if (!rdev)
5986                 return -ENODEV;
5987
5988         md_error(mddev, rdev);
5989         if (!test_bit(Faulty, &rdev->flags))
5990                 return -EBUSY;
5991         return 0;
5992 }
5993
5994 /*
5995  * We have a problem here: there is no easy way to give a CHS
5996  * virtual geometry. We currently pretend that we have 2 heads and
5997  * 4 sectors (with a BIG number of cylinders...). This drives
5998  * dosfs just mad... ;-)
5999  */
6000 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6001 {
6002         mddev_t *mddev = bdev->bd_disk->private_data;
6003
6004         geo->heads = 2;
6005         geo->sectors = 4;
6006         geo->cylinders = mddev->array_sectors / 8;
6007         return 0;
6008 }
6009
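/*
 * md_ioctl() - dispatcher for the MD block-device ioctls.
 * Commands are handled in four groups: driver-wide requests that need
 * no array, SET_ARRAY_INFO which creates or reconfigures an array,
 * commands that are safe on a read-only array, and finally commands
 * that modify the superblock and therefore require a writable array.
 */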
6010 static int md_ioctl(struct block_device *bdev, fmode_t mode,
6011                         unsigned int cmd, unsigned long arg)
6012 {
6013         int err = 0;
6014         void __user *argp = (void __user *)arg;
6015         mddev_t *mddev = NULL;
6016         int ro;
6017
6018         if (!capable(CAP_SYS_ADMIN))
6019                 return -EACCES;
6020
6021         /*
6022          * Commands dealing with the RAID driver but not any
6023          * particular array:
6024          */
6025         switch (cmd)
6026         {
6027                 case RAID_VERSION:
6028                         err = get_version(argp);
6029                         goto done;
6030
6031                 case PRINT_RAID_DEBUG:
6032                         err = 0;
6033                         md_print_devices();
6034                         goto done;
6035
6036 #ifndef MODULE
6037                 case RAID_AUTORUN:
6038                         err = 0;
6039                         autostart_arrays(arg);
6040                         goto done;
6041 #endif
6042                 default:;
6043         }
6044
6045         /*
6046          * Commands creating/starting a new array:
6047          */
6048
6049         mddev = bdev->bd_disk->private_data;
6050
6051         if (!mddev) {
6052                 BUG();
6053                 goto abort;
6054         }
6055
6056         err = mddev_lock(mddev);
6057         if (err) {
6058                 printk(KERN_INFO 
6059                         "md: ioctl lock interrupted, reason %d, cmd %d\n",
6060                         err, cmd);
6061                 goto abort;
6062         }
6063
6064         switch (cmd)
6065         {
6066                 case SET_ARRAY_INFO:
6067                         {
6068                                 mdu_array_info_t info;
6069                                 if (!arg)
6070                                         memset(&info, 0, sizeof(info));
6071                                 else if (copy_from_user(&info, argp, sizeof(info))) {
6072                                         err = -EFAULT;
6073                                         goto abort_unlock;
6074                                 }
6075                                 if (mddev->pers) {
6076                                         err = update_array_info(mddev, &info);
6077                                         if (err) {
6078                                                 printk(KERN_WARNING "md: couldn't update"
6079                                                        " array info. %d\n", err);
6080                                                 goto abort_unlock;
6081                                         }
6082                                         goto done_unlock;
6083                                 }
6084                                 if (!list_empty(&mddev->disks)) {
6085                                         printk(KERN_WARNING
6086                                                "md: array %s already has disks!\n",
6087                                                mdname(mddev));
6088                                         err = -EBUSY;
6089                                         goto abort_unlock;
6090                                 }
6091                                 if (mddev->raid_disks) {
6092                                         printk(KERN_WARNING
6093                                                "md: array %s already initialised!\n",
6094                                                mdname(mddev));
6095                                         err = -EBUSY;
6096                                         goto abort_unlock;
6097                                 }
6098                                 err = set_array_info(mddev, &info);
6099                                 if (err) {
6100                                         printk(KERN_WARNING "md: couldn't set"
6101                                                " array info. %d\n", err);
6102                                         goto abort_unlock;
6103                                 }
6104                         }
6105                         goto done_unlock;
6106
6107                 default:;
6108         }
6109
6110         /*
6111          * Commands querying/configuring an existing array:
6112          */
6113         /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6114          * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6115         if ((!mddev->raid_disks && !mddev->external)
6116             && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6117             && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6118             && cmd != GET_BITMAP_FILE) {
6119                 err = -ENODEV;
6120                 goto abort_unlock;
6121         }
6122
6123         /*
6124          * Commands even a read-only array can execute:
6125          */
6126         switch (cmd)
6127         {
6128                 case GET_ARRAY_INFO:
6129                         err = get_array_info(mddev, argp);
6130                         goto done_unlock;
6131
6132                 case GET_BITMAP_FILE:
6133                         err = get_bitmap_file(mddev, argp);
6134                         goto done_unlock;
6135
6136                 case GET_DISK_INFO:
6137                         err = get_disk_info(mddev, argp);
6138                         goto done_unlock;
6139
6140                 case RESTART_ARRAY_RW:
6141                         err = restart_array(mddev);
6142                         goto done_unlock;
6143
6144                 case STOP_ARRAY:
6145                         err = do_md_stop(mddev, 0, 1);
6146                         goto done_unlock;
6147
6148                 case STOP_ARRAY_RO:
6149                         err = md_set_readonly(mddev, 1);
6150                         goto done_unlock;
6151
6152                 case BLKROSET:
6153                         if (get_user(ro, (int __user *)(arg))) {
6154                                 err = -EFAULT;
6155                                 goto done_unlock;
6156                         }
6157                         err = -EINVAL;
6158
6159                         /* if the bdev is going read-only, the value of mddev->ro
6160                          * does not matter; no writes are coming
6161                          */
6162                         if (ro)
6163                                 goto done_unlock;
6164
6165                         /* are we already prepared for writes? */
6166                         if (mddev->ro != 1)
6167                                 goto done_unlock;
6168
6169                         /* transitioning to readauto need only happen for
6170                          * arrays that call md_write_start
6171                          */
6172                         if (mddev->pers) {
6173                                 err = restart_array(mddev);
6174                                 if (err == 0) {
6175                                         mddev->ro = 2;
6176                                         set_disk_ro(mddev->gendisk, 0);
6177                                 }
6178                         }
6179                         goto done_unlock;
6180         }
6181
6182         /*
6183          * The remaining ioctls are changing the state of the
6184          * superblock, so we do not allow them on read-only arrays.
6185          * However non-MD ioctls (e.g. get-size) will still come through
6186          * here and hit the 'default' below, so only disallow
6187          * 'md' ioctls, and switch to rw mode if started auto-readonly.
6188          */
6189         if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
6190                 if (mddev->ro == 2) {
6191                         mddev->ro = 0;
6192                         sysfs_notify_dirent_safe(mddev->sysfs_state);
6193                         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6194                         md_wakeup_thread(mddev->thread);
6195                 } else {
6196                         err = -EROFS;
6197                         goto abort_unlock;
6198                 }
6199         }
6200
6201         switch (cmd)
6202         {
6203                 case ADD_NEW_DISK:
6204                 {
6205                         mdu_disk_info_t info;
6206                         if (copy_from_user(&info, argp, sizeof(info)))
6207                                 err = -EFAULT;
6208                         else
6209                                 err = add_new_disk(mddev, &info);
6210                         goto done_unlock;
6211                 }
6212
6213                 case HOT_REMOVE_DISK:
6214                         err = hot_remove_disk(mddev, new_decode_dev(arg));
6215                         goto done_unlock;
6216
6217                 case HOT_ADD_DISK:
6218                         err = hot_add_disk(mddev, new_decode_dev(arg));
6219                         goto done_unlock;
6220
6221                 case SET_DISK_FAULTY:
6222                         err = set_disk_faulty(mddev, new_decode_dev(arg));
6223                         goto done_unlock;
6224
6225                 case RUN_ARRAY:
6226                         err = do_md_run(mddev);
6227                         goto done_unlock;
6228
6229                 case SET_BITMAP_FILE:
6230                         err = set_bitmap_file(mddev, (int)arg);
6231                         goto done_unlock;
6232
6233                 default:
6234                         err = -EINVAL;
6235                         goto abort_unlock;
6236         }
6237
6238 done_unlock:
6239 abort_unlock:
6240         if (mddev->hold_active == UNTIL_IOCTL &&
6241             err != -EINVAL)
6242                 mddev->hold_active = 0;
6243         mddev_unlock(mddev);
6244
6245         return err;
6246 done:
6247         if (err)
6248                 MD_BUG();
6249 abort:
6250         return err;
6251 }
6252 #ifdef CONFIG_COMPAT
6253 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6254                     unsigned int cmd, unsigned long arg)
6255 {
6256         switch (cmd) {
6257         case HOT_REMOVE_DISK:
6258         case HOT_ADD_DISK:
6259         case SET_DISK_FAULTY:
6260         case SET_BITMAP_FILE:
6261                 /* These take an integer arg; do not convert it */
6262                 break;
6263         default:
6264                 arg = (unsigned long)compat_ptr(arg);
6265                 break;
6266         }
6267
6268         return md_ioctl(bdev, mode, cmd, arg);
6269 }
6270 #endif /* CONFIG_COMPAT */
6271
6272 static int md_open(struct block_device *bdev, fmode_t mode)
6273 {
6274         /*
6275          * Succeed if we can lock the mddev, which confirms that
6276          * it isn't being stopped right now.
6277          */
6278         mddev_t *mddev = mddev_find(bdev->bd_dev);
6279         int err;
6280
6281         if (mddev->gendisk != bdev->bd_disk) {
6282                 /* we are racing with mddev_put which is discarding this
6283                  * bd_disk.
6284                  */
6285                 mddev_put(mddev);
6286                 /* Wait until bdev->bd_disk is definitely gone */
6287                 flush_workqueue(md_misc_wq);
6288                 /* Then retry the open from the top */
6289                 return -ERESTARTSYS;
6290         }
6291         BUG_ON(mddev != bdev->bd_disk->private_data);
6292
6293         if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6294                 goto out;
6295
6296         err = 0;
6297         atomic_inc(&mddev->openers);
6298         mutex_unlock(&mddev->open_mutex);
6299
6300         check_disk_change(bdev);
6301  out:
6302         return err;
6303 }
6304
6305 static int md_release(struct gendisk *disk, fmode_t mode)
6306 {
6307         mddev_t *mddev = disk->private_data;
6308
6309         BUG_ON(!mddev);
6310         atomic_dec(&mddev->openers);
6311         mddev_put(mddev);
6312
6313         return 0;
6314 }
6315
6316 static int md_media_changed(struct gendisk *disk)
6317 {
6318         mddev_t *mddev = disk->private_data;
6319
6320         return mddev->changed;
6321 }
6322
6323 static int md_revalidate(struct gendisk *disk)
6324 {
6325         mddev_t *mddev = disk->private_data;
6326
6327         mddev->changed = 0;
6328         return 0;
6329 }
6330 static const struct block_device_operations md_fops =
6331 {
6332         .owner          = THIS_MODULE,
6333         .open           = md_open,
6334         .release        = md_release,
6335         .ioctl          = md_ioctl,
6336 #ifdef CONFIG_COMPAT
6337         .compat_ioctl   = md_compat_ioctl,
6338 #endif
6339         .getgeo         = md_getgeo,
6340         .media_changed  = md_media_changed,
6341         .revalidate_disk= md_revalidate,
6342 };
6343
6344 static int md_thread(void * arg)
6345 {
6346         mdk_thread_t *thread = arg;
6347
6348         /*
6349          * md_thread is a 'system-thread'; its priority should be very
6350          * high. We avoid resource deadlocks individually in each
6351          * raid personality. (RAID5 does preallocation) We also use RR and
6352          * the very same RT priority as kswapd, thus we will never get
6353          * into a priority inversion deadlock.
6354          *
6355          * we definitely have to have equal or higher priority than
6356          * bdflush, otherwise bdflush will deadlock if there are too
6357          * many dirty RAID5 blocks.
6358          */
6359
6360         allow_signal(SIGKILL);
6361         while (!kthread_should_stop()) {
6362
6363                 /* We need to wait INTERRUPTIBLE so that
6364                  * we don't add to the load-average.
6365                  * That means we need to be sure no signals are
6366                  * pending
6367                  */
6368                 if (signal_pending(current))
6369                         flush_signals(current);
6370
6371                 wait_event_interruptible_timeout
6372                         (thread->wqueue,
6373                          test_bit(THREAD_WAKEUP, &thread->flags)
6374                          || kthread_should_stop(),
6375                          thread->timeout);
6376
6377                 clear_bit(THREAD_WAKEUP, &thread->flags);
6378                 if (!kthread_should_stop())
6379                         thread->run(thread->mddev);
6380         }
6381
6382         return 0;
6383 }
6384
6385 void md_wakeup_thread(mdk_thread_t *thread)
6386 {
6387         if (thread) {
6388                 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
6389                 set_bit(THREAD_WAKEUP, &thread->flags);
6390                 wake_up(&thread->wqueue);
6391         }
6392 }
6393
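/*
 * md_register_thread() - create the per-array service thread.
 * The thread sleeps on its waitqueue until md_wakeup_thread() sets
 * THREAD_WAKEUP (or the timeout expires) and then calls the supplied
 * run() function with the owning mddev.
 */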
6394 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
6395                                  const char *name)
6396 {
6397         mdk_thread_t *thread;
6398
6399         thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
6400         if (!thread)
6401                 return NULL;
6402
6403         init_waitqueue_head(&thread->wqueue);
6404
6405         thread->run = run;
6406         thread->mddev = mddev;
6407         thread->timeout = MAX_SCHEDULE_TIMEOUT;
6408         thread->tsk = kthread_run(md_thread, thread,
6409                                   "%s_%s",
6410                                   mdname(thread->mddev),
6411                                   name ?: mddev->pers->name);
6412         if (IS_ERR(thread->tsk)) {
6413                 kfree(thread);
6414                 return NULL;
6415         }
6416         return thread;
6417 }
6418
6419 void md_unregister_thread(mdk_thread_t *thread)
6420 {
6421         if (!thread)
6422                 return;
6423         dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6424
6425         kthread_stop(thread->tsk);
6426         kfree(thread);
6427 }
6428
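/*
 * md_error() - report the failure of a component device.
 * The personality's error_handler decides whether to mark the rdev
 * Faulty; afterwards any running resync is interrupted and the array
 * thread is woken so recovery can be (re)considered.
 */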
6429 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6430 {
6431         if (!mddev) {
6432                 MD_BUG();
6433                 return;
6434         }
6435
6436         if (!rdev || test_bit(Faulty, &rdev->flags))
6437                 return;
6438
6439         if (!mddev->pers || !mddev->pers->error_handler)
6440                 return;
6441         mddev->pers->error_handler(mddev,rdev);
6442         if (mddev->degraded)
6443                 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6444         sysfs_notify_dirent_safe(rdev->sysfs_state);
6445         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6446         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6447         md_wakeup_thread(mddev->thread);
6448         if (mddev->event_work.func)
6449                 queue_work(md_misc_wq, &mddev->event_work);
6450         md_new_event_inintr(mddev);
6451 }
6452
6453 /* seq_file implementation /proc/mdstat */
6454
6455 static void status_unused(struct seq_file *seq)
6456 {
6457         int i = 0;
6458         mdk_rdev_t *rdev;
6459
6460         seq_printf(seq, "unused devices: ");
6461
6462         list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6463                 char b[BDEVNAME_SIZE];
6464                 i++;
6465                 seq_printf(seq, "%s ",
6466                               bdevname(rdev->bdev,b));
6467         }
6468         if (!i)
6469                 seq_printf(seq, "<none>");
6470
6471         seq_printf(seq, "\n");
6472 }
6473
6474
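/*
 * status_resync() - print the progress bar, percentage, remaining-time
 * estimate and current speed for the resync/recovery/reshape/check
 * that is in progress, as part of the /proc/mdstat output.
 */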
6475 static void status_resync(struct seq_file *seq, mddev_t * mddev)
6476 {
6477         sector_t max_sectors, resync, res;
6478         unsigned long dt, db;
6479         sector_t rt;
6480         int scale;
6481         unsigned int per_milli;
6482
6483         resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
6484
6485         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6486                 max_sectors = mddev->resync_max_sectors;
6487         else
6488                 max_sectors = mddev->dev_sectors;
6489
6490         /*
6491          * Should not happen.
6492          */
6493         if (!max_sectors) {
6494                 MD_BUG();
6495                 return;
6496         }
6497         /* Pick 'scale' such that (resync>>scale)*1000 will fit
6498          * in a sector_t, and (max_sectors>>scale) will fit in a
6499          * u32, as those are the requirements for sector_div.
6500          * Thus 'scale' must be at least 10
6501          */
6502         scale = 10;
6503         if (sizeof(sector_t) > sizeof(unsigned long)) {
6504                 while ( max_sectors/2 > (1ULL<<(scale+32)))
6505                         scale++;
6506         }
6507         res = (resync>>scale)*1000;
6508         sector_div(res, (u32)((max_sectors>>scale)+1));
6509
6510         per_milli = res;
6511         {
6512                 int i, x = per_milli/50, y = 20-x;
6513                 seq_printf(seq, "[");
6514                 for (i = 0; i < x; i++)
6515                         seq_printf(seq, "=");
6516                 seq_printf(seq, ">");
6517                 for (i = 0; i < y; i++)
6518                         seq_printf(seq, ".");
6519                 seq_printf(seq, "] ");
6520         }
6521         seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6522                    (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6523                     "reshape" :
6524                     (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6525                      "check" :
6526                      (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6527                       "resync" : "recovery"))),
6528                    per_milli/10, per_milli % 10,
6529                    (unsigned long long) resync/2,
6530                    (unsigned long long) max_sectors/2);
6531
6532         /*
6533          * dt: time from mark until now
6534          * db: blocks written from mark until now
6535          * rt: remaining time
6536          *
6537          * rt is a sector_t, so could be 32bit or 64bit.
6538          * So we divide before multiply in case it is 32bit and close
6539          * to the limit.
6540          * We scale the divisor (db) by 32 to avoid losing precision
6541          * near the end of resync when the number of remaining sectors
6542          * is close to 'db'.
6543          * We then divide rt by 32 after multiplying by db to compensate.
6544          * The '+1' avoids division by zero if db is very small.
6545          */
6546         dt = ((jiffies - mddev->resync_mark) / HZ);
6547         if (!dt) dt++;
6548         db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6549                 - mddev->resync_mark_cnt;
6550
6551         rt = max_sectors - resync;    /* number of remaining sectors */
6552         sector_div(rt, db/32+1);
6553         rt *= dt;
6554         rt >>= 5;
6555
6556         seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6557                    ((unsigned long)rt % 60)/6);
6558
6559         seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6560 }
6561
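/*
 * seq_file iterator for /proc/mdstat: position 0 is the header line
 * (returned as the sentinel (void*)1), the tail summary is (void*)2,
 * and every other position is an mddev taken from all_mddevs with a
 * reference held across the ->show() call.
 */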
6562 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6563 {
6564         struct list_head *tmp;
6565         loff_t l = *pos;
6566         mddev_t *mddev;
6567
6568         if (l >= 0x10000)
6569                 return NULL;
6570         if (!l--)
6571                 /* header */
6572                 return (void*)1;
6573
6574         spin_lock(&all_mddevs_lock);
6575         list_for_each(tmp,&all_mddevs)
6576                 if (!l--) {
6577                         mddev = list_entry(tmp, mddev_t, all_mddevs);
6578                         mddev_get(mddev);
6579                         spin_unlock(&all_mddevs_lock);
6580                         return mddev;
6581                 }
6582         spin_unlock(&all_mddevs_lock);
6583         if (!l--)
6584                 return (void*)2;/* tail */
6585         return NULL;
6586 }
6587
6588 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6589 {
6590         struct list_head *tmp;
6591         mddev_t *next_mddev, *mddev = v;
6592         
6593         ++*pos;
6594         if (v == (void*)2)
6595                 return NULL;
6596
6597         spin_lock(&all_mddevs_lock);
6598         if (v == (void*)1)
6599                 tmp = all_mddevs.next;
6600         else
6601                 tmp = mddev->all_mddevs.next;
6602         if (tmp != &all_mddevs)
6603                 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
6604         else {
6605                 next_mddev = (void*)2;
6606                 *pos = 0x10000;
6607         }               
6608         spin_unlock(&all_mddevs_lock);
6609
6610         if (v != (void*)1)
6611                 mddev_put(mddev);
6612         return next_mddev;
6613
6614 }
6615
6616 static void md_seq_stop(struct seq_file *seq, void *v)
6617 {
6618         mddev_t *mddev = v;
6619
6620         if (mddev && v != (void*)1 && v != (void*)2)
6621                 mddev_put(mddev);
6622 }
6623
6624 static int md_seq_show(struct seq_file *seq, void *v)
6625 {
6626         mddev_t *mddev = v;
6627         sector_t sectors;
6628         mdk_rdev_t *rdev;
6629         struct bitmap *bitmap;
6630
6631         if (v == (void*)1) {
6632                 struct mdk_personality *pers;
6633                 seq_printf(seq, "Personalities : ");
6634                 spin_lock(&pers_lock);
6635                 list_for_each_entry(pers, &pers_list, list)
6636                         seq_printf(seq, "[%s] ", pers->name);
6637
6638                 spin_unlock(&pers_lock);
6639                 seq_printf(seq, "\n");
6640                 seq->poll_event = atomic_read(&md_event_count);
6641                 return 0;
6642         }
6643         if (v == (void*)2) {
6644                 status_unused(seq);
6645                 return 0;
6646         }
6647
6648         if (mddev_lock(mddev) < 0)
6649                 return -EINTR;
6650
6651         if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6652                 seq_printf(seq, "%s : %sactive", mdname(mddev),
6653                                                 mddev->pers ? "" : "in");
6654                 if (mddev->pers) {
6655                         if (mddev->ro==1)
6656                                 seq_printf(seq, " (read-only)");
6657                         if (mddev->ro==2)
6658                                 seq_printf(seq, " (auto-read-only)");
6659                         seq_printf(seq, " %s", mddev->pers->name);
6660                 }
6661
6662                 sectors = 0;
6663                 list_for_each_entry(rdev, &mddev->disks, same_set) {
6664                         char b[BDEVNAME_SIZE];
6665                         seq_printf(seq, " %s[%d]",
6666                                 bdevname(rdev->bdev,b), rdev->desc_nr);
6667                         if (test_bit(WriteMostly, &rdev->flags))
6668                                 seq_printf(seq, "(W)");
6669                         if (test_bit(Faulty, &rdev->flags)) {
6670                                 seq_printf(seq, "(F)");
6671                                 continue;
6672                         } else if (rdev->raid_disk < 0)
6673                                 seq_printf(seq, "(S)"); /* spare */
6674                         sectors += rdev->sectors;
6675                 }
6676
6677                 if (!list_empty(&mddev->disks)) {
6678                         if (mddev->pers)
6679                                 seq_printf(seq, "\n      %llu blocks",
6680                                            (unsigned long long)
6681                                            mddev->array_sectors / 2);
6682                         else
6683                                 seq_printf(seq, "\n      %llu blocks",
6684                                            (unsigned long long)sectors / 2);
6685                 }
6686                 if (mddev->persistent) {
6687                         if (mddev->major_version != 0 ||
6688                             mddev->minor_version != 90) {
6689                                 seq_printf(seq," super %d.%d",
6690                                            mddev->major_version,
6691                                            mddev->minor_version);
6692                         }
6693                 } else if (mddev->external)
6694                         seq_printf(seq, " super external:%s",
6695                                    mddev->metadata_type);
6696                 else
6697                         seq_printf(seq, " super non-persistent");
6698
6699                 if (mddev->pers) {
6700                         mddev->pers->status(seq, mddev);
6701                         seq_printf(seq, "\n      ");
6702                         if (mddev->pers->sync_request) {
6703                                 if (mddev->curr_resync > 2) {
6704                                         status_resync(seq, mddev);
6705                                         seq_printf(seq, "\n      ");
6706                                 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
6707                                         seq_printf(seq, "\tresync=DELAYED\n      ");
6708                                 else if (mddev->recovery_cp < MaxSector)
6709                                         seq_printf(seq, "\tresync=PENDING\n      ");
6710                         }
6711                 } else
6712                         seq_printf(seq, "\n       ");
6713
6714                 if ((bitmap = mddev->bitmap)) {
6715                         unsigned long chunk_kb;
6716                         unsigned long flags;
6717                         spin_lock_irqsave(&bitmap->lock, flags);
6718                         chunk_kb = mddev->bitmap_info.chunksize >> 10;
6719                         seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6720                                 "%lu%s chunk",
6721                                 bitmap->pages - bitmap->missing_pages,
6722                                 bitmap->pages,
6723                                 (bitmap->pages - bitmap->missing_pages)
6724                                         << (PAGE_SHIFT - 10),
6725                                 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6726                                 chunk_kb ? "KB" : "B");
6727                         if (bitmap->file) {
6728                                 seq_printf(seq, ", file: ");
6729                                 seq_path(seq, &bitmap->file->f_path, " \t\n");
6730                         }
6731
6732                         seq_printf(seq, "\n");
6733                         spin_unlock_irqrestore(&bitmap->lock, flags);
6734                 }
6735
6736                 seq_printf(seq, "\n");
6737         }
6738         mddev_unlock(mddev);
6739         
6740         return 0;
6741 }
6742
6743 static const struct seq_operations md_seq_ops = {
6744         .start  = md_seq_start,
6745         .next   = md_seq_next,
6746         .stop   = md_seq_stop,
6747         .show   = md_seq_show,
6748 };
6749
6750 static int md_seq_open(struct inode *inode, struct file *file)
6751 {
6752         struct seq_file *seq;
6753         int error;
6754
6755         error = seq_open(file, &md_seq_ops);
6756         if (error)
6757                 return error;
6758
6759         seq = file->private_data;
6760         seq->poll_event = atomic_read(&md_event_count);
6761         return error;
6762 }
6763
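/*
 * mdstat_poll() - let select()/poll() on /proc/mdstat report an event
 * whenever md_event_count has advanced since the file was opened or
 * last showed the header.
 */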
6764 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6765 {
6766         struct seq_file *seq = filp->private_data;
6767         int mask;
6768
6769         poll_wait(filp, &md_event_waiters, wait);
6770
6771         /* always allow read */
6772         mask = POLLIN | POLLRDNORM;
6773
6774         if (seq->poll_event != atomic_read(&md_event_count))
6775                 mask |= POLLERR | POLLPRI;
6776         return mask;
6777 }
6778
6779 static const struct file_operations md_seq_fops = {
6780         .owner          = THIS_MODULE,
6781         .open           = md_seq_open,
6782         .read           = seq_read,
6783         .llseek         = seq_lseek,
6784         .release        = seq_release_private,
6785         .poll           = mdstat_poll,
6786 };
6787
6788 int register_md_personality(struct mdk_personality *p)
6789 {
6790         spin_lock(&pers_lock);
6791         list_add_tail(&p->list, &pers_list);
6792         printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6793         spin_unlock(&pers_lock);
6794         return 0;
6795 }
6796
6797 int unregister_md_personality(struct mdk_personality *p)
6798 {
6799         printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6800         spin_lock(&pers_lock);
6801         list_del_init(&p->list);
6802         spin_unlock(&pers_lock);
6803         return 0;
6804 }
6805
6806 static int is_mddev_idle(mddev_t *mddev, int init)
6807 {
6808         mdk_rdev_t * rdev;
6809         int idle;
6810         int curr_events;
6811
6812         idle = 1;
6813         rcu_read_lock();
6814         rdev_for_each_rcu(rdev, mddev) {
6815                 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6816                 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6817                               (int)part_stat_read(&disk->part0, sectors[1]) -
6818                               atomic_read(&disk->sync_io);
6819                 /* sync IO will cause sync_io to increase before the disk_stats
6820                  * as sync_io is counted when a request starts, and
6821                  * disk_stats is counted when it completes.
6822                  * So resync activity will cause curr_events to be smaller than
6823                  * when there was no such activity.
6824                  * non-sync IO will cause disk_stat to increase without
6825                  * increasing sync_io so curr_events will (eventually)
6826                  * be larger than it was before.  Once it becomes
6827                  * substantially larger, the test below will cause
6828                  * the array to appear non-idle, and resync will slow
6829                  * down.
6830                  * If there is a lot of outstanding resync activity when
6831                  * we set last_event to curr_events, then all that activity
6832                  * completing might cause the array to appear non-idle
6833                  * and resync will be slowed down even though there might
6834                  * not have been non-resync activity.  This will only
6835                  * happen once though.  'last_events' will soon reflect
6836                  * the state where there is little or no outstanding
6837                  * resync requests, and further resync activity will
6838                  * always make curr_events less than last_events.
6839                  *
6840                  */
6841                 if (init || curr_events - rdev->last_events > 64) {
6842                         rdev->last_events = curr_events;
6843                         idle = 0;
6844                 }
6845         }
6846         rcu_read_unlock();
6847         return idle;
6848 }
6849
6850 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6851 {
6852         /* another "blocks" (512-byte) blocks have been synced */
6853         atomic_sub(blocks, &mddev->recovery_active);
6854         wake_up(&mddev->recovery_wait);
6855         if (!ok) {
6856                 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6857                 md_wakeup_thread(mddev->thread);
6858                 /* stop recovery, signal do_sync ... */
6859         }
6860 }
6861
6862
6863 /* md_write_start(mddev, bi)
6864  * If we need to update some array metadata (e.g. 'active' flag
6865  * in superblock) before writing, schedule a superblock update
6866  * and wait for it to complete.
6867  */
6868 void md_write_start(mddev_t *mddev, struct bio *bi)
6869 {
6870         int did_change = 0;
6871         if (bio_data_dir(bi) != WRITE)
6872                 return;
6873
6874         BUG_ON(mddev->ro == 1);
6875         if (mddev->ro == 2) {
6876                 /* need to switch to read/write */
6877                 mddev->ro = 0;
6878                 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6879                 md_wakeup_thread(mddev->thread);
6880                 md_wakeup_thread(mddev->sync_thread);
6881                 did_change = 1;
6882         }
6883         atomic_inc(&mddev->writes_pending);
6884         if (mddev->safemode == 1)
6885                 mddev->safemode = 0;
6886         if (mddev->in_sync) {
6887                 spin_lock_irq(&mddev->write_lock);
6888                 if (mddev->in_sync) {
6889                         mddev->in_sync = 0;
6890                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6891                         set_bit(MD_CHANGE_PENDING, &mddev->flags);
6892                         md_wakeup_thread(mddev->thread);
6893                         did_change = 1;
6894                 }
6895                 spin_unlock_irq(&mddev->write_lock);
6896         }
6897         if (did_change)
6898                 sysfs_notify_dirent_safe(mddev->sysfs_state);
6899         wait_event(mddev->sb_wait,
6900                    !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6901 }
6902
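/*
 * md_write_end() - companion to md_write_start().
 * Once the last pending write completes, either wake the array thread
 * immediately (safemode == 2) or arm the safemode timer so the array
 * can be marked clean after a period of idleness.
 */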
6903 void md_write_end(mddev_t *mddev)
6904 {
6905         if (atomic_dec_and_test(&mddev->writes_pending)) {
6906                 if (mddev->safemode == 2)
6907                         md_wakeup_thread(mddev->thread);
6908                 else if (mddev->safemode_delay)
6909                         mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6910         }
6911 }
6912
6913 /* md_allow_write(mddev)
6914  * Calling this ensures that the array is marked 'active' so that writes
6915  * may proceed without blocking.  It is important to call this before
6916  * attempting a GFP_KERNEL allocation while holding the mddev lock.
6917  * Must be called with mddev_lock held.
6918  *
6919  * In the ->external case MD_CHANGE_CLEAN cannot be cleared until mddev->lock
6920  * is dropped, so return -EAGAIN after notifying userspace.
6921  */
6922 int md_allow_write(mddev_t *mddev)
6923 {
6924         if (!mddev->pers)
6925                 return 0;
6926         if (mddev->ro)
6927                 return 0;
6928         if (!mddev->pers->sync_request)
6929                 return 0;
6930
6931         spin_lock_irq(&mddev->write_lock);
6932         if (mddev->in_sync) {
6933                 mddev->in_sync = 0;
6934                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6935                 set_bit(MD_CHANGE_PENDING, &mddev->flags);
6936                 if (mddev->safemode_delay &&
6937                     mddev->safemode == 0)
6938                         mddev->safemode = 1;
6939                 spin_unlock_irq(&mddev->write_lock);
6940                 md_update_sb(mddev, 0);
6941                 sysfs_notify_dirent_safe(mddev->sysfs_state);
6942         } else
6943                 spin_unlock_irq(&mddev->write_lock);
6944
6945         if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
6946                 return -EAGAIN;
6947         else
6948                 return 0;
6949 }
6950 EXPORT_SYMBOL_GPL(md_allow_write);
6951
6952 #define SYNC_MARKS      10
6953 #define SYNC_MARK_STEP  (3*HZ)
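/*
 * md_do_sync() - main loop for resync, recovery, reshape and check.
 * It first arbitrates with other arrays that share physical devices,
 * then repeatedly calls the personality's sync_request(), pacing the
 * rebuild against the configured minimum and maximum sync speeds and
 * updating the progress state shown in /proc/mdstat.
 */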
6954 void md_do_sync(mddev_t *mddev)
6955 {
6956         mddev_t *mddev2;
6957         unsigned int currspeed = 0,
6958                  window;
6959         sector_t max_sectors,j, io_sectors;
6960         unsigned long mark[SYNC_MARKS];
6961         sector_t mark_cnt[SYNC_MARKS];
6962         int last_mark,m;
6963         struct list_head *tmp;
6964         sector_t last_check;
6965         int skipped = 0;
6966         mdk_rdev_t *rdev;
6967         char *desc;
6968
6969         /* just in case thread restarts... */
6970         if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6971                 return;
6972         if (mddev->ro) /* never try to sync a read-only array */
6973                 return;
6974
6975         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6976                 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6977                         desc = "data-check";
6978                 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6979                         desc = "requested-resync";
6980                 else
6981                         desc = "resync";
6982         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6983                 desc = "reshape";
6984         else
6985                 desc = "recovery";
6986
6987         /* we overload curr_resync somewhat here.
6988          * 0 == not engaged in resync at all
6989          * 2 == checking that there is no conflict with another sync
6990          * 1 == like 2, but have yielded to allow conflicting resync to
6991          *              commence
6992          * other == active in resync - this many blocks
6993          *
6994          * Before starting a resync we must have set curr_resync to
6995          * 2, and then checked that every "conflicting" array has curr_resync
6996          * less than ours.  When we find one that is the same or higher
6997          * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
6998          * to 1 if we choose to yield (based arbitrarily on the address of the mddev structure).
6999          * This will mean we have to start checking from the beginning again.
7000          *
7001          */
7002
7003         do {
7004                 mddev->curr_resync = 2;
7005
7006         try_again:
7007                 if (kthread_should_stop())
7008                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7009
7010                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7011                         goto skip;
7012                 for_each_mddev(mddev2, tmp) {
7013                         if (mddev2 == mddev)
7014                                 continue;
7015                         if (!mddev->parallel_resync
7016                         &&  mddev2->curr_resync
7017                         &&  match_mddev_units(mddev, mddev2)) {
7018                                 DEFINE_WAIT(wq);
7019                                 if (mddev < mddev2 && mddev->curr_resync == 2) {
7020                                         /* arbitrarily yield */
7021                                         mddev->curr_resync = 1;
7022                                         wake_up(&resync_wait);
7023                                 }
7024                                 if (mddev > mddev2 && mddev->curr_resync == 1)
7025                                         /* no need to wait here, we can wait the next
7026                                          * time 'round when curr_resync == 2
7027                                          */
7028                                         continue;
7029                                 /* We need to wait 'interruptible' so as not to
7030                                  * contribute to the load average, and not to
7031                                  * be caught by 'softlockup'
7032                                  */
7033                                 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7034                                 if (!kthread_should_stop() &&
7035                                     mddev2->curr_resync >= mddev->curr_resync) {
7036                                         printk(KERN_INFO "md: delaying %s of %s"
7037                                                " until %s has finished (they"
7038                                                " share one or more physical units)\n",
7039                                                desc, mdname(mddev), mdname(mddev2));
7040                                         mddev_put(mddev2);
7041                                         if (signal_pending(current))
7042                                                 flush_signals(current);
7043                                         schedule();
7044                                         finish_wait(&resync_wait, &wq);
7045                                         goto try_again;
7046                                 }
7047                                 finish_wait(&resync_wait, &wq);
7048                         }
7049                 }
7050         } while (mddev->curr_resync < 2);
7051
7052         j = 0;
7053         if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7054                 /* resync follows the size requested by the personality,
7055                  * which defaults to physical size, but can be virtual size
7056                  */
7057                 max_sectors = mddev->resync_max_sectors;
7058                 mddev->resync_mismatches = 0;
7059                 /* we don't use the checkpoint if there's a bitmap */
7060                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7061                         j = mddev->resync_min;
7062                 else if (!mddev->bitmap)
7063                         j = mddev->recovery_cp;
7064
7065         } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7066                 max_sectors = mddev->dev_sectors;
7067         else {
7068                 /* recovery follows the physical size of devices */
7069                 max_sectors = mddev->dev_sectors;
7070                 j = MaxSector;
7071                 rcu_read_lock();
7072                 list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7073                         if (rdev->raid_disk >= 0 &&
7074                             !test_bit(Faulty, &rdev->flags) &&
7075                             !test_bit(In_sync, &rdev->flags) &&
7076                             rdev->recovery_offset < j)
7077                                 j = rdev->recovery_offset;
7078                 rcu_read_unlock();
7079         }
7080
7081         printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7082         printk(KERN_INFO "md: minimum _guaranteed_  speed:"
7083                 " %d KB/sec/disk.\n", speed_min(mddev));
7084         printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7085                "(but not more than %d KB/sec) for %s.\n",
7086                speed_max(mddev), desc);
7087
7088         is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7089
7090         io_sectors = 0;
7091         for (m = 0; m < SYNC_MARKS; m++) {
7092                 mark[m] = jiffies;
7093                 mark_cnt[m] = io_sectors;
7094         }
7095         last_mark = 0;
7096         mddev->resync_mark = mark[last_mark];
7097         mddev->resync_mark_cnt = mark_cnt[last_mark];
7098
7099         /*
7100          * Tune reconstruction:
7101          */
7102         window = 32*(PAGE_SIZE/512);
7103         printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7104                 window/2, (unsigned long long)max_sectors/2);
7105
7106         atomic_set(&mddev->recovery_active, 0);
7107         last_check = 0;
7108
7109         if (j>2) {
7110                 printk(KERN_INFO 
7111                        "md: resuming %s of %s from checkpoint.\n",
7112                        desc, mdname(mddev));
7113                 mddev->curr_resync = j;
7114         }
7115         mddev->curr_resync_completed = j;
7116
7117         while (j < max_sectors) {
7118                 sector_t sectors;
7119
7120                 skipped = 0;
7121
7122                 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7123                     ((mddev->curr_resync > mddev->curr_resync_completed &&
7124                       (mddev->curr_resync - mddev->curr_resync_completed)
7125                       > (max_sectors >> 4)) ||
7126                      (j - mddev->curr_resync_completed)*2
7127                      >= mddev->resync_max - mddev->curr_resync_completed
7128                             )) {
7129                         /* time to update curr_resync_completed */
7130                         wait_event(mddev->recovery_wait,
7131                                    atomic_read(&mddev->recovery_active) == 0);
7132                         mddev->curr_resync_completed = j;
7133                         set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7134                         sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7135                 }
7136
7137                 while (j >= mddev->resync_max && !kthread_should_stop()) {
7138                         /* As this condition is controlled by user-space,
7139                          * we can block indefinitely, so use '_interruptible'
7140                          * to avoid triggering warnings.
7141                          */
7142                         flush_signals(current); /* just in case */
7143                         wait_event_interruptible(mddev->recovery_wait,
7144                                                  mddev->resync_max > j
7145                                                  || kthread_should_stop());
7146                 }
7147
7148                 if (kthread_should_stop())
7149                         goto interrupted;
7150
7151                 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7152                                                   currspeed < speed_min(mddev));
7153                 if (sectors == 0) {
7154                         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7155                         goto out;
7156                 }
7157
7158                 if (!skipped) { /* actual IO requested */
7159                         io_sectors += sectors;
7160                         atomic_add(sectors, &mddev->recovery_active);
7161                 }
7162
7163                 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7164                         break;
7165
7166                 j += sectors;
7167                 if (j>1) mddev->curr_resync = j;
7168                 mddev->curr_mark_cnt = io_sectors;
7169                 if (last_check == 0)
7170                         /* this is the earliest that rebuild will be
7171                          * visible in /proc/mdstat
7172                          */
7173                         md_new_event(mddev);
7174
7175                 if (last_check + window > io_sectors || j == max_sectors)
7176                         continue;
7177
7178                 last_check = io_sectors;
7179         repeat:
7180                 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7181                         /* step marks */
7182                         int next = (last_mark+1) % SYNC_MARKS;
7183
7184                         mddev->resync_mark = mark[next];
7185                         mddev->resync_mark_cnt = mark_cnt[next];
7186                         mark[next] = jiffies;
7187                         mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7188                         last_mark = next;
7189                 }
7190
7191
7192                 if (kthread_should_stop())
7193                         goto interrupted;
7194
7195
7196                 /*
7197                  * this loop exits only if we are slower than
7198                  * the 'hard' speed limit, or the system was IO-idle for
7199                  * a jiffy.
7200                  * the system might be non-idle CPU-wise, but we only care
7201                  * about not overloading the IO subsystem. (things like an
7202                  * e2fsck being done on the RAID array should execute fast)
7203                  */
7204                 cond_resched();
7205
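                /*
                 * Estimate the recent resync rate in KiB/s: sectors completed
                 * since the oldest mark, halved to convert 512-byte sectors to
                 * KiB, divided by the elapsed seconds (+1 so we never divide
                 * by zero), plus one so the result is never zero.
                 */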
7206                 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
7207                         /((jiffies-mddev->resync_mark)/HZ +1) +1;
7208
7209                 if (currspeed > speed_min(mddev)) {
7210                         if ((currspeed > speed_max(mddev)) ||
7211                                         !is_mddev_idle(mddev, 0)) {
7212                                 msleep(500);
7213                                 goto repeat;
7214                         }
7215                 }
7216         }
7217         printk(KERN_INFO "md: %s: %s done.\n", mdname(mddev), desc);
7218         /*
7219          * this also signals 'finished resyncing' to md_stop
7220          */
7221  out:
7222         wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7223
7224         /* tell personality that we are finished */
7225         mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7226
7227         if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7228             mddev->curr_resync > 2) {
7229                 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7230                         if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7231                                 if (mddev->curr_resync >= mddev->recovery_cp) {
7232                                         printk(KERN_INFO
7233                                                "md: checkpointing %s of %s.\n",
7234                                                desc, mdname(mddev));
7235                                         mddev->recovery_cp = mddev->curr_resync;
7236                                 }
7237                         } else
7238                                 mddev->recovery_cp = MaxSector;
7239                 } else {
7240                         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7241                                 mddev->curr_resync = MaxSector;
7242                         rcu_read_lock();
7243                         list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
7244                                 if (rdev->raid_disk >= 0 &&
7245                                     mddev->delta_disks >= 0 &&
7246                                     !test_bit(Faulty, &rdev->flags) &&
7247                                     !test_bit(In_sync, &rdev->flags) &&
7248                                     rdev->recovery_offset < mddev->curr_resync)
7249                                         rdev->recovery_offset = mddev->curr_resync;
7250                         rcu_read_unlock();
7251                 }
7252         }
7253         set_bit(MD_CHANGE_DEVS, &mddev->flags);
7254
7255  skip:
7256         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7257                 /* We completed, so any min/max setting can be forgotten. */
7258                 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7259                         mddev->resync_min = 0;
7260                 mddev->resync_max = MaxSector;
7261         } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7262                 mddev->resync_min = mddev->curr_resync_completed;
7263         mddev->curr_resync = 0;
7264         wake_up(&resync_wait);
7265         set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7266         md_wakeup_thread(mddev->thread);
7267         return;
7268
7269  interrupted:
7270         /*
7271          * got a signal, exit.
7272          */
7273         printk(KERN_INFO
7274                "md: md_do_sync() got signal ... exiting\n");
7275         set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7276         goto out;
7277
7278 }
7279 EXPORT_SYMBOL_GPL(md_do_sync);
7280
7281 static int remove_and_add_spares(mddev_t *mddev)
7282 {
7283         mdk_rdev_t *rdev;
7284         int spares = 0;
7285
7286         mddev->curr_resync_completed = 0;
7287
7288         list_for_each_entry(rdev, &mddev->disks, same_set)
7289                 if (rdev->raid_disk >= 0 &&
7290                     !test_bit(Blocked, &rdev->flags) &&
7291                     (test_bit(Faulty, &rdev->flags) ||
7292                      ! test_bit(In_sync, &rdev->flags)) &&
7293                     atomic_read(&rdev->nr_pending)==0) {
7294                         if (mddev->pers->hot_remove_disk(
7295                                     mddev, rdev->raid_disk)==0) {
7296                                 sysfs_unlink_rdev(mddev, rdev);
7297                                 rdev->raid_disk = -1;
7298                         }
7299                 }
7300
7301         if (mddev->degraded) {
7302                 list_for_each_entry(rdev, &mddev->disks, same_set) {
7303                         if (rdev->raid_disk >= 0 &&
7304                             !test_bit(In_sync, &rdev->flags) &&
7305                             !test_bit(Faulty, &rdev->flags))
7306                                 spares++;
7307                         if (rdev->raid_disk < 0
7308                             && !test_bit(Faulty, &rdev->flags)) {
7309                                 rdev->recovery_offset = 0;
7310                                 if (mddev->pers->
7311                                     hot_add_disk(mddev, rdev) == 0) {
7312                                         if (sysfs_link_rdev(mddev, rdev))
7313                                                 /* failure here is OK */;
7314                                         spares++;
7315                                         md_new_event(mddev);
7316                                         set_bit(MD_CHANGE_DEVS, &mddev->flags);
7317                                 } else
7318                                         break;
7319                         }
7320                 }
7321         }
7322         return spares;
7323 }
7324
7325 static void reap_sync_thread(mddev_t *mddev)
7326 {
7327         mdk_rdev_t *rdev;
7328
7329         /* resync has finished, collect result */
7330         md_unregister_thread(mddev->sync_thread);
7331         mddev->sync_thread = NULL;
7332         if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7333             !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7334                 /* success...*/
7335                 /* activate any spares */
7336                 if (mddev->pers->spare_active(mddev))
7337                         sysfs_notify(&mddev->kobj, NULL,
7338                                      "degraded");
7339         }
7340         if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7341             mddev->pers->finish_reshape)
7342                 mddev->pers->finish_reshape(mddev);
7343         md_update_sb(mddev, 1);
7344
7345         /* if the array is no longer degraded, then any saved_raid_disk
7346          * information must be scrapped
7347          */
7348         if (!mddev->degraded)
7349                 list_for_each_entry(rdev, &mddev->disks, same_set)
7350                         rdev->saved_raid_disk = -1;
7351
7352         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7353         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7354         clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7355         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7356         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7357         /* flag recovery needed just to double check */
7358         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7359         sysfs_notify_dirent_safe(mddev->sysfs_action);
7360         md_new_event(mddev);
7361         if (mddev->event_work.func)
7362                 queue_work(md_misc_wq, &mddev->event_work);
7363 }
7364
7365 /*
7366  * This routine is regularly called by all per-raid-array threads to
7367  * deal with generic issues like resync and super-block update.
7368  * Raid personalities that don't have a thread (linear/raid0) do not
7369  * need this as they never do any recovery or update the superblock.
7370  *
7371  * It does not do any resync itself, but rather "forks" off other threads
7372  * to do that as needed.
7373  * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7374  * "->recovery" and create a thread at ->sync_thread.
7375  * When the thread finishes it sets MD_RECOVERY_DONE
7376  * and wakes up this thread, which will reap the sync thread and finish up.
7377  * This thread also removes any faulty devices (with nr_pending == 0).
7378  *
7379  * The overall approach is:
7380  *  1/ if the superblock needs updating, update it.
7381  *  2/ If a recovery thread is running, don't do anything else.
7382  *  3/ If recovery has finished, clean up, possibly marking spares active.
7383  *  4/ If there are any faulty devices, remove them.
7384  *  5/ If array is degraded, try to add spare devices
7385  *  6/ If array has spares or is not in-sync, start a resync thread.
7386  */
7387 void md_check_recovery(mddev_t *mddev)
7388 {
7389         if (mddev->suspended)
7390                 return;
7391
7392         if (mddev->bitmap)
7393                 bitmap_daemon_work(mddev);
7394
7395         if (signal_pending(current)) {
7396                 if (mddev->pers->sync_request && !mddev->external) {
7397                         printk(KERN_INFO "md: %s in immediate safe mode\n",
7398                                mdname(mddev));
7399                         mddev->safemode = 2;
7400                 }
7401                 flush_signals(current);
7402         }
7403
7404         if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7405                 return;
7406         if ( ! (
7407                 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
7408                 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7409                 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7410                 (mddev->external == 0 && mddev->safemode == 1) ||
7411                 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7412                  && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7413                 ))
7414                 return;
7415
7416         if (mddev_trylock(mddev)) {
7417                 int spares = 0;
7418
7419                 if (mddev->ro) {
7420                         /* Only thing we do on a ro array is remove
7421                          * failed devices.
7422                          */
7423                         mdk_rdev_t *rdev;
7424                         list_for_each_entry(rdev, &mddev->disks, same_set)
7425                                 if (rdev->raid_disk >= 0 &&
7426                                     !test_bit(Blocked, &rdev->flags) &&
7427                                     test_bit(Faulty, &rdev->flags) &&
7428                                     atomic_read(&rdev->nr_pending)==0) {
7429                                         if (mddev->pers->hot_remove_disk(
7430                                                     mddev, rdev->raid_disk)==0) {
7431                                                 sysfs_unlink_rdev(mddev, rdev);
7432                                                 rdev->raid_disk = -1;
7433                                         }
7434                                 }
7435                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7436                         goto unlock;
7437                 }
7438
7439                 if (!mddev->external) {
7440                         int did_change = 0;
7441                         spin_lock_irq(&mddev->write_lock);
7442                         if (mddev->safemode &&
7443                             !atomic_read(&mddev->writes_pending) &&
7444                             !mddev->in_sync &&
7445                             mddev->recovery_cp == MaxSector) {
7446                                 mddev->in_sync = 1;
7447                                 did_change = 1;
7448                                 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7449                         }
7450                         if (mddev->safemode == 1)
7451                                 mddev->safemode = 0;
7452                         spin_unlock_irq(&mddev->write_lock);
7453                         if (did_change)
7454                                 sysfs_notify_dirent_safe(mddev->sysfs_state);
7455                 }
7456
7457                 if (mddev->flags)
7458                         md_update_sb(mddev, 0);
7459
7460                 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7461                     !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7462                         /* resync/recovery still happening */
7463                         clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7464                         goto unlock;
7465                 }
7466                 if (mddev->sync_thread) {
7467                         reap_sync_thread(mddev);
7468                         goto unlock;
7469                 }
7470                 /* Set RUNNING before clearing NEEDED to avoid
7471                  * any transients in the value of "sync_action".
7472                  */
7473                 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7474                 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7475                 /* Clear some bits that don't mean anything, but
7476                  * might be left set
7477                  */
7478                 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7479                 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7480
7481                 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7482                         goto unlock;
7483                 /* no recovery is running.
7484                  * remove any failed drives, then
7485                  * add spares if possible.
7486                  * Spares are also removed and re-added, to allow
7487                  * the personality to fail the re-add.
7488                  */
7489
7490                 if (mddev->reshape_position != MaxSector) {
7491                         if (mddev->pers->check_reshape == NULL ||
7492                             mddev->pers->check_reshape(mddev) != 0)
7493                                 /* Cannot proceed */
7494                                 goto unlock;
7495                         set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7496                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7497                 } else if ((spares = remove_and_add_spares(mddev))) {
7498                         clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7499                         clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7500                         clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7501                         set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7502                 } else if (mddev->recovery_cp < MaxSector) {
7503                         set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7504                         clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7505                 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7506                         /* nothing to be done ... */
7507                         goto unlock;
7508
7509                 if (mddev->pers->sync_request) {
7510                         if (spares && mddev->bitmap && ! mddev->bitmap->file) {
7511                                 /* We are adding a device or devices to an array
7512                                  * which has the bitmap stored on all devices.
7513                                  * So make sure all bitmap pages get written
7514                                  */
7515                                 bitmap_write_all(mddev->bitmap);
7516                         }
7517                         mddev->sync_thread = md_register_thread(md_do_sync,
7518                                                                 mddev,
7519                                                                 "resync");
7520                         if (!mddev->sync_thread) {
7521                                 printk(KERN_ERR "%s: could not start resync"
7522                                         " thread...\n", 
7523                                         mdname(mddev));
7524                                 /* leave the spares where they are, it shouldn't hurt */
7525                                 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7526                                 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7527                                 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7528                                 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7529                                 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7530                         } else
7531                                 md_wakeup_thread(mddev->sync_thread);
7532                         sysfs_notify_dirent_safe(mddev->sysfs_action);
7533                         md_new_event(mddev);
7534                 }
7535         unlock:
7536                 if (!mddev->sync_thread) {
7537                         clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7538                         if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7539                                                &mddev->recovery))
7540                                 if (mddev->sysfs_action)
7541                                         sysfs_notify_dirent_safe(mddev->sysfs_action);
7542                 }
7543                 mddev_unlock(mddev);
7544         }
7545 }
7546
7547 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7548 {
7549         sysfs_notify_dirent_safe(rdev->sysfs_state);
7550         wait_event_timeout(rdev->blocked_wait,
7551                            !test_bit(Blocked, &rdev->flags) &&
7552                            !test_bit(BlockedBadBlocks, &rdev->flags),
7553                            msecs_to_jiffies(5000));
7554         rdev_dec_pending(rdev, mddev);
7555 }
7556 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7557
7558
7559 /* Bad block management.
7560  * We can record which blocks on each device are 'bad' and so just
7561  * fail those blocks, or that stripe, rather than the whole device.
7562  * Entries in the bad-block table are 64bits wide.  This comprises:
7563  * Length of bad-range, in sectors: 0-511 for lengths 1-512
7564  * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7565  *  A 'shift' can be set so that larger blocks are tracked and
7566  *  consequently larger devices can be covered.
7567  * 'Acknowledged' flag - 1 bit. - the most significant bit.
7568  *
7569  * Locking of the bad-block table uses a seqlock so md_is_badblock
7570  * might need to retry if it is very unlucky.
7571  * We will sometimes want to check for bad blocks in a bi_end_io function,
7572  * so we use the write_seqlock_irq variant.
7573  *
7574  * When looking for a bad block we specify a range and want to
7575  * know if any block in the range is bad.  So we binary-search
7576  * to the last range that starts at-or-before the given endpoint,
7577  * (or "before the sector after the target range")
7578  * then see if it ends after the given start.
7579  * We return
7580  *  0 if there are no known bad blocks in the range
7581  *  1 if there are known bad blocks which are all acknowledged
7582  * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7583  * plus the start/length of the first bad section we overlap.
7584  */
7585 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7586                    sector_t *first_bad, int *bad_sectors)
7587 {
7588         int hi;
7589         int lo = 0;
7590         u64 *p = bb->page;
7591         int rv = 0;
7592         sector_t target = s + sectors;
7593         unsigned seq;
7594
7595         if (bb->shift > 0) {
7596                 /* round the start down, and the end up */
7597                 s >>= bb->shift;
7598                 target += (1<<bb->shift) - 1;
7599                 target >>= bb->shift;
7600                 sectors = target - s;
7601         }
7602         /* 'target' is now the first block after the bad range */
7603
7604 retry:
7605         seq = read_seqbegin(&bb->lock);
7606
7607         hi = bb->count;
7608
7609         /* Binary search between lo and hi for 'target'
7610          * i.e. for the last range that starts before 'target'
7611          */
7612         /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7613          * are known not to be the last range before target.
7614          * VARIANT: hi-lo is the number of possible
7615          * ranges, and decreases until it reaches 1
7616          */
7617         while (hi - lo > 1) {
7618                 int mid = (lo + hi) / 2;
7619                 sector_t a = BB_OFFSET(p[mid]);
7620                 if (a < target)
7621                         /* This could still be the one, earlier ranges
7622                          * could not. */
7623                         lo = mid;
7624                 else
7625                         /* This and later ranges are definitely out. */
7626                         hi = mid;
7627         }
7628         /* 'lo' might be the last that started before target, but 'hi' isn't */
7629         if (hi > lo) {
7630                 /* need to check all ranges that end after 's' to see if
7631                  * any are unacknowledged.
7632                  */
7633                 while (lo >= 0 &&
7634                        BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7635                         if (BB_OFFSET(p[lo]) < target) {
7636                                 /* starts before the end, and finishes after
7637                                  * the start, so they must overlap
7638                                  */
7639                                 if (rv != -1 && BB_ACK(p[lo]))
7640                                         rv = 1;
7641                                 else
7642                                         rv = -1;
7643                                 *first_bad = BB_OFFSET(p[lo]);
7644                                 *bad_sectors = BB_LEN(p[lo]);
7645                         }
7646                         lo--;
7647                 }
7648         }
7649
7650         if (read_seqretry(&bb->lock, seq))
7651                 goto retry;
7652
7653         return rv;
7654 }
7655 EXPORT_SYMBOL_GPL(md_is_badblock);
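
/*
 * Illustrative sketch only, not used by the driver: how one bad-block table
 * entry described above packs into 64 bits via the BB_* helpers from md.h,
 * and how a caller typically consults the table.  The function name is made
 * up for this example.
 */
static int __maybe_unused badblocks_example(struct badblocks *bb)
{
        /* a 16-sector acknowledged bad range starting at sector 4096:
         * start in bits 62:9, length-1 in bits 8:0, 'acknowledged' in bit 63
         */
        u64 entry = BB_MAKE(4096, 16, 1);
        sector_t first_bad;
        int bad_sectors;

        BUG_ON(BB_OFFSET(entry) != 4096 || BB_LEN(entry) != 16 || !BB_ACK(entry));

        /* 0: no known bad blocks in the range; 1: only acknowledged ones;
         * -1: at least one unacknowledged bad block
         */
        return md_is_badblock(bb, 4096, 16, &first_bad, &bad_sectors);
}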
7656
7657 /*
7658  * Add a range of bad blocks to the table.
7659  * This might extend the table, or might contract it
7660  * if two adjacent ranges can be merged.
7661  * We binary-search to find the 'insertion' point, then
7662  * decide how best to handle it.
7663  */
7664 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7665                             int acknowledged)
7666 {
7667         u64 *p;
7668         int lo, hi;
7669         int rv = 1;
7670
7671         if (bb->shift < 0)
7672                 /* badblocks are disabled */
7673                 return 0;
7674
7675         if (bb->shift) {
7676                 /* round the start down, and the end up */
7677                 sector_t next = s + sectors;
7678                 s >>= bb->shift;
7679                 next += (1<<bb->shift) - 1;
7680                 next >>= bb->shift;
7681                 sectors = next - s;
7682         }
7683
7684         write_seqlock_irq(&bb->lock);
7685
7686         p = bb->page;
7687         lo = 0;
7688         hi = bb->count;
7689         /* Find the last range that starts at-or-before 's' */
7690         while (hi - lo > 1) {
7691                 int mid = (lo + hi) / 2;
7692                 sector_t a = BB_OFFSET(p[mid]);
7693                 if (a <= s)
7694                         lo = mid;
7695                 else
7696                         hi = mid;
7697         }
7698         if (hi > lo && BB_OFFSET(p[lo]) > s)
7699                 hi = lo;
7700
7701         if (hi > lo) {
7702                 /* we found a range that might merge with the start
7703                  * of our new range
7704                  */
7705                 sector_t a = BB_OFFSET(p[lo]);
7706                 sector_t e = a + BB_LEN(p[lo]);
7707                 int ack = BB_ACK(p[lo]);
7708                 if (e >= s) {
7709                         /* Yes, we can merge with a previous range */
7710                         if (s == a && s + sectors >= e)
7711                                 /* new range covers old */
7712                                 ack = acknowledged;
7713                         else
7714                                 ack = ack && acknowledged;
7715
7716                         if (e < s + sectors)
7717                                 e = s + sectors;
7718                         if (e - a <= BB_MAX_LEN) {
7719                                 p[lo] = BB_MAKE(a, e-a, ack);
7720                                 s = e;
7721                         } else {
7722                                 /* does not all fit in one range,
7723                                  * make p[lo] maximal
7724                                  */
7725                                 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7726                                         p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7727                                 s = a + BB_MAX_LEN;
7728                         }
7729                         sectors = e - s;
7730                 }
7731         }
7732         if (sectors && hi < bb->count) {
7733                 /* 'hi' points to the first range that starts after 's'.
7734                  * Maybe we can merge with the start of that range */
7735                 sector_t a = BB_OFFSET(p[hi]);
7736                 sector_t e = a + BB_LEN(p[hi]);
7737                 int ack = BB_ACK(p[hi]);
7738                 if (a <= s + sectors) {
7739                         /* merging is possible */
7740                         if (e <= s + sectors) {
7741                                 /* full overlap */
7742                                 e = s + sectors;
7743                                 ack = acknowledged;
7744                         } else
7745                                 ack = ack && acknowledged;
7746
7747                         a = s;
7748                         if (e - a <= BB_MAX_LEN) {
7749                                 p[hi] = BB_MAKE(a, e-a, ack);
7750                                 s = e;
7751                         } else {
7752                                 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7753                                 s = a + BB_MAX_LEN;
7754                         }
7755                         sectors = e - s;
7756                         lo = hi;
7757                         hi++;
7758                 }
7759         }
7760         if (sectors == 0 && hi < bb->count) {
7761                 /* we might be able to combine lo and hi */
7762                 /* Note: 's' is at the end of 'lo' */
7763                 sector_t a = BB_OFFSET(p[hi]);
7764                 int lolen = BB_LEN(p[lo]);
7765                 int hilen = BB_LEN(p[hi]);
7766                 int newlen = lolen + hilen - (s - a);
7767                 if (s >= a && newlen < BB_MAX_LEN) {
7768                         /* yes, we can combine them */
7769                         int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7770                         p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7771                         memmove(p + hi, p + hi + 1,
7772                                 (bb->count - hi - 1) * 8);
7773                         bb->count--;
7774                 }
7775         }
7776         while (sectors) {
7777                 /* didn't merge it all (or any of it).
7778                  * Need to add a range just before 'hi' */
7779                 if (bb->count >= MD_MAX_BADBLOCKS) {
7780                         /* No room for more */
7781                         rv = 0;
7782                         break;
7783                 } else {
7784                         int this_sectors = sectors;
7785                         memmove(p + hi + 1, p + hi,
7786                                 (bb->count - hi) * 8);
7787                         bb->count++;
7788
7789                         if (this_sectors > BB_MAX_LEN)
7790                                 this_sectors = BB_MAX_LEN;
7791                         p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7792                         sectors -= this_sectors;
7793                         s += this_sectors;
7794                 }
7795         }
7796
7797         bb->changed = 1;
7798         if (!acknowledged)
7799                 bb->unacked_exist = 1;
7800         write_sequnlock_irq(&bb->lock);
7801
7802         return rv;
7803 }
7804
7805 int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7806                        int acknowledged)
7807 {
7808         int rv = md_set_badblocks(&rdev->badblocks,
7809                                   s + rdev->data_offset, sectors, acknowledged);
7810         if (rv) {
7811                 /* Make sure they get written out promptly */
7812                 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7813                 md_wakeup_thread(rdev->mddev->thread);
7814         }
7815         return rv;
7816 }
7817 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
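
/*
 * Illustrative sketch only (an assumption about how a RAID personality uses
 * the helper above; the function name is made up and nothing here calls it):
 * after a failed write, record the range as unacknowledged, and only fail
 * the whole device if the bad-block table is full.
 */
static void __maybe_unused example_record_write_error(mddev_t *mddev,
                                                      mdk_rdev_t *rdev,
                                                      sector_t sector,
                                                      int sectors)
{
        if (!rdev_set_badblocks(rdev, sector, sectors, 0))
                md_error(mddev, rdev);
}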
7818
7819 /*
7820  * Remove a range of bad blocks from the table.
7821  * This may involve extending the table if we split a region,
7822  * but it must not fail.  So if the table becomes full, we just
7823  * drop the remove request.
7824  */
7825 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7826 {
7827         u64 *p;
7828         int lo, hi;
7829         sector_t target = s + sectors;
7830         int rv = 0;
7831
7832         if (bb->shift > 0) {
7833                 /* When clearing we round the start up and the end down.
7834                  * This should not matter as the shift should align with
7835                  * the block size and no rounding should ever be needed.
7836          * However it is better to think a block is bad when it
7837                  * isn't than to think a block is not bad when it is.
7838                  */
7839                 s += (1<<bb->shift) - 1;
7840                 s >>= bb->shift;
7841                 target >>= bb->shift;
7842                 sectors = target - s;
7843         }
7844
7845         write_seqlock_irq(&bb->lock);
7846
7847         p = bb->page;
7848         lo = 0;
7849         hi = bb->count;
7850         /* Find the last range that starts before 'target' */
7851         while (hi - lo > 1) {
7852                 int mid = (lo + hi) / 2;
7853                 sector_t a = BB_OFFSET(p[mid]);
7854                 if (a < target)
7855                         lo = mid;
7856                 else
7857                         hi = mid;
7858         }
7859         if (hi > lo) {
7860                 /* p[lo] is the last range that could overlap the
7861                  * current range.  Earlier ranges could also overlap,
7862                  * but only this one can overlap the end of the range.
7863                  */
7864                 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7865                         /* Partial overlap, leave the tail of this range */
7866                         int ack = BB_ACK(p[lo]);
7867                         sector_t a = BB_OFFSET(p[lo]);
7868                         sector_t end = a + BB_LEN(p[lo]);
7869
7870                         if (a < s) {
7871                                 /* we need to split this range */
7872                                 if (bb->count >= MD_MAX_BADBLOCKS) {
7873                                         rv = 0;
7874                                         goto out;
7875                                 }
7876                                 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7877                                 bb->count++;
7878                                 p[lo] = BB_MAKE(a, s-a, ack);
7879                                 lo++;
7880                         }
7881                         p[lo] = BB_MAKE(target, end - target, ack);
7882                         /* there is no longer an overlap */
7883                         hi = lo;
7884                         lo--;
7885                 }
7886                 while (lo >= 0 &&
7887                        BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7888                         /* This range does overlap */
7889                         if (BB_OFFSET(p[lo]) < s) {
7890                                 /* Keep the early parts of this range. */
7891                                 int ack = BB_ACK(p[lo]);
7892                                 sector_t start = BB_OFFSET(p[lo]);
7893                                 p[lo] = BB_MAKE(start, s - start, ack);
7894                                 /* now p[lo] doesn't overlap, so we can stop */
7895                                 break;
7896                         }
7897                         lo--;
7898                 }
7899                 /* 'lo' is strictly before, 'hi' is strictly after,
7900                  * anything between needs to be discarded
7901                  */
7902                 if (hi - lo > 1) {
7903                         memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7904                         bb->count -= (hi - lo - 1);
7905                 }
7906         }
7907
7908         bb->changed = 1;
7909 out:
7910         write_sequnlock_irq(&bb->lock);
7911         return rv;
7912 }
7913
7914 int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7915 {
7916         return md_clear_badblocks(&rdev->badblocks,
7917                                   s + rdev->data_offset,
7918                                   sectors);
7919 }
7920 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
7921
7922 /*
7923  * Acknowledge all bad blocks in a list.
7924  * This only succeeds if ->changed is clear.  It is used by
7925  * in-kernel metadata updates
7926  */
7927 void md_ack_all_badblocks(struct badblocks *bb)
7928 {
7929         if (bb->page == NULL || bb->changed)
7930                 /* no point even trying */
7931                 return;
7932         write_seqlock_irq(&bb->lock);
7933
7934         if (bb->changed == 0) {
7935                 u64 *p = bb->page;
7936                 int i;
7937                 for (i = 0; i < bb->count ; i++) {
7938                         if (!BB_ACK(p[i])) {
7939                                 sector_t start = BB_OFFSET(p[i]);
7940                                 int len = BB_LEN(p[i]);
7941                                 p[i] = BB_MAKE(start, len, 1);
7942                         }
7943                 }
7944                 bb->unacked_exist = 0;
7945         }
7946         write_sequnlock_irq(&bb->lock);
7947 }
7948 EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7949
7950 /* sysfs access to bad-blocks list.
7951  * We present two files.
7952  * 'bad-blocks' lists sector numbers and lengths of ranges that
7953  *    are recorded as bad.  The list is truncated to fit within
7954  *    the one-page limit of sysfs.
7955  *    Writing "sector length" to this file adds an acknowledged
7956  *    bad range to the list.
7957  * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
7958  *    been acknowledged.  Writing to this file adds bad blocks
7959  *    without acknowledging them.  This is largely for testing.
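 *
 *    For example (illustrative only; the exact path depends on the array and
 *    member device, somewhere under /sys/block/mdX/md/):
 *      echo "2097152 16" > bad_blocks
 *    records an acknowledged 16-sector bad range starting at sector 2097152
 *    (subject to any configured shift).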
7960  */
7961
7962 static ssize_t
7963 badblocks_show(struct badblocks *bb, char *page, int unack)
7964 {
7965         size_t len;
7966         int i;
7967         u64 *p = bb->page;
7968         unsigned seq;
7969
7970         if (bb->shift < 0)
7971                 return 0;
7972
7973 retry:
7974         seq = read_seqbegin(&bb->lock);
7975
7976         len = 0;
7977         i = 0;
7978
7979         while (len < PAGE_SIZE && i < bb->count) {
7980                 sector_t s = BB_OFFSET(p[i]);
7981                 unsigned int length = BB_LEN(p[i]);
7982                 int ack = BB_ACK(p[i]);
7983                 i++;
7984
7985                 if (unack && ack)
7986                         continue;
7987
7988                 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
7989                                 (unsigned long long)s << bb->shift,
7990                                 length << bb->shift);
7991         }
7992         if (unack && len == 0)
7993                 bb->unacked_exist = 0;
7994
7995         if (read_seqretry(&bb->lock, seq))
7996                 goto retry;
7997
7998         return len;
7999 }
8000
8001 #define DO_DEBUG 1
8002
8003 static ssize_t
8004 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8005 {
8006         unsigned long long sector;
8007         int length;
8008         char newline;
8009 #ifdef DO_DEBUG
8010         /* Allow clearing via sysfs *only* for testing/debugging.
8011          * Normally only a successful write may clear a badblock
8012          */
8013         int clear = 0;
8014         if (page[0] == '-') {
8015                 clear = 1;
8016                 page++;
8017         }
8018 #endif /* DO_DEBUG */
8019
8020         switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8021         case 3:
8022                 if (newline != '\n')
8023                         return -EINVAL;
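                /* fall through: the length check below applies to both cases */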
8024         case 2:
8025                 if (length <= 0)
8026                         return -EINVAL;
8027                 break;
8028         default:
8029                 return -EINVAL;
8030         }
8031
8032 #ifdef DO_DEBUG
8033         if (clear) {
8034                 md_clear_badblocks(bb, sector, length);
8035                 return len;
8036         }
8037 #endif /* DO_DEBUG */
8038         if (md_set_badblocks(bb, sector, length, !unack))
8039                 return len;
8040         else
8041                 return -ENOSPC;
8042 }
8043
8044 static int md_notify_reboot(struct notifier_block *this,
8045                             unsigned long code, void *x)
8046 {
8047         struct list_head *tmp;
8048         mddev_t *mddev;
8049
8050         if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
8051
8052                 printk(KERN_INFO "md: stopping all md devices.\n");
8053
8054                 for_each_mddev(mddev, tmp)
8055                         if (mddev_trylock(mddev)) {
8056                                 /* Force a switch to readonly even if the array
8057                                  * appears to still be in use.  Hence
8058                                  * the '100'.
8059                                  */
8060                                 md_set_readonly(mddev, 100);
8061                                 mddev_unlock(mddev);
8062                         }
8063                 /*
8064                  * certain more exotic SCSI devices are known to be
8065                  * volatile wrt too early system reboots. While the
8066                  * right place to handle this issue is the given
8067                  * driver, we do want to have a safe RAID driver ...
8068                  */
8069                 mdelay(1000*1);
8070         }
8071         return NOTIFY_DONE;
8072 }
8073
8074 static struct notifier_block md_notifier = {
8075         .notifier_call  = md_notify_reboot,
8076         .next           = NULL,
8077         .priority       = INT_MAX, /* before any real devices */
8078 };
8079
8080 static void md_geninit(void)
8081 {
8082         dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8083
8084         proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8085 }
8086
8087 static int __init md_init(void)
8088 {
8089         int ret = -ENOMEM;
8090
8091         md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8092         if (!md_wq)
8093                 goto err_wq;
8094
8095         md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8096         if (!md_misc_wq)
8097                 goto err_misc_wq;
8098
8099         if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8100                 goto err_md;
8101
8102         if ((ret = register_blkdev(0, "mdp")) < 0)
8103                 goto err_mdp;
8104         mdp_major = ret;
8105
8106         blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
8107                             md_probe, NULL, NULL);
8108         blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8109                             md_probe, NULL, NULL);
8110
8111         register_reboot_notifier(&md_notifier);
8112         raid_table_header = register_sysctl_table(raid_root_table);
8113
8114         md_geninit();
8115         return 0;
8116
8117 err_mdp:
8118         unregister_blkdev(MD_MAJOR, "md");
8119 err_md:
8120         destroy_workqueue(md_misc_wq);
8121 err_misc_wq:
8122         destroy_workqueue(md_wq);
8123 err_wq:
8124         return ret;
8125 }
8126
8127 #ifndef MODULE
8128
8129 /*
8130  * Searches all registered partitions for autorun RAID arrays
8131  * at boot time.
8132  */
8133
8134 static LIST_HEAD(all_detected_devices);
8135 struct detected_devices_node {
8136         struct list_head list;
8137         dev_t dev;
8138 };
8139
8140 void md_autodetect_dev(dev_t dev)
8141 {
8142         struct detected_devices_node *node_detected_dev;
8143
8144         node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8145         if (node_detected_dev) {
8146                 node_detected_dev->dev = dev;
8147                 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8148         } else {
8149                 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8150                         ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8151         }
8152 }
8153
8154
8155 static void autostart_arrays(int part)
8156 {
8157         mdk_rdev_t *rdev;
8158         struct detected_devices_node *node_detected_dev;
8159         dev_t dev;
8160         int i_scanned, i_passed;
8161
8162         i_scanned = 0;
8163         i_passed = 0;
8164
8165         printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8166
8167         while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8168                 i_scanned++;
8169                 node_detected_dev = list_entry(all_detected_devices.next,
8170                                         struct detected_devices_node, list);
8171                 list_del(&node_detected_dev->list);
8172                 dev = node_detected_dev->dev;
8173                 kfree(node_detected_dev);
8174                 rdev = md_import_device(dev,0, 90);
8175                 if (IS_ERR(rdev))
8176                         continue;
8177
8178                 if (test_bit(Faulty, &rdev->flags)) {
8179                         MD_BUG();
8180                         continue;
8181                 }
8182                 set_bit(AutoDetected, &rdev->flags);
8183                 list_add(&rdev->same_set, &pending_raid_disks);
8184                 i_passed++;
8185         }
8186
8187         printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8188                                                 i_scanned, i_passed);
8189
8190         autorun_devices(part);
8191 }
8192
8193 #endif /* !MODULE */
8194
8195 static __exit void md_exit(void)
8196 {
8197         mddev_t *mddev;
8198         struct list_head *tmp;
8199
8200         blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
8201         blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8202
8203         unregister_blkdev(MD_MAJOR,"md");
8204         unregister_blkdev(mdp_major, "mdp");
8205         unregister_reboot_notifier(&md_notifier);
8206         unregister_sysctl_table(raid_table_header);
8207         remove_proc_entry("mdstat", NULL);
8208         for_each_mddev(mddev, tmp) {
8209                 export_array(mddev);
8210                 mddev->hold_active = 0;
8211         }
8212         destroy_workqueue(md_misc_wq);
8213         destroy_workqueue(md_wq);
8214 }
8215
8216 subsys_initcall(md_init);
8217 module_exit(md_exit)
8218
8219 static int get_ro(char *buffer, struct kernel_param *kp)
8220 {
8221         return sprintf(buffer, "%d", start_readonly);
8222 }
8223 static int set_ro(const char *val, struct kernel_param *kp)
8224 {
8225         char *e;
8226         int num = simple_strtoul(val, &e, 10);
8227         if (*val && (*e == '\0' || *e == '\n')) {
8228                 start_readonly = num;
8229                 return 0;
8230         }
8231         return -EINVAL;
8232 }
8233
8234 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
8235 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
8236
8237 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
8238
8239 EXPORT_SYMBOL(register_md_personality);
8240 EXPORT_SYMBOL(unregister_md_personality);
8241 EXPORT_SYMBOL(md_error);
8242 EXPORT_SYMBOL(md_done_sync);
8243 EXPORT_SYMBOL(md_write_start);
8244 EXPORT_SYMBOL(md_write_end);
8245 EXPORT_SYMBOL(md_register_thread);
8246 EXPORT_SYMBOL(md_unregister_thread);
8247 EXPORT_SYMBOL(md_wakeup_thread);
8248 EXPORT_SYMBOL(md_check_recovery);
8249 MODULE_LICENSE("GPL");
8250 MODULE_DESCRIPTION("MD RAID framework");
8251 MODULE_ALIAS("md");
8252 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);