Update to 3.4-final.
[linux-flexiantxendom0-3.2.10.git] / drivers / md / md.c
index 0ea3485..01233d8 100644 (file)
 #include <linux/blkdev.h>
 #include <linux/sysctl.h>
 #include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/hdreg.h>
 #include <linux/proc_fs.h>
 #include <linux/random.h>
+#include <linux/module.h>
 #include <linux/reboot.h>
 #include <linux/file.h>
 #include <linux/compat.h>
@@ -332,18 +332,17 @@ static DEFINE_SPINLOCK(all_mddevs_lock);
  * call has finished, the bio has been linked into some internal structure
  * and so is visible to ->quiesce(), so we don't need the refcount any more.
  */
-static int md_make_request(struct request_queue *q, struct bio *bio)
+static void md_make_request(struct request_queue *q, struct bio *bio)
 {
        const int rw = bio_data_dir(bio);
        struct mddev *mddev = q->queuedata;
-       int rv;
        int cpu;
        unsigned int sectors;
 
        if (mddev == NULL || mddev->pers == NULL
            || !mddev->ready) {
                bio_io_error(bio);
-               return 0;
+               return;
        }
        smp_rmb(); /* Ensure implications of  'active' are visible */
        rcu_read_lock();
@@ -368,7 +367,7 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
         * go away inside make_request
         */
        sectors = bio_sectors(bio);
-       rv = mddev->pers->make_request(mddev, bio);
+       mddev->pers->make_request(mddev, bio);
 
        cpu = part_stat_lock();
        part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
@@ -377,8 +376,6 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 
        if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
                wake_up(&mddev->sb_wait);
-
-       return rv;
 }
 
 /* mddev_suspend makes sure no new requests are submitted
@@ -394,6 +391,8 @@ void mddev_suspend(struct mddev *mddev)
        synchronize_rcu();
        wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
        mddev->pers->quiesce(mddev, 1);
+
+       del_timer_sync(&mddev->safemode_timer);
 }
 EXPORT_SYMBOL_GPL(mddev_suspend);
 
@@ -442,7 +441,7 @@ static void submit_flushes(struct work_struct *ws)
        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
        atomic_set(&mddev->flush_pending, 1);
        rcu_read_lock();
-       list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+       rdev_for_each_rcu(rdev, mddev)
                if (rdev->raid_disk >= 0 &&
                    !test_bit(Faulty, &rdev->flags)) {
                        /* Take two references, one is dropped
@@ -477,8 +476,7 @@ static void md_submit_flush_data(struct work_struct *ws)
                bio_endio(bio, 0);
        else {
                bio->bi_rw &= ~REQ_FLUSH;
-               if (mddev->pers->make_request(mddev, bio))
-                       generic_make_request(bio);
+               mddev->pers->make_request(mddev, bio);
        }
 
        mddev->flush_bio = NULL;
@@ -573,7 +571,7 @@ static void mddev_put(struct mddev *mddev)
            mddev->ctime == 0 && !mddev->hold_active) {
                /* Array is not configured at all, and not held active,
                 * so destroy it */
-               list_del(&mddev->all_mddevs);
+               list_del_init(&mddev->all_mddevs);
                bs = mddev->bio_set;
                mddev->bio_set = NULL;
                if (mddev->gendisk) {
@@ -741,8 +739,8 @@ static void mddev_unlock(struct mddev * mddev)
        } else
                mutex_unlock(&mddev->reconfig_mutex);
 
-       /* was we've dropped the mutex we need a spinlock to
-        * make sur the thread doesn't disappear
+       /* As we've dropped the mutex we need a spinlock to
+        * make sure the thread doesn't disappear
         */
        spin_lock(&pers_lock);
        md_wakeup_thread(mddev->thread);
@@ -753,7 +751,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
 {
        struct md_rdev *rdev;
 
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
                if (rdev->desc_nr == nr)
                        return rdev;
 
@@ -764,7 +762,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
 {
        struct md_rdev *rdev;
 
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
 
@@ -1346,7 +1344,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 
        sb->disks[0].state = (1<<MD_DISK_REMOVED);
-       list_for_each_entry(rdev2, &mddev->disks, same_set) {
+       rdev_for_each(rdev2, mddev) {
                mdp_disk_t *d;
                int desc_nr;
                int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1717,6 +1715,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
                }
                if (sb->devflags & WriteMostly1)
                        set_bit(WriteMostly, &rdev->flags);
+               if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+                       set_bit(Replacement, &rdev->flags);
        } else /* MULTIPATH are always insync */
                set_bit(In_sync, &rdev->flags);
 
@@ -1770,6 +1770,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
                sb->recovery_offset =
                        cpu_to_le64(rdev->recovery_offset);
        }
+       if (test_bit(Replacement, &rdev->flags))
+               sb->feature_map |=
+                       cpu_to_le32(MD_FEATURE_REPLACEMENT);
 
        if (mddev->reshape_position != MaxSector) {
                sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -1804,18 +1807,18 @@ retry:
                                                | BB_LEN(internal_bb));
                                *bbp++ = cpu_to_le64(store_bb);
                        }
+                       bb->changed = 0;
                        if (read_seqretry(&bb->lock, seq))
                                goto retry;
 
                        bb->sector = (rdev->sb_start +
                                      (int)le32_to_cpu(sb->bblog_offset));
                        bb->size = le16_to_cpu(sb->bblog_size);
-                       bb->changed = 0;
                }
        }
 
        max_dev = 0;
-       list_for_each_entry(rdev2, &mddev->disks, same_set)
+       rdev_for_each(rdev2, mddev)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;
 
@@ -1832,7 +1835,7 @@ retry:
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);
        
-       list_for_each_entry(rdev2, &mddev->disks, same_set) {
+       rdev_for_each(rdev2, mddev) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1947,7 +1950,7 @@ int md_integrity_register(struct mddev *mddev)
                return 0; /* nothing to do */
        if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
                return 0; /* shouldn't register, or already is */
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                /* skip spares and non-functional disks */
                if (test_bit(Faulty, &rdev->flags))
                        continue;
@@ -2174,7 +2177,7 @@ static void export_array(struct mddev *mddev)
 {
        struct md_rdev *rdev, *tmp;
 
-       rdev_for_each(rdev, tmp, mddev) {
+       rdev_for_each_safe(rdev, tmp, mddev) {
                if (!rdev->mddev) {
                        MD_BUG();
                        continue;
@@ -2306,11 +2309,11 @@ static void md_print_devices(void)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
-               list_for_each_entry(rdev, &mddev->disks, same_set)
+               rdev_for_each(rdev, mddev)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");
 
-               list_for_each_entry(rdev, &mddev->disks, same_set)
+               rdev_for_each(rdev, mddev)
                        print_rdev(rdev, mddev->major_version);
        }
        printk("md:     **********************************\n");
@@ -2327,7 +2330,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
         * with the rest of the array)
         */
        struct md_rdev *rdev;
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (rdev->sb_events == mddev->events ||
                    (nospares &&
                     rdev->raid_disk < 0 &&
@@ -2350,7 +2353,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
 
 repeat:
        /* First make sure individual recovery_offsets are correct */
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk >= 0 &&
                    mddev->delta_disks >= 0 &&
                    !test_bit(In_sync, &rdev->flags) &&
@@ -2363,8 +2366,9 @@ repeat:
                clear_bit(MD_CHANGE_DEVS, &mddev->flags);
                if (!mddev->external) {
                        clear_bit(MD_CHANGE_PENDING, &mddev->flags);
-                       list_for_each_entry(rdev, &mddev->disks, same_set) {
+                       rdev_for_each(rdev, mddev) {
                                if (rdev->badblocks.changed) {
+                                       rdev->badblocks.changed = 0;
                                        md_ack_all_badblocks(&rdev->badblocks);
                                        md_error(mddev, rdev);
                                }
@@ -2429,7 +2433,7 @@ repeat:
                mddev->events --;
        }
 
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (rdev->badblocks.changed)
                        any_badblocks_changed++;
                if (test_bit(Faulty, &rdev->flags))
@@ -2443,13 +2447,14 @@ repeat:
                 mdname(mddev), mddev->in_sync);
 
        bitmap_update_sb(mddev->bitmap);
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                char b[BDEVNAME_SIZE];
 
                if (rdev->sb_loaded != 1)
                        continue; /* no noise on spare devices */
 
-               if (!test_bit(Faulty, &rdev->flags)) {
+               if (!test_bit(Faulty, &rdev->flags) &&
+                   rdev->saved_raid_disk == -1) {
                        md_super_write(mddev,rdev,
                                       rdev->sb_start, rdev->sb_size,
                                       rdev->sb_page);
@@ -2465,9 +2470,12 @@ repeat:
                                rdev->badblocks.size = 0;
                        }
 
-               } else
+               } else if (test_bit(Faulty, &rdev->flags))
                        pr_debug("md: %s (skipping faulty)\n",
                                 bdevname(rdev->bdev, b));
+               else
+                       pr_debug("(skipping incremental s/r ");
+
                if (mddev->level == LEVEL_MULTIPATH)
                        /* only need to write one superblock... */
                        break;
@@ -2488,7 +2496,7 @@ repeat:
        if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
                sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (test_and_clear_bit(FaultRecorded, &rdev->flags))
                        clear_bit(Blocked, &rdev->flags);
 
@@ -2545,7 +2553,8 @@ state_show(struct md_rdev *rdev, char *page)
                sep = ",";
        }
        if (test_bit(Blocked, &rdev->flags) ||
-           rdev->badblocks.unacked_exist) {
+           (rdev->badblocks.unacked_exist
+            && !test_bit(Faulty, &rdev->flags))) {
                len += sprintf(page+len, "%sblocked", sep);
                sep = ",";
        }
@@ -2558,6 +2567,15 @@ state_show(struct md_rdev *rdev, char *page)
                len += sprintf(page+len, "%swrite_error", sep);
                sep = ",";
        }
+       if (test_bit(WantReplacement, &rdev->flags)) {
+               len += sprintf(page+len, "%swant_replacement", sep);
+               sep = ",";
+       }
+       if (test_bit(Replacement, &rdev->flags)) {
+               len += sprintf(page+len, "%sreplacement", sep);
+               sep = ",";
+       }
+
        return len+sprintf(page+len, "\n");
 }
 
@@ -2626,6 +2644,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
        } else if (cmd_match(buf, "-write_error")) {
                clear_bit(WriteErrorSeen, &rdev->flags);
                err = 0;
+       } else if (cmd_match(buf, "want_replacement")) {
+               /* Any non-spare device that is not a replacement can
+                * become want_replacement at any time, but we then need to
+                * check if recovery is needed.
+                */
+               if (rdev->raid_disk >= 0 &&
+                   !test_bit(Replacement, &rdev->flags))
+                       set_bit(WantReplacement, &rdev->flags);
+               set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+               md_wakeup_thread(rdev->mddev->thread);
+               err = 0;
+       } else if (cmd_match(buf, "-want_replacement")) {
+               /* Clearing 'want_replacement' is always allowed.
+                * Once replacements starts it is too late though.
+                */
+               err = 0;
+               clear_bit(WantReplacement, &rdev->flags);
+       } else if (cmd_match(buf, "replacement")) {
+               /* Can only set a device as a replacement when array has not
+                * yet been started.  Once running, replacement is automatic
+                * from spares, or by assigning 'slot'.
+                */
+               if (rdev->mddev->pers)
+                       err = -EBUSY;
+               else {
+                       set_bit(Replacement, &rdev->flags);
+                       err = 0;
+               }
+       } else if (cmd_match(buf, "-replacement")) {
+               /* Similarly, can only clear Replacement before start */
+               if (rdev->mddev->pers)
+                       err = -EBUSY;
+               else {
+                       clear_bit(Replacement, &rdev->flags);
+                       err = 0;
+               }
        }
        if (!err)
                sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2687,7 +2741,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
                if (rdev->mddev->pers->hot_remove_disk == NULL)
                        return -EINVAL;
                err = rdev->mddev->pers->
-                       hot_remove_disk(rdev->mddev, rdev->raid_disk);
+                       hot_remove_disk(rdev->mddev, rdev);
                if (err)
                        return err;
                sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2695,7 +2749,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
                set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
                md_wakeup_thread(rdev->mddev->thread);
        } else if (rdev->mddev->pers) {
-               struct md_rdev *rdev2;
                /* Activating a spare .. or possibly reactivating
                 * if we ever get bitmaps working here.
                 */
@@ -2709,10 +2762,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
                if (rdev->mddev->pers->hot_add_disk == NULL)
                        return -EINVAL;
 
-               list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
-                       if (rdev2->raid_disk == slot)
-                               return -EEXIST;
-
                if (slot >= rdev->mddev->raid_disks &&
                    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
                        return -ENOSPC;
@@ -2850,7 +2899,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
                        struct md_rdev *rdev2;
 
                        mddev_lock(mddev);
-                       list_for_each_entry(rdev2, &mddev->disks, same_set)
+                       rdev_for_each(rdev2, mddev)
                                if (rdev->bdev == rdev2->bdev &&
                                    rdev != rdev2 &&
                                    overlaps(rdev->data_offset, rdev->sectors,
@@ -3147,7 +3196,7 @@ static void analyze_sbs(struct mddev * mddev)
        char b[BDEVNAME_SIZE];
 
        freshest = NULL;
-       rdev_for_each(rdev, tmp, mddev)
+       rdev_for_each_safe(rdev, tmp, mddev)
                switch (super_types[mddev->major_version].
                        load_super(rdev, freshest, mddev->minor_version)) {
                case 1:
@@ -3168,7 +3217,7 @@ static void analyze_sbs(struct mddev * mddev)
                validate_super(mddev, freshest);
 
        i = 0;
-       rdev_for_each(rdev, tmp, mddev) {
+       rdev_for_each_safe(rdev, tmp, mddev) {
                if (mddev->max_disks &&
                    (rdev->desc_nr >= mddev->max_disks ||
                     i > mddev->max_disks)) {
@@ -3357,7 +3406,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
                return -EINVAL;
        }
 
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
                rdev->new_raid_disk = rdev->raid_disk;
 
        /* ->takeover must set new_* and/or delta_disks
@@ -3410,7 +3459,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
                mddev->safemode = 0;
        }
 
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk < 0)
                        continue;
                if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3419,7 +3468,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
                        continue;
                sysfs_unlink_rdev(mddev, rdev);
        }
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (rdev->raid_disk < 0)
                        continue;
                if (rdev->new_raid_disk == rdev->raid_disk)
@@ -3787,6 +3836,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
        if (err)
                return err;
        else {
+               if (mddev->hold_active == UNTIL_IOCTL)
+                       mddev->hold_active = 0;
                sysfs_notify_dirent_safe(mddev->sysfs_state);
                return len;
        }
@@ -4486,11 +4537,20 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 
        if (!entry->show)
                return -EIO;
+       spin_lock(&all_mddevs_lock);
+       if (list_empty(&mddev->all_mddevs)) {
+               spin_unlock(&all_mddevs_lock);
+               return -EBUSY;
+       }
+       mddev_get(mddev);
+       spin_unlock(&all_mddevs_lock);
+
        rv = mddev_lock(mddev);
        if (!rv) {
                rv = entry->show(mddev, page);
                mddev_unlock(mddev);
        }
+       mddev_put(mddev);
        return rv;
 }
 
@@ -4506,13 +4566,19 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
                return -EIO;
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
+       spin_lock(&all_mddevs_lock);
+       if (list_empty(&mddev->all_mddevs)) {
+               spin_unlock(&all_mddevs_lock);
+               return -EBUSY;
+       }
+       mddev_get(mddev);
+       spin_unlock(&all_mddevs_lock);
        rv = mddev_lock(mddev);
-       if (mddev->hold_active == UNTIL_IOCTL)
-               mddev->hold_active = 0;
        if (!rv) {
                rv = entry->store(mddev, page, length);
                mddev_unlock(mddev);
        }
+       mddev_put(mddev);
        return rv;
 }
 
@@ -4603,6 +4669,7 @@ static int md_alloc(dev_t dev, char *name)
        mddev->queue->queuedata = mddev;
 
        blk_queue_make_request(mddev->queue, md_make_request);
+       blk_set_stacking_limits(&mddev->queue->limits);
 
        disk = alloc_disk(1 << shift);
        if (!disk) {
@@ -4732,7 +4799,7 @@ int md_run(struct mddev *mddev)
         * the only valid external interface is through the md
         * device.
         */
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                if (test_bit(Faulty, &rdev->flags))
                        continue;
                sync_blockdev(rdev->bdev);
@@ -4803,8 +4870,8 @@ int md_run(struct mddev *mddev)
                struct md_rdev *rdev2;
                int warned = 0;
 
-               list_for_each_entry(rdev, &mddev->disks, same_set)
-                       list_for_each_entry(rdev2, &mddev->disks, same_set) {
+               rdev_for_each(rdev, mddev)
+                       rdev_for_each(rdev2, mddev) {
                                if (rdev < rdev2 &&
                                    rdev->bdev->bd_contains ==
                                    rdev2->bdev->bd_contains) {
@@ -4881,7 +4948,7 @@ int md_run(struct mddev *mddev)
        mddev->in_sync = 1;
        smp_wmb();
        mddev->ready = 1;
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
                if (rdev->raid_disk >= 0)
                        if (sysfs_link_rdev(mddev, rdev))
                                /* failure here is OK */;
@@ -5009,6 +5076,7 @@ static void md_clean(struct mddev *mddev)
        mddev->changed = 0;
        mddev->degraded = 0;
        mddev->safemode = 0;
+       mddev->merge_check_needed = 0;
        mddev->bitmap_info.offset = 0;
        mddev->bitmap_info.default_offset = 0;
        mddev->bitmap_info.chunksize = 0;
@@ -5111,7 +5179,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
                /* tell userspace to handle 'inactive' */
                sysfs_notify_dirent_safe(mddev->sysfs_state);
 
-               list_for_each_entry(rdev, &mddev->disks, same_set)
+               rdev_for_each(rdev, mddev)
                        if (rdev->raid_disk >= 0)
                                sysfs_unlink_rdev(mddev, rdev);
 
@@ -5162,7 +5230,7 @@ static void autorun_array(struct mddev *mddev)
 
        printk(KERN_INFO "md: running: ");
 
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                char b[BDEVNAME_SIZE];
                printk("<%s>", bdevname(rdev->bdev,b));
        }
@@ -5292,7 +5360,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
        struct md_rdev *rdev;
 
        nr=working=insync=failed=spare=0;
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                nr++;
                if (test_bit(Faulty, &rdev->flags))
                        failed++;
@@ -5859,7 +5927,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
                 * grow, and re-add.
                 */
                return -EBUSY;
-       list_for_each_entry(rdev, &mddev->disks, same_set) {
+       rdev_for_each(rdev, mddev) {
                sector_t avail = rdev->sectors;
 
                if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6035,8 +6103,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
        struct mddev *mddev = NULL;
        int ro;
 
-       if (!capable(CAP_SYS_ADMIN))
-               return -EACCES;
+       switch (cmd) {
+       case RAID_VERSION:
+       case GET_ARRAY_INFO:
+       case GET_DISK_INFO:
+               break;
+       default:
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EACCES;
+       }
 
        /*
         * Commands dealing with the RAID driver but not any
@@ -6653,7 +6728,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
        struct mddev *mddev = v;
        sector_t sectors;
        struct md_rdev *rdev;
-       struct bitmap *bitmap;
 
        if (v == (void*)1) {
                struct md_personality *pers;
@@ -6687,7 +6761,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                }
 
                sectors = 0;
-               list_for_each_entry(rdev, &mddev->disks, same_set) {
+               rdev_for_each(rdev, mddev) {
                        char b[BDEVNAME_SIZE];
                        seq_printf(seq, " %s[%d]",
                                bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6696,8 +6770,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
                        if (test_bit(Faulty, &rdev->flags)) {
                                seq_printf(seq, "(F)");
                                continue;
-                       } else if (rdev->raid_disk < 0)
+                       }
+                       if (rdev->raid_disk < 0)
                                seq_printf(seq, "(S)"); /* spare */
+                       if (test_bit(Replacement, &rdev->flags))
+                               seq_printf(seq, "(R)");
                        sectors += rdev->sectors;
                }
 
@@ -6738,27 +6815,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                } else
                        seq_printf(seq, "\n       ");
 
-               if ((bitmap = mddev->bitmap)) {
-                       unsigned long chunk_kb;
-                       unsigned long flags;
-                       spin_lock_irqsave(&bitmap->lock, flags);
-                       chunk_kb = mddev->bitmap_info.chunksize >> 10;
-                       seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
-                               "%lu%s chunk",
-                               bitmap->pages - bitmap->missing_pages,
-                               bitmap->pages,
-                               (bitmap->pages - bitmap->missing_pages)
-                                       << (PAGE_SHIFT - 10),
-                               chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
-                               chunk_kb ? "KB" : "B");
-                       if (bitmap->file) {
-                               seq_printf(seq, ", file: ");
-                               seq_path(seq, &bitmap->file->f_path, " \t\n");
-                       }
-
-                       seq_printf(seq, "\n");
-                       spin_unlock_irqrestore(&bitmap->lock, flags);
-               }
+               bitmap_status(seq, mddev->bitmap);
 
                seq_printf(seq, "\n");
        }
@@ -7096,7 +7153,7 @@ void md_do_sync(struct mddev *mddev)
                max_sectors = mddev->dev_sectors;
                j = MaxSector;
                rcu_read_lock();
-               list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+               rdev_for_each_rcu(rdev, mddev)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(Faulty, &rdev->flags) &&
                            !test_bit(In_sync, &rdev->flags) &&
@@ -7259,7 +7316,8 @@ void md_do_sync(struct mddev *mddev)
                                        printk(KERN_INFO
                                               "md: checkpointing %s of %s.\n",
                                               desc, mdname(mddev));
-                                       mddev->recovery_cp = mddev->curr_resync;
+                                       mddev->recovery_cp =
+                                               mddev->curr_resync_completed;
                                }
                        } else
                                mddev->recovery_cp = MaxSector;
@@ -7267,7 +7325,7 @@ void md_do_sync(struct mddev *mddev)
                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                                mddev->curr_resync = MaxSector;
                        rcu_read_lock();
-                       list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
+                       rdev_for_each_rcu(rdev, mddev)
                                if (rdev->raid_disk >= 0 &&
                                    mddev->delta_disks >= 0 &&
                                    !test_bit(Faulty, &rdev->flags) &&
@@ -7277,9 +7335,9 @@ void md_do_sync(struct mddev *mddev)
                        rcu_read_unlock();
                }
        }
+ skip:
        set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
- skip:
        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
                /* We completed so min/max setting can be forgotten if used. */
                if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
@@ -7309,40 +7367,43 @@ static int remove_and_add_spares(struct mddev *mddev)
 {
        struct md_rdev *rdev;
        int spares = 0;
+       int removed = 0;
 
        mddev->curr_resync_completed = 0;
 
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
                if (rdev->raid_disk >= 0 &&
                    !test_bit(Blocked, &rdev->flags) &&
                    (test_bit(Faulty, &rdev->flags) ||
                     ! test_bit(In_sync, &rdev->flags)) &&
                    atomic_read(&rdev->nr_pending)==0) {
                        if (mddev->pers->hot_remove_disk(
-                                   mddev, rdev->raid_disk)==0) {
+                                   mddev, rdev) == 0) {
                                sysfs_unlink_rdev(mddev, rdev);
                                rdev->raid_disk = -1;
+                               removed++;
                        }
                }
+       if (removed)
+               sysfs_notify(&mddev->kobj, NULL,
+                            "degraded");
 
-       if (mddev->degraded) {
-               list_for_each_entry(rdev, &mddev->disks, same_set) {
-                       if (rdev->raid_disk >= 0 &&
-                           !test_bit(In_sync, &rdev->flags) &&
-                           !test_bit(Faulty, &rdev->flags))
+
+       rdev_for_each(rdev, mddev) {
+               if (rdev->raid_disk >= 0 &&
+                   !test_bit(In_sync, &rdev->flags) &&
+                   !test_bit(Faulty, &rdev->flags))
+                       spares++;
+               if (rdev->raid_disk < 0
+                   && !test_bit(Faulty, &rdev->flags)) {
+                       rdev->recovery_offset = 0;
+                       if (mddev->pers->
+                           hot_add_disk(mddev, rdev) == 0) {
+                               if (sysfs_link_rdev(mddev, rdev))
+                                       /* failure here is OK */;
                                spares++;
-                       if (rdev->raid_disk < 0
-                           && !test_bit(Faulty, &rdev->flags)) {
-                               rdev->recovery_offset = 0;
-                               if (mddev->pers->
-                                   hot_add_disk(mddev, rdev) == 0) {
-                                       if (sysfs_link_rdev(mddev, rdev))
-                                               /* failure here is OK */;
-                                       spares++;
-                                       md_new_event(mddev);
-                                       set_bit(MD_CHANGE_DEVS, &mddev->flags);
-                               } else
-                                       break;
+                               md_new_event(mddev);
+                               set_bit(MD_CHANGE_DEVS, &mddev->flags);
                        }
                }
        }
@@ -7366,15 +7427,19 @@ static void reap_sync_thread(struct mddev *mddev)
        if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            mddev->pers->finish_reshape)
                mddev->pers->finish_reshape(mddev);
-       md_update_sb(mddev, 1);
 
-       /* if array is no-longer degraded, then any saved_raid_disk
-        * information must be scrapped
+       /* If array is no-longer degraded, then any saved_raid_disk
+        * information must be scrapped.  Also if any device is now
+        * In_sync we must scrap the saved_raid_disk for that device
+        * so that the superblock for an incrementally recovered device
+        * is written out.
         */
-       if (!mddev->degraded)
-               list_for_each_entry(rdev, &mddev->disks, same_set)
+       rdev_for_each(rdev, mddev)
+               if (!mddev->degraded ||
+                   test_bit(In_sync, &rdev->flags))
                        rdev->saved_raid_disk = -1;
 
+       md_update_sb(mddev, 1);
        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -7447,13 +7512,13 @@ void md_check_recovery(struct mddev *mddev)
                         * failed devices.
                         */
                        struct md_rdev *rdev;
-                       list_for_each_entry(rdev, &mddev->disks, same_set)
+                       rdev_for_each(rdev, mddev)
                                if (rdev->raid_disk >= 0 &&
                                    !test_bit(Blocked, &rdev->flags) &&
                                    test_bit(Faulty, &rdev->flags) &&
                                    atomic_read(&rdev->nr_pending)==0) {
                                        if (mddev->pers->hot_remove_disk(
-                                                   mddev, rdev->raid_disk)==0) {
+                                                   mddev, rdev) == 0) {
                                                sysfs_unlink_rdev(mddev, rdev);
                                                rdev->raid_disk = -1;
                                        }
@@ -7497,14 +7562,14 @@ void md_check_recovery(struct mddev *mddev)
                 * any transients in the value of "sync_action".
                 */
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
-               clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
                /* Clear some bits that don't mean anything, but
                 * might be left set
                 */
                clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
                clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 
-               if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+               if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
+                   test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
                        goto unlock;
                /* no recovery is running.
                 * remove any failed drives, then
@@ -7835,6 +7900,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                                  s + rdev->data_offset, sectors, acknowledged);
        if (rv) {
                /* Make sure they get written out promptly */
+               sysfs_notify_dirent_safe(rdev->sysfs_state);
                set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
                md_wakeup_thread(rdev->mddev->thread);
        }
@@ -7957,7 +8023,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
                return;
        write_seqlock_irq(&bb->lock);
 
-       if (bb->changed == 0) {
+       if (bb->changed == 0 && bb->unacked_exist) {
                u64 *p = bb->page;
                int i;
                for (i = 0; i < bb->count ; i++) {
@@ -8074,30 +8140,24 @@ static int md_notify_reboot(struct notifier_block *this,
        struct mddev *mddev;
        int need_delay = 0;
 
-       if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
-
-               printk(KERN_INFO "md: stopping all md devices.\n");
-
-               for_each_mddev(mddev, tmp) {
-                       if (mddev_trylock(mddev)) {
-                               /* Force a switch to readonly even array
-                                * appears to still be in use.  Hence
-                                * the '100'.
-                                */
-                               md_set_readonly(mddev, 100);
-                               mddev_unlock(mddev);
-                       }
-                       need_delay = 1;
+       for_each_mddev(mddev, tmp) {
+               if (mddev_trylock(mddev)) {
+                       if (mddev->pers)
+                               __md_stop_writes(mddev);
+                       mddev->safemode = 2;
+                       mddev_unlock(mddev);
                }
-               /*
-                * certain more exotic SCSI devices are known to be
-                * volatile wrt too early system reboots. While the
-                * right place to handle this issue is the given
-                * driver, we do want to have a safe RAID driver ...
-                */
-               if (need_delay)
-                       mdelay(1000*1);
+               need_delay = 1;
        }
+       /*
+        * certain more exotic SCSI devices are known to be
+        * volatile wrt too early system reboots. While the
+        * right place to handle this issue is the given
+        * driver, we do want to have a safe RAID driver ...
+        */
+       if (need_delay)
+               mdelay(1000*1);
+
        return NOTIFY_DONE;
 }