/* fs/btrfs/volumes.c */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

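/*
 * uuid_mutex protects the global list of known filesystems (fs_uuids
 * below) and the fs_devices lists hanging off of it; it is taken by
 * the scan, open, close and device-removal paths in this file.
 */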
static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static void lock_chunks(struct btrfs_root *root)
{
        mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
        mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                kfree(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

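/*
 * Splice a chain of bios (head..tail) back onto the front of a pending
 * list after a partial run, keeping the original submission order.
 */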
static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
                atomic_dec(&fs_info->nr_async_bios);

                if (atomic_read(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_requeue_work(&device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

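/*
 * Note a scanned device in the in-memory list for its fsid, creating
 * the btrfs_fs_devices entry the first time an fsid is seen.  If the
 * device is already known but shows up under a new path, the stored
 * name is updated instead.
 */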
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        u64 found_transid = btrfs_super_generation(disk_super);
        char *name;

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
                if (!fs_devices)
                        return -ENOMEM;
                INIT_LIST_HEAD(&fs_devices->devices);
                INIT_LIST_HEAD(&fs_devices->alloc_list);
                list_add(&fs_devices->list, &fs_uuids);
                memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
                mutex_init(&fs_devices->device_list_mutex);
                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }
        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device) {
                        /* we can safely leave the fs_devices entry around */
                        return -ENOMEM;
                }
                device->devid = devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
                spin_lock_init(&device->io_lock);
                device->name = kstrdup(path, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        return -ENOMEM;
                }
                INIT_LIST_HEAD(&device->dev_alloc_list);

                /* init readahead state */
                spin_lock_init(&device->reada_lock);
                device->reada_curr_zone = NULL;
                atomic_set(&device->reada_in_flight, 0);
                device->reada_next = 0;
                INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
                INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                mutex_unlock(&fs_devices->device_list_mutex);

                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        } else if (!device->name || strcmp(device->name, path)) {
                name = kstrdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                kfree(device->name);
                device->name = name;
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        if (found_transid > fs_devices->latest_trans) {
                fs_devices->latest_devid = devid;
                fs_devices->latest_trans = found_transid;
        }
        *fs_devices_ret = fs_devices;
        return 0;
}

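/*
 * Make a private copy of an fs_devices list (names, devids and uuids
 * only).  The clone is never opened, so it carries no bdev references;
 * in this file it is used by btrfs_prepare_sprout to keep the seed
 * filesystem's identity registered under its old fsid.
 */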
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
        if (!fs_devices)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&fs_devices->devices);
        INIT_LIST_HEAD(&fs_devices->alloc_list);
        INIT_LIST_HEAD(&fs_devices->list);
        mutex_init(&fs_devices->device_list_mutex);
        fs_devices->latest_devid = orig->latest_devid;
        fs_devices->latest_trans = orig->latest_trans;
        memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

        /* We have held the volume lock, it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                device = kzalloc(sizeof(*device), GFP_NOFS);
                if (!device)
                        goto error;

                device->name = kstrdup(orig_dev->name, GFP_NOFS);
                if (!device->name) {
                        kfree(device);
                        goto error;
                }

                device->devid = orig_dev->devid;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
                spin_lock_init(&device->io_lock);
                INIT_LIST_HEAD(&device->dev_list);
                INIT_LIST_HEAD(&device->dev_alloc_list);

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        return fs_devices;
error:
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

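/*
 * After the chunk tree has been read, drop any scanned devices that did
 * not appear in the filesystem metadata, and remember which of the
 * remaining devices carries the newest generation.
 */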
void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *next;

        struct block_device *latest_bdev = NULL;
        u64 latest_devid = 0;
        u64 latest_transid = 0;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!latest_transid ||
                            device->generation > latest_transid) {
                                latest_devid = device->devid;
                                latest_transid = device->generation;
                                latest_bdev = device->bdev;
                        }
                        continue;
                }

                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                kfree(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;

        mutex_unlock(&uuid_mutex);
}

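/*
 * btrfs_device structs are reclaimed via call_rcu.  RCU callbacks run
 * in atomic context, while blkdev_put may sleep, so free_device only
 * queues __free_device on a workqueue to do the real teardown.
 */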
static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        kfree(device->name);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->can_discard)
                        fs_devices->num_can_discard--;

                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device); /* -ENOMEM */
                memcpy(new_device, device, sizeof(*new_device));
                new_device->name = kstrdup(device->name, GFP_NOFS);
                BUG_ON(device->name && !new_device->name); /* -ENOMEM */
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
                new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

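/*
 * Open every registered device of the filesystem, verify that its
 * super block still matches the expected devid/uuid, and track the
 * device with the highest generation.  Devices that fail to open are
 * skipped; -EINVAL is returned only if none could be opened.
 */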
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct block_device *latest_bdev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 latest_devid = 0;
        u64 latest_transid = 0;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                bdev = blkdev_get_by_path(device->name, flags, holder);
                if (IS_ERR(bdev)) {
                        printk(KERN_INFO "open %s failed\n", device->name);
                        goto error;
                }
                filemap_write_and_wait(bdev->bd_inode->i_mapping);
                invalidate_bdev(bdev);
                set_blocksize(bdev, 4096);

                bh = btrfs_read_dev_super(bdev);
                if (!bh)
                        goto error_close;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_transid || device->generation > latest_transid) {
                        latest_devid = devid;
                        latest_transid = device->generation;
                        latest_bdev = bdev;
                }

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q)) {
                        device->can_discard = 1;
                        fs_devices->num_can_discard++;
                }

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
error_close:
                blkdev_put(bdev, flags);
error:
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_bdev;
        fs_devices->latest_devid = latest_devid;
        fs_devices->latest_trans = latest_transid;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

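/*
 * Read the btrfs super block from @path and register the device in the
 * in-memory list for its fsid.  The block device is only held long
 * enough to read the super block; it is put again before returning.
 */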
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;
        u64 devid;
        u64 transid;

        flags |= FMODE_EXCL;
        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        mutex_lock(&uuid_mutex);
        ret = set_blocksize(bdev, 4096);
        if (ret)
                goto error_close;
        bh = btrfs_read_dev_super(bdev);
        if (!bh) {
                ret = -EINVAL;
                goto error_close;
        }
        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        if (disk_super->label[0])
                printk(KERN_INFO "device label %s ", disk_super->label);
        else
                printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
        printk(KERN_CONT "devid %llu transid %llu %s\n",
               (unsigned long long)devid, (unsigned long long)transid, path);
        ret = device_list_add(path, disk_super, devid, fs_devices_ret);

        brelse(bh);
error_close:
        mutex_unlock(&uuid_mutex);
        blkdev_put(bdev, flags);
error:
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of
 *              the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But if
 * we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end) {
                ret = -ENOSPC;
                goto error;
        }

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto error;
        }
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
error:
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

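/*
 * Drop the dev extent item covering @start on @device and give its
 * length back to fs_info->free_chunk_space.
 */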
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                btrfs_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }

        if (device->bytes_used > 0) {
                u64 len = btrfs_dev_extent_length(leaf, extent);
                device->bytes_used -= len;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += len;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        }
out:
        btrfs_free_path(path);
        return ret;
}

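/*
 * Insert a dev extent item mapping @num_bytes at @start on @device to
 * the chunk at (@chunk_tree, @chunk_objectid, @chunk_offset).
 */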
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                           struct btrfs_device *device,
                           u64 chunk_tree, u64 chunk_objectid,
                           u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
                    BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

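/*
 * Return in @offset the logical address just past the last existing
 * chunk for @objectid, i.e. where the next chunk may be placed.
 */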
static noinline int find_next_chunk(struct btrfs_root *root,
                                    u64 objectid, u64 *offset)
{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
        struct btrfs_chunk *chunk;
        struct btrfs_key found_key;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = objectid;
        key.offset = (u64)-1;
        key.type = BTRFS_CHUNK_ITEM_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
        if (ret) {
                *offset = 0;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                if (found_key.objectid != objectid)
                        *offset = 0;
                else {
                        chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                               struct btrfs_chunk);
                        *offset = found_key.offset +
                                btrfs_chunk_length(path->nodes[0], chunk);
                }
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

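/*
 * Pick the next free devid: one past the highest dev item offset in
 * the chunk tree, or 1 if no dev items exist yet.
 */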
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *objectid = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *objectid = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * the device information is stored in the chunk root;
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
                     struct btrfs_root *root,
                     struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
        btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = (unsigned long)btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = (unsigned long)btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

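/*
 * Delete the dev item for @device from the chunk tree, using (and
 * committing) a transaction of its own.
 */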
static int btrfs_rm_dev_item(struct btrfs_root *root,
                             struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_trans_handle *trans;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                btrfs_free_path(path);
                return PTR_ERR(trans);
        }
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
        lock_chunks(root);

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto out;

        if (ret > 0) {
                ret = -ENOENT;
                goto out;
        }

        ret = btrfs_del_item(trans, root, path);
        if (ret)
                goto out;
out:
        btrfs_free_path(path);
        unlock_chunks(root);
        btrfs_commit_transaction(trans, root);
        return ret;
}

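/*
 * Remove a device from a mounted filesystem.  @device_path may be the
 * literal string "missing" to pick the first device that is recorded
 * in the metadata but has no backing bdev.  The device is shrunk to
 * zero, its items are deleted, it is dropped from the device list and
 * (when possible) its super block magic is wiped so it is no longer
 * detected as part of the filesystem.
 */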
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
        struct btrfs_device *device;
        struct btrfs_device *next_device;
        struct block_device *bdev;
        struct buffer_head *bh = NULL;
        struct btrfs_super_block *disk_super;
        struct btrfs_fs_devices *cur_devices;
        u64 all_avail;
        u64 devid;
        u64 num_devices;
        u8 *dev_uuid;
        int ret = 0;
        bool clear_super = false;

        mutex_lock(&uuid_mutex);

        all_avail = root->fs_info->avail_data_alloc_bits |
                root->fs_info->avail_system_alloc_bits |
                root->fs_info->avail_metadata_alloc_bits;

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
            root->fs_info->fs_devices->num_devices <= 4) {
                printk(KERN_ERR "btrfs: unable to go below four devices "
                       "on raid10\n");
                ret = -EINVAL;
                goto out;
        }

        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
            root->fs_info->fs_devices->num_devices <= 2) {
                printk(KERN_ERR "btrfs: unable to go below two "
                       "devices on raid1\n");
                ret = -EINVAL;
                goto out;
        }

        if (strcmp(device_path, "missing") == 0) {
                struct list_head *devices;
                struct btrfs_device *tmp;

                device = NULL;
                devices = &root->fs_info->fs_devices->devices;
                /*
                 * It is safe to read the devices since the volume_mutex
                 * is held.
                 */
                list_for_each_entry(tmp, devices, dev_list) {
                        if (tmp->in_fs_metadata && !tmp->bdev) {
                                device = tmp;
                                break;
                        }
                }
                bdev = NULL;
                bh = NULL;
                disk_super = NULL;
                if (!device) {
                        printk(KERN_ERR "btrfs: no missing devices found to "
                               "remove\n");
                        goto out;
                }
        } else {
                bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
                                          root->fs_info->bdev_holder);
                if (IS_ERR(bdev)) {
                        ret = PTR_ERR(bdev);
                        goto out;
                }

                set_blocksize(bdev, 4096);
                invalidate_bdev(bdev);
                bh = btrfs_read_dev_super(bdev);
                if (!bh) {
                        ret = -EINVAL;
                        goto error_close;
                }
                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                dev_uuid = disk_super->dev_item.uuid;
                device = btrfs_find_device(root, devid, dev_uuid,
                                           disk_super->fsid);
                if (!device) {
                        ret = -ENOENT;
                        goto error_brelse;
                }
        }

        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
                printk(KERN_ERR "btrfs: unable to remove the only writeable "
                       "device\n");
                ret = -EINVAL;
                goto error_brelse;
        }

        if (device->writeable) {
                lock_chunks(root);
                list_del_init(&device->dev_alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices--;
                clear_super = true;
        }

        ret = btrfs_shrink_device(device, 0);
        if (ret)
                goto error_undo;

        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
        if (ret)
                goto error_undo;

        spin_lock(&root->fs_info->free_chunk_lock);
        root->fs_info->free_chunk_space = device->total_bytes -
                device->bytes_used;
        spin_unlock(&root->fs_info->free_chunk_lock);

        device->in_fs_metadata = 0;
        btrfs_scrub_cancel_dev(root, device);

        /*
         * the device list mutex makes sure that we don't change
         * the device list while someone else is writing out all
         * the device supers.
         */

        cur_devices = device->fs_devices;
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_del_rcu(&device->dev_list);

        device->fs_devices->num_devices--;

        if (device->missing)
                root->fs_info->fs_devices->missing_devices--;

        next_device = list_entry(root->fs_info->fs_devices->devices.next,
                                 struct btrfs_device, dev_list);
        if (device->bdev == root->fs_info->sb->s_bdev)
                root->fs_info->sb->s_bdev = next_device->bdev;
        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
                root->fs_info->fs_devices->latest_bdev = next_device->bdev;

        if (device->bdev)
                device->fs_devices->open_devices--;

        call_rcu(&device->rcu, free_device);
        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
        btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

        if (cur_devices->open_devices == 0) {
                struct btrfs_fs_devices *fs_devices;
                fs_devices = root->fs_info->fs_devices;
                while (fs_devices) {
                        if (fs_devices->seed == cur_devices)
                                break;
                        fs_devices = fs_devices->seed;
                }
                fs_devices->seed = cur_devices->seed;
                cur_devices->seed = NULL;
                lock_chunks(root);
                __btrfs_close_devices(cur_devices);
                unlock_chunks(root);
                free_fs_devices(cur_devices);
        }

        /*
         * at this point, the device is zero sized.  We want to
         * remove it from the devices list and zero out the old super
         */
        if (clear_super) {
                /* make sure this device isn't detected as part of
                 * the FS anymore
                 */
                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
                set_buffer_dirty(bh);
                sync_dirty_buffer(bh);
        }

        ret = 0;

error_brelse:
        brelse(bh);
error_close:
        if (bdev)
                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
        mutex_unlock(&uuid_mutex);
        return ret;
error_undo:
        if (device->writeable) {
                lock_chunks(root);
                list_add(&device->dev_alloc_list,
                         &root->fs_info->fs_devices->alloc_list);
                unlock_chunks(root);
                root->fs_info->fs_devices->rw_devices++;
        }
        goto error_brelse;
}

1488 /*
1489  * Does all the dirty work required for changing the file system's UUID:
 * the current devices are moved to a seed fs_devices struct and the file
 * system gets a freshly generated fsid.
1490  */
1491 static int btrfs_prepare_sprout(struct btrfs_root *root)
1492 {
1493         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1494         struct btrfs_fs_devices *old_devices;
1495         struct btrfs_fs_devices *seed_devices;
1496         struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1497         struct btrfs_device *device;
1498         u64 super_flags;
1499
1500         BUG_ON(!mutex_is_locked(&uuid_mutex));
1501         if (!fs_devices->seeding)
1502                 return -EINVAL;
1503
1504         seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1505         if (!seed_devices)
1506                 return -ENOMEM;
1507
1508         old_devices = clone_fs_devices(fs_devices);
1509         if (IS_ERR(old_devices)) {
1510                 kfree(seed_devices);
1511                 return PTR_ERR(old_devices);
1512         }
1513
1514         list_add(&old_devices->list, &fs_uuids);
1515
1516         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1517         seed_devices->opened = 1;
1518         INIT_LIST_HEAD(&seed_devices->devices);
1519         INIT_LIST_HEAD(&seed_devices->alloc_list);
1520         mutex_init(&seed_devices->device_list_mutex);
1521
1522         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1523         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1524                               synchronize_rcu);
1525         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1526
1527         list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1528         list_for_each_entry(device, &seed_devices->devices, dev_list) {
1529                 device->fs_devices = seed_devices;
1530         }
1531
1532         fs_devices->seeding = 0;
1533         fs_devices->num_devices = 0;
1534         fs_devices->open_devices = 0;
1535         fs_devices->seed = seed_devices;
1536
1537         generate_random_uuid(fs_devices->fsid);
1538         memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1539         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1540         super_flags = btrfs_super_flags(disk_super) &
1541                       ~BTRFS_SUPER_FLAG_SEEDING;
1542         btrfs_set_super_flags(disk_super, super_flags);
1543
1544         return 0;
1545 }
1546
1547 /*
1548  * Store the expected generation for seed devices in device items.
1549  */
1550 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1551                                struct btrfs_root *root)
1552 {
1553         struct btrfs_path *path;
1554         struct extent_buffer *leaf;
1555         struct btrfs_dev_item *dev_item;
1556         struct btrfs_device *device;
1557         struct btrfs_key key;
1558         u8 fs_uuid[BTRFS_UUID_SIZE];
1559         u8 dev_uuid[BTRFS_UUID_SIZE];
1560         u64 devid;
1561         int ret;
1562
1563         path = btrfs_alloc_path();
1564         if (!path)
1565                 return -ENOMEM;
1566
1567         root = root->fs_info->chunk_root;
1568         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1569         key.offset = 0;
1570         key.type = BTRFS_DEV_ITEM_KEY;
1571
1572         while (1) {
1573                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1574                 if (ret < 0)
1575                         goto error;
1576
1577                 leaf = path->nodes[0];
1578 next_slot:
1579                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1580                         ret = btrfs_next_leaf(root, path);
1581                         if (ret > 0)
1582                                 break;
1583                         if (ret < 0)
1584                                 goto error;
1585                         leaf = path->nodes[0];
1586                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1587                         btrfs_release_path(path);
1588                         continue;
1589                 }
1590
1591                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1592                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1593                     key.type != BTRFS_DEV_ITEM_KEY)
1594                         break;
1595
1596                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1597                                           struct btrfs_dev_item);
1598                 devid = btrfs_device_id(leaf, dev_item);
1599                 read_extent_buffer(leaf, dev_uuid,
1600                                    (unsigned long)btrfs_device_uuid(dev_item),
1601                                    BTRFS_UUID_SIZE);
1602                 read_extent_buffer(leaf, fs_uuid,
1603                                    (unsigned long)btrfs_device_fsid(dev_item),
1604                                    BTRFS_UUID_SIZE);
1605                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1606                 BUG_ON(!device); /* Logic error */
1607
1608                 if (device->fs_devices->seeding) {
1609                         btrfs_set_device_generation(leaf, dev_item,
1610                                                     device->generation);
1611                         btrfs_mark_buffer_dirty(leaf);
1612                 }
1613
1614                 path->slots[0]++;
1615                 goto next_slot;
1616         }
1617         ret = 0;
1618 error:
1619         btrfs_free_path(path);
1620         return ret;
1621 }
1622
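/*
 * Add a new device to a mounted file system: open the block device
 * exclusively, set up the btrfs_device, wire it into the in-memory
 * device lists and superblock counters, and persist it in a
 * transaction.  If the mounted file system is a seed, sprout a new
 * writable file system on top of it first.
 */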
1623 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1624 {
1625         struct request_queue *q;
1626         struct btrfs_trans_handle *trans;
1627         struct btrfs_device *device;
1628         struct block_device *bdev;
1629         struct list_head *devices;
1630         struct super_block *sb = root->fs_info->sb;
1631         u64 total_bytes;
1632         int seeding_dev = 0;
1633         int ret = 0;
1634
1635         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1636                 return -EINVAL;
1637
1638         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1639                                   root->fs_info->bdev_holder);
1640         if (IS_ERR(bdev))
1641                 return PTR_ERR(bdev);
1642
1643         if (root->fs_info->fs_devices->seeding) {
1644                 seeding_dev = 1;
1645                 down_write(&sb->s_umount);
1646                 mutex_lock(&uuid_mutex);
1647         }
1648
1649         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1650
1651         devices = &root->fs_info->fs_devices->devices;
1652         /*
1653          * we have the volume lock, so we don't need the extra
1654          * device list mutex while reading the list here.
1655          */
1656         list_for_each_entry(device, devices, dev_list) {
1657                 if (device->bdev == bdev) {
1658                         ret = -EEXIST;
1659                         goto error;
1660                 }
1661         }
1662
1663         device = kzalloc(sizeof(*device), GFP_NOFS);
1664         if (!device) {
1665                 /* we can safely leave the fs_devices entry around */
1666                 ret = -ENOMEM;
1667                 goto error;
1668         }
1669
1670         device->name = kstrdup(device_path, GFP_NOFS);
1671         if (!device->name) {
1672                 kfree(device);
1673                 ret = -ENOMEM;
1674                 goto error;
1675         }
1676
1677         ret = find_next_devid(root, &device->devid);
1678         if (ret) {
1679                 kfree(device->name);
1680                 kfree(device);
1681                 goto error;
1682         }
1683
1684         trans = btrfs_start_transaction(root, 0);
1685         if (IS_ERR(trans)) {
1686                 kfree(device->name);
1687                 kfree(device);
1688                 ret = PTR_ERR(trans);
1689                 goto error;
1690         }
1691
1692         lock_chunks(root);
1693
1694         q = bdev_get_queue(bdev);
1695         if (blk_queue_discard(q))
1696                 device->can_discard = 1;
1697         device->writeable = 1;
1698         device->work.func = pending_bios_fn;
1699         generate_random_uuid(device->uuid);
1700         spin_lock_init(&device->io_lock);
1701         device->generation = trans->transid;
1702         device->io_width = root->sectorsize;
1703         device->io_align = root->sectorsize;
1704         device->sector_size = root->sectorsize;
1705         device->total_bytes = i_size_read(bdev->bd_inode);
1706         device->disk_total_bytes = device->total_bytes;
1707         device->dev_root = root->fs_info->dev_root;
1708         device->bdev = bdev;
1709         device->in_fs_metadata = 1;
1710         device->mode = FMODE_EXCL;
1711         set_blocksize(device->bdev, 4096);
1712
1713         if (seeding_dev) {
1714                 sb->s_flags &= ~MS_RDONLY;
1715                 ret = btrfs_prepare_sprout(root);
1716                 BUG_ON(ret); /* -ENOMEM */
1717         }
1718
1719         device->fs_devices = root->fs_info->fs_devices;
1720
1721         /*
1722          * we don't want write_supers to jump in here with our device
1723          * half set up
1724          */
1725         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1726         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1727         list_add(&device->dev_alloc_list,
1728                  &root->fs_info->fs_devices->alloc_list);
1729         root->fs_info->fs_devices->num_devices++;
1730         root->fs_info->fs_devices->open_devices++;
1731         root->fs_info->fs_devices->rw_devices++;
1732         if (device->can_discard)
1733                 root->fs_info->fs_devices->num_can_discard++;
1734         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1735
1736         spin_lock(&root->fs_info->free_chunk_lock);
1737         root->fs_info->free_chunk_space += device->total_bytes;
1738         spin_unlock(&root->fs_info->free_chunk_lock);
1739
1740         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1741                 root->fs_info->fs_devices->rotating = 1;
1742
1743         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1744         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1745                                     total_bytes + device->total_bytes);
1746
1747         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1748         btrfs_set_super_num_devices(root->fs_info->super_copy,
1749                                     total_bytes + 1);
1750         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1751
1752         if (seeding_dev) {
1753                 ret = init_first_rw_device(trans, root, device);
1754                 if (ret)
1755                         goto error_trans;
1756                 ret = btrfs_finish_sprout(trans, root);
1757                 if (ret)
1758                         goto error_trans;
1759         } else {
1760                 ret = btrfs_add_device(trans, root, device);
1761                 if (ret)
1762                         goto error_trans;
1763         }
1764
1765         /*
1766          * we've got more storage, clear any full flags on the space
1767          * infos
1768          */
1769         btrfs_clear_space_info_full(root->fs_info);
1770
1771         unlock_chunks(root);
1772         ret = btrfs_commit_transaction(trans, root);
1773
1774         if (seeding_dev) {
1775                 mutex_unlock(&uuid_mutex);
1776                 up_write(&sb->s_umount);
1777
1778                 if (ret) /* transaction commit */
1779                         return ret;
1780
1781                 ret = btrfs_relocate_sys_chunks(root);
1782                 if (ret < 0)
1783                         btrfs_error(root->fs_info, ret,
1784                                     "Failed to relocate sys chunks after "
1785                                     "device initialization. This can be fixed "
1786                                     "using the \"btrfs balance\" command.");
1787         }
1788
1789         return ret;
1790
1791 error_trans:
1792         unlock_chunks(root);
1793         btrfs_abort_transaction(trans, root, ret);
1794         btrfs_end_transaction(trans, root);
1795         kfree(device->name);
1796         kfree(device);
1797 error:
1798         blkdev_put(bdev, FMODE_EXCL);
1799         if (seeding_dev) {
1800                 mutex_unlock(&uuid_mutex);
1801                 up_write(&sb->s_umount);
1802         }
1803         return ret;
1804 }
1805
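/*
 * Write the in-memory state of @device back into its dev item in the
 * chunk tree.  Returns -ENOENT if the item cannot be found.
 */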
1806 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1807                                         struct btrfs_device *device)
1808 {
1809         int ret;
1810         struct btrfs_path *path;
1811         struct btrfs_root *root;
1812         struct btrfs_dev_item *dev_item;
1813         struct extent_buffer *leaf;
1814         struct btrfs_key key;
1815
1816         root = device->dev_root->fs_info->chunk_root;
1817
1818         path = btrfs_alloc_path();
1819         if (!path)
1820                 return -ENOMEM;
1821
1822         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1823         key.type = BTRFS_DEV_ITEM_KEY;
1824         key.offset = device->devid;
1825
1826         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1827         if (ret < 0)
1828                 goto out;
1829
1830         if (ret > 0) {
1831                 ret = -ENOENT;
1832                 goto out;
1833         }
1834
1835         leaf = path->nodes[0];
1836         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1837
1838         btrfs_set_device_id(leaf, dev_item, device->devid);
1839         btrfs_set_device_type(leaf, dev_item, device->type);
1840         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1841         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1842         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1843         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1844         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1845         btrfs_mark_buffer_dirty(leaf);
1846
1847 out:
1848         btrfs_free_path(path);
1849         return ret;
1850 }
1851
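/*
 * Grow @device to @new_size, adjusting the superblock total and the
 * total_rw_bytes counter.  Requires the chunk mutex; btrfs_grow_device()
 * below is the wrapper that takes it.
 */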
1852 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1853                       struct btrfs_device *device, u64 new_size)
1854 {
1855         struct btrfs_super_block *super_copy =
1856                 device->dev_root->fs_info->super_copy;
1857         u64 old_total = btrfs_super_total_bytes(super_copy);
1858         u64 diff = new_size - device->total_bytes;
1859
1860         if (!device->writeable)
1861                 return -EACCES;
1862         if (new_size <= device->total_bytes)
1863                 return -EINVAL;
1864
1865         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1866         device->fs_devices->total_rw_bytes += diff;
1867
1868         device->total_bytes = new_size;
1869         device->disk_total_bytes = new_size;
1870         btrfs_clear_space_info_full(device->dev_root->fs_info);
1871
1872         return btrfs_update_device(trans, device);
1873 }
1874
1875 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1876                       struct btrfs_device *device, u64 new_size)
1877 {
1878         int ret;
1879         lock_chunks(device->dev_root);
1880         ret = __btrfs_grow_device(trans, device, new_size);
1881         unlock_chunks(device->dev_root);
1882         return ret;
1883 }
1884
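/*
 * Delete the chunk item for @chunk_offset from the chunk tree.  A failed
 * lookup here means the chunk tree disagrees with the extent mapping and
 * is treated as a logic error or corruption.
 */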
1885 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1886                             struct btrfs_root *root,
1887                             u64 chunk_tree, u64 chunk_objectid,
1888                             u64 chunk_offset)
1889 {
1890         int ret;
1891         struct btrfs_path *path;
1892         struct btrfs_key key;
1893
1894         root = root->fs_info->chunk_root;
1895         path = btrfs_alloc_path();
1896         if (!path)
1897                 return -ENOMEM;
1898
1899         key.objectid = chunk_objectid;
1900         key.offset = chunk_offset;
1901         key.type = BTRFS_CHUNK_ITEM_KEY;
1902
1903         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1904         if (ret < 0)
1905                 goto out;
1906         else if (ret > 0) { /* Logic error or corruption */
1907                 btrfs_error(root->fs_info, -ENOENT,
1908                             "Failed lookup while freeing chunk.");
1909                 ret = -ENOENT;
1910                 goto out;
1911         }
1912
1913         ret = btrfs_del_item(trans, root, path);
1914         if (ret < 0)
1915                 btrfs_error(root->fs_info, ret,
1916                             "Failed to delete chunk item.");
1917 out:
1918         btrfs_free_path(path);
1919         return ret;
1920 }
1921
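/*
 * Remove the copy of a system chunk from the sys_chunk_array embedded in
 * the superblock by shifting the tail of the array over it and shrinking
 * the recorded array size.
 */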
1922 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1923                         u64 chunk_offset)
1924 {
1925         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1926         struct btrfs_disk_key *disk_key;
1927         struct btrfs_chunk *chunk;
1928         u8 *ptr;
1929         int ret = 0;
1930         u32 num_stripes;
1931         u32 array_size;
1932         u32 len = 0;
1933         u32 cur;
1934         struct btrfs_key key;
1935
1936         array_size = btrfs_super_sys_array_size(super_copy);
1937
1938         ptr = super_copy->sys_chunk_array;
1939         cur = 0;
1940
1941         while (cur < array_size) {
1942                 disk_key = (struct btrfs_disk_key *)ptr;
1943                 btrfs_disk_key_to_cpu(&key, disk_key);
1944
1945                 len = sizeof(*disk_key);
1946
1947                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1948                         chunk = (struct btrfs_chunk *)(ptr + len);
1949                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1950                         len += btrfs_chunk_item_size(num_stripes);
1951                 } else {
1952                         ret = -EIO;
1953                         break;
1954                 }
1955                 if (key.objectid == chunk_objectid &&
1956                     key.offset == chunk_offset) {
1957                         memmove(ptr, ptr + len, array_size - (cur + len));
1958                         array_size -= len;
1959                         btrfs_set_super_sys_array_size(super_copy, array_size);
1960                 } else {
1961                         ptr += len;
1962                         cur += len;
1963                 }
1964         }
1965         return ret;
1966 }
1967
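/*
 * Relocate and delete one chunk: move the extents it contains elsewhere,
 * then drop its device extents, the chunk item (plus the sys_chunk_array
 * copy for system chunks), the block group and the extent mapping.
 */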
1968 static int btrfs_relocate_chunk(struct btrfs_root *root,
1969                          u64 chunk_tree, u64 chunk_objectid,
1970                          u64 chunk_offset)
1971 {
1972         struct extent_map_tree *em_tree;
1973         struct btrfs_root *extent_root;
1974         struct btrfs_trans_handle *trans;
1975         struct extent_map *em;
1976         struct map_lookup *map;
1977         int ret;
1978         int i;
1979
1980         root = root->fs_info->chunk_root;
1981         extent_root = root->fs_info->extent_root;
1982         em_tree = &root->fs_info->mapping_tree.map_tree;
1983
1984         ret = btrfs_can_relocate(extent_root, chunk_offset);
1985         if (ret)
1986                 return -ENOSPC;
1987
1988         /* step one, relocate all the extents inside this chunk */
1989         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1990         if (ret)
1991                 return ret;
1992
1993         trans = btrfs_start_transaction(root, 0);
1994         BUG_ON(IS_ERR(trans));
1995
1996         lock_chunks(root);
1997
1998         /*
1999          * step two, delete the device extents and the
2000          * chunk tree entries
2001          */
2002         read_lock(&em_tree->lock);
2003         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2004         read_unlock(&em_tree->lock);
2005
2006         BUG_ON(!em || em->start > chunk_offset ||
2007                em->start + em->len < chunk_offset);
2008         map = (struct map_lookup *)em->bdev;
2009
2010         for (i = 0; i < map->num_stripes; i++) {
2011                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
2012                                             map->stripes[i].physical);
2013                 BUG_ON(ret);
2014
2015                 if (map->stripes[i].dev) {
2016                         ret = btrfs_update_device(trans, map->stripes[i].dev);
2017                         BUG_ON(ret);
2018                 }
2019         }
2020         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2021                                chunk_offset);
2022
2023         BUG_ON(ret);
2024
2025         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2026
2027         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2028                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2029                 BUG_ON(ret);
2030         }
2031
2032         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2033         BUG_ON(ret);
2034
2035         write_lock(&em_tree->lock);
2036         remove_extent_mapping(em_tree, em);
2037         write_unlock(&em_tree->lock);
2038
2039         kfree(map);
2040         em->bdev = NULL;
2041
2042         /* once for the tree */
2043         free_extent_map(em);
2044         /* once for us */
2045         free_extent_map(em);
2046
2047         unlock_chunks(root);
2048         btrfs_end_transaction(trans, root);
2049         return 0;
2050 }
2051
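/*
 * Walk the chunk tree from the highest offset down and relocate every
 * system chunk.  -ENOSPC failures are retried once after a full pass,
 * since relocating other chunks may have freed up space.
 */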
2052 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2053 {
2054         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2055         struct btrfs_path *path;
2056         struct extent_buffer *leaf;
2057         struct btrfs_chunk *chunk;
2058         struct btrfs_key key;
2059         struct btrfs_key found_key;
2060         u64 chunk_tree = chunk_root->root_key.objectid;
2061         u64 chunk_type;
2062         bool retried = false;
2063         int failed = 0;
2064         int ret;
2065
2066         path = btrfs_alloc_path();
2067         if (!path)
2068                 return -ENOMEM;
2069
2070 again:
2071         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2072         key.offset = (u64)-1;
2073         key.type = BTRFS_CHUNK_ITEM_KEY;
2074
2075         while (1) {
2076                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2077                 if (ret < 0)
2078                         goto error;
2079                 BUG_ON(ret == 0); /* Corruption */
2080
2081                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2082                                           key.type);
2083                 if (ret < 0)
2084                         goto error;
2085                 if (ret > 0)
2086                         break;
2087
2088                 leaf = path->nodes[0];
2089                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2090
2091                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2092                                        struct btrfs_chunk);
2093                 chunk_type = btrfs_chunk_type(leaf, chunk);
2094                 btrfs_release_path(path);
2095
2096                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2097                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2098                                                    found_key.objectid,
2099                                                    found_key.offset);
2100                         if (ret == -ENOSPC)
2101                                 failed++;
2102                         else if (ret)
2103                                 BUG();
2104                 }
2105
2106                 if (found_key.offset == 0)
2107                         break;
2108                 key.offset = found_key.offset - 1;
2109         }
2110         ret = 0;
2111         if (failed && !retried) {
2112                 failed = 0;
2113                 retried = true;
2114                 goto again;
2115         } else if (failed && retried) {
2116                 WARN_ON(1);
2117                 ret = -ENOSPC;
2118         }
2119 error:
2120         btrfs_free_path(path);
2121         return ret;
2122 }
2123
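/*
 * Persist the balance state: write @bctl into the single balance item
 * (objectid BTRFS_BALANCE_OBJECTID) in the tree root so an interrupted
 * balance can be resumed after a remount.
 */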
2124 static int insert_balance_item(struct btrfs_root *root,
2125                                struct btrfs_balance_control *bctl)
2126 {
2127         struct btrfs_trans_handle *trans;
2128         struct btrfs_balance_item *item;
2129         struct btrfs_disk_balance_args disk_bargs;
2130         struct btrfs_path *path;
2131         struct extent_buffer *leaf;
2132         struct btrfs_key key;
2133         int ret, err;
2134
2135         path = btrfs_alloc_path();
2136         if (!path)
2137                 return -ENOMEM;
2138
2139         trans = btrfs_start_transaction(root, 0);
2140         if (IS_ERR(trans)) {
2141                 btrfs_free_path(path);
2142                 return PTR_ERR(trans);
2143         }
2144
2145         key.objectid = BTRFS_BALANCE_OBJECTID;
2146         key.type = BTRFS_BALANCE_ITEM_KEY;
2147         key.offset = 0;
2148
2149         ret = btrfs_insert_empty_item(trans, root, path, &key,
2150                                       sizeof(*item));
2151         if (ret)
2152                 goto out;
2153
2154         leaf = path->nodes[0];
2155         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2156
2157         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2158
2159         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2160         btrfs_set_balance_data(leaf, item, &disk_bargs);
2161         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2162         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2163         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2164         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2165
2166         btrfs_set_balance_flags(leaf, item, bctl->flags);
2167
2168         btrfs_mark_buffer_dirty(leaf);
2169 out:
2170         btrfs_free_path(path);
2171         err = btrfs_commit_transaction(trans, root);
2172         if (err && !ret)
2173                 ret = err;
2174         return ret;
2175 }
2176
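/* Delete the balance item written by insert_balance_item() above. */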
2177 static int del_balance_item(struct btrfs_root *root)
2178 {
2179         struct btrfs_trans_handle *trans;
2180         struct btrfs_path *path;
2181         struct btrfs_key key;
2182         int ret, err;
2183
2184         path = btrfs_alloc_path();
2185         if (!path)
2186                 return -ENOMEM;
2187
2188         trans = btrfs_start_transaction(root, 0);
2189         if (IS_ERR(trans)) {
2190                 btrfs_free_path(path);
2191                 return PTR_ERR(trans);
2192         }
2193
2194         key.objectid = BTRFS_BALANCE_OBJECTID;
2195         key.type = BTRFS_BALANCE_ITEM_KEY;
2196         key.offset = 0;
2197
2198         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2199         if (ret < 0)
2200                 goto out;
2201         if (ret > 0) {
2202                 ret = -ENOENT;
2203                 goto out;
2204         }
2205
2206         ret = btrfs_del_item(trans, root, path);
2207 out:
2208         btrfs_free_path(path);
2209         err = btrfs_commit_transaction(trans, root);
2210         if (err && !ret)
2211                 ret = err;
2212         return ret;
2213 }
2214
2215 /*
2216  * This is a heuristic used to reduce the number of chunks balanced on
2217  * resume after balance was interrupted.
2218  */
2219 static void update_balance_args(struct btrfs_balance_control *bctl)
2220 {
2221         /*
2222          * Turn on soft mode for chunk types that were being converted.
2223          */
2224         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2225                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2226         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2227                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2228         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2229                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2230
2231         /*
2232  * Turn on the usage filter if it is not already in use.  The idea is
2233          * that chunks that we have already balanced should be
2234          * reasonably full.  Don't do it for chunks that are being
2235          * converted - that will keep us from relocating unconverted
2236          * (albeit full) chunks.
2237          */
2238         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2239             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2240                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2241                 bctl->data.usage = 90;
2242         }
2243         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2244             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2245                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2246                 bctl->sys.usage = 90;
2247         }
2248         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2249             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2250                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2251                 bctl->meta.usage = 90;
2252         }
2253 }
2254
2255 /*
2256  * Should be called with both balance and volume mutexes held to
2257  * serialize other volume operations (add_dev/rm_dev/resize) with
2258  * restriper.  Same goes for unset_balance_control.
2259  */
2260 static void set_balance_control(struct btrfs_balance_control *bctl)
2261 {
2262         struct btrfs_fs_info *fs_info = bctl->fs_info;
2263
2264         BUG_ON(fs_info->balance_ctl);
2265
2266         spin_lock(&fs_info->balance_lock);
2267         fs_info->balance_ctl = bctl;
2268         spin_unlock(&fs_info->balance_lock);
2269 }
2270
2271 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2272 {
2273         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2274
2275         BUG_ON(!fs_info->balance_ctl);
2276
2277         spin_lock(&fs_info->balance_lock);
2278         fs_info->balance_ctl = NULL;
2279         spin_unlock(&fs_info->balance_lock);
2280
2281         kfree(bctl);
2282 }
2283
2284 /*
2285  * Balance filters.  Return 1 if chunk should be filtered out
2286  * (should not be balanced).
2287  */
2288 static int chunk_profiles_filter(u64 chunk_type,
2289                                  struct btrfs_balance_args *bargs)
2290 {
2291         chunk_type = chunk_to_extended(chunk_type) &
2292                                 BTRFS_EXTENDED_PROFILE_MASK;
2293
2294         if (bargs->profiles & chunk_type)
2295                 return 0;
2296
2297         return 1;
2298 }
2299
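/*
 * Return @factor percent of @num, e.g. div_factor_fine(1000, 90) == 900.
 * do_div() is used because plain 64-bit division is not available on
 * all 32-bit targets.
 */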
2300 static u64 div_factor_fine(u64 num, int factor)
2301 {
2302         if (factor <= 0)
2303                 return 0;
2304         if (factor >= 100)
2305                 return num;
2306
2307         num *= factor;
2308         do_div(num, 100);
2309         return num;
2310 }
2311
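/*
 * Usage filter: keep (return 0 for) chunks whose used bytes are below
 * bargs->usage percent of the chunk length, i.e. only balance chunks
 * that are less than bargs->usage percent full.
 */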
2312 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2313                               struct btrfs_balance_args *bargs)
2314 {
2315         struct btrfs_block_group_cache *cache;
2316         u64 chunk_used, user_thresh;
2317         int ret = 1;
2318
2319         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2320         chunk_used = btrfs_block_group_used(&cache->item);
2321
2322         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2323         if (chunk_used < user_thresh)
2324                 ret = 0;
2325
2326         btrfs_put_block_group(cache);
2327         return ret;
2328 }
2329
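/* Keep (return 0 for) chunks that have at least one stripe on bargs->devid. */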
2330 static int chunk_devid_filter(struct extent_buffer *leaf,
2331                               struct btrfs_chunk *chunk,
2332                               struct btrfs_balance_args *bargs)
2333 {
2334         struct btrfs_stripe *stripe;
2335         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2336         int i;
2337
2338         for (i = 0; i < num_stripes; i++) {
2339                 stripe = btrfs_stripe_nr(chunk, i);
2340                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2341                         return 0;
2342         }
2343
2344         return 1;
2345 }
2346
2347 /* [pstart, pend): half-open physical byte range on device bargs->devid */
2348 static int chunk_drange_filter(struct extent_buffer *leaf,
2349                                struct btrfs_chunk *chunk,
2350                                u64 chunk_offset,
2351                                struct btrfs_balance_args *bargs)
2352 {
2353         struct btrfs_stripe *stripe;
2354         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2355         u64 stripe_offset;
2356         u64 stripe_length;
2357         int factor;
2358         int i;
2359
2360         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2361                 return 0;
2362
2363         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2364              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2365                 factor = 2;
2366         else
2367                 factor = 1;
2368         factor = num_stripes / factor;
2369
2370         for (i = 0; i < num_stripes; i++) {
2371                 stripe = btrfs_stripe_nr(chunk, i);
2372                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2373                         continue;
2374
2375                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2376                 stripe_length = btrfs_chunk_length(leaf, chunk);
2377                 do_div(stripe_length, factor);
2378
2379                 if (stripe_offset < bargs->pend &&
2380                     stripe_offset + stripe_length > bargs->pstart)
2381                         return 0;
2382         }
2383
2384         return 1;
2385 }
2386
2387 /* [vstart, vend): half-open logical (virtual) byte range */
2388 static int chunk_vrange_filter(struct extent_buffer *leaf,
2389                                struct btrfs_chunk *chunk,
2390                                u64 chunk_offset,
2391                                struct btrfs_balance_args *bargs)
2392 {
2393         if (chunk_offset < bargs->vend &&
2394             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2395                 /* at least part of the chunk is inside this vrange */
2396                 return 0;
2397
2398         return 1;
2399 }
2400
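/*
 * Soft mode: filter out chunks that already carry the conversion target
 * profile, so a resumed conversion does not redo finished work.
 */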
2401 static int chunk_soft_convert_filter(u64 chunk_type,
2402                                      struct btrfs_balance_args *bargs)
2403 {
2404         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2405                 return 0;
2406
2407         chunk_type = chunk_to_extended(chunk_type) &
2408                                 BTRFS_EXTENDED_PROFILE_MASK;
2409
2410         if (bargs->target == chunk_type)
2411                 return 1;
2412
2413         return 0;
2414 }
2415
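/*
 * Apply the configured balance filters to one chunk.  Returns 1 if the
 * chunk should be relocated, 0 if any enabled filter rejects it.
 */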
2416 static int should_balance_chunk(struct btrfs_root *root,
2417                                 struct extent_buffer *leaf,
2418                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2419 {
2420         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2421         struct btrfs_balance_args *bargs = NULL;
2422         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2423
2424         /* type filter */
2425         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2426               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2427                 return 0;
2428         }
2429
2430         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2431                 bargs = &bctl->data;
2432         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2433                 bargs = &bctl->sys;
2434         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2435                 bargs = &bctl->meta;
2436
2437         /* profiles filter */
2438         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2439             chunk_profiles_filter(chunk_type, bargs)) {
2440                 return 0;
2441         }
2442
2443         /* usage filter */
2444         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2445             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2446                 return 0;
2447         }
2448
2449         /* devid filter */
2450         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2451             chunk_devid_filter(leaf, chunk, bargs)) {
2452                 return 0;
2453         }
2454
2455         /* drange filter, makes sense only with devid filter */
2456         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2457             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2458                 return 0;
2459         }
2460
2461         /* vrange filter */
2462         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2463             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2464                 return 0;
2465         }
2466
2467         /* soft profile changing mode */
2468         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2469             chunk_soft_convert_filter(chunk_type, bargs)) {
2470                 return 0;
2471         }
2472
2473         return 1;
2474 }
2475
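/* Return @factor tenths of @num, e.g. div_factor(100, 3) == 30. */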
2476 static u64 div_factor(u64 num, int factor)
2477 {
2478         if (factor == 10)
2479                 return num;
2480         num *= factor;
2481         do_div(num, 10);
2482         return num;
2483 }
2484
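/*
 * The balance main loop.  Pass one makes room by shrinking each almost
 * fully allocated, writable device by up to 1MB and growing it back.
 * The chunk tree is then walked twice from the highest offset down: a
 * counting pass that only tallies the chunks matching the filters, and
 * a second pass that actually relocates them.
 */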
2485 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2486 {
2487         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2488         struct btrfs_root *chunk_root = fs_info->chunk_root;
2489         struct btrfs_root *dev_root = fs_info->dev_root;
2490         struct list_head *devices;
2491         struct btrfs_device *device;
2492         u64 old_size;
2493         u64 size_to_free;
2494         struct btrfs_chunk *chunk;
2495         struct btrfs_path *path;
2496         struct btrfs_key key;
2497         struct btrfs_key found_key;
2498         struct btrfs_trans_handle *trans;
2499         struct extent_buffer *leaf;
2500         int slot;
2501         int ret;
2502         int enospc_errors = 0;
2503         bool counting = true;
2504
2505         /* step one, make some room on all the devices */
2506         devices = &fs_info->fs_devices->devices;
2507         list_for_each_entry(device, devices, dev_list) {
2508                 old_size = device->total_bytes;
2509                 size_to_free = div_factor(old_size, 1);
2510                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2511                 if (!device->writeable ||
2512                     device->total_bytes - device->bytes_used > size_to_free)
2513                         continue;
2514
2515                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2516                 if (ret == -ENOSPC)
2517                         break;
2518                 BUG_ON(ret);
2519
2520                 trans = btrfs_start_transaction(dev_root, 0);
2521                 BUG_ON(IS_ERR(trans));
2522
2523                 ret = btrfs_grow_device(trans, device, old_size);
2524                 BUG_ON(ret);
2525
2526                 btrfs_end_transaction(trans, dev_root);
2527         }
2528
2529         /* step two, relocate all the chunks */
2530         path = btrfs_alloc_path();
2531         if (!path) {
2532                 ret = -ENOMEM;
2533                 goto error;
2534         }
2535
2536         /* zero out stat counters */
2537         spin_lock(&fs_info->balance_lock);
2538         memset(&bctl->stat, 0, sizeof(bctl->stat));
2539         spin_unlock(&fs_info->balance_lock);
2540 again:
2541         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2542         key.offset = (u64)-1;
2543         key.type = BTRFS_CHUNK_ITEM_KEY;
2544
2545         while (1) {
2546                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2547                     atomic_read(&fs_info->balance_cancel_req)) {
2548                         ret = -ECANCELED;
2549                         goto error;
2550                 }
2551
2552                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2553                 if (ret < 0)
2554                         goto error;
2555
2556                 /*
2557                  * this shouldn't happen; it means the last relocate
2558                  * failed.
2559                  */
2560                 if (ret == 0)
2561                         BUG(); /* FIXME break ? */
2562
2563                 ret = btrfs_previous_item(chunk_root, path, 0,
2564                                           BTRFS_CHUNK_ITEM_KEY);
2565                 if (ret) {
2566                         ret = 0;
2567                         break;
2568                 }
2569
2570                 leaf = path->nodes[0];
2571                 slot = path->slots[0];
2572                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2573
2574                 if (found_key.objectid != key.objectid)
2575                         break;
2576
2577                 /* chunk zero is special */
2578                 if (found_key.offset == 0)
2579                         break;
2580
2581                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2582
2583                 if (!counting) {
2584                         spin_lock(&fs_info->balance_lock);
2585                         bctl->stat.considered++;
2586                         spin_unlock(&fs_info->balance_lock);
2587                 }
2588
2589                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2590                                            found_key.offset);
2591                 btrfs_release_path(path);
2592                 if (!ret)
2593                         goto loop;
2594
2595                 if (counting) {
2596                         spin_lock(&fs_info->balance_lock);
2597                         bctl->stat.expected++;
2598                         spin_unlock(&fs_info->balance_lock);
2599                         goto loop;
2600                 }
2601
2602                 ret = btrfs_relocate_chunk(chunk_root,
2603                                            chunk_root->root_key.objectid,
2604                                            found_key.objectid,
2605                                            found_key.offset);
2606                 if (ret && ret != -ENOSPC)
2607                         goto error;
2608                 if (ret == -ENOSPC) {
2609                         enospc_errors++;
2610                 } else {
2611                         spin_lock(&fs_info->balance_lock);
2612                         bctl->stat.completed++;
2613                         spin_unlock(&fs_info->balance_lock);
2614                 }
2615 loop:
2616                 key.offset = found_key.offset - 1;
2617         }
2618
2619         if (counting) {
2620                 btrfs_release_path(path);
2621                 counting = false;
2622                 goto again;
2623         }
2624 error:
2625         btrfs_free_path(path);
2626         if (enospc_errors) {
2627                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2628                        enospc_errors);
2629                 if (!ret)
2630                         ret = -ENOSPC;
2631         }
2632
2633         return ret;
2634 }
2635
2636 /**
2637  * alloc_profile_is_valid - see if a given profile is valid and reduced
2638  * @flags: profile to validate
2639  * @extended: if true @flags is treated as an extended profile
2640  */
2641 static int alloc_profile_is_valid(u64 flags, int extended)
2642 {
2643         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
2644                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
2645
2646         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2647
2648         /* 1) check that all other bits are zeroed */
2649         if (flags & ~mask)
2650                 return 0;
2651
2652         /* 2) see if profile is reduced */
2653         if (flags == 0)
2654                 return !extended; /* "0" is valid for usual profiles */
2655
2656         /* true if exactly one bit set */
2657         return (flags & (flags - 1)) == 0;
2658 }
2659
2660 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2661 {
2662         /* cancel requested || normal exit path */
2663         return atomic_read(&fs_info->balance_cancel_req) ||
2664                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2665                  atomic_read(&fs_info->balance_cancel_req) == 0);
2666 }
2667
2668 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2669 {
2670         int ret;
2671
2672         unset_balance_control(fs_info);
2673         ret = del_balance_item(fs_info->tree_root);
2674         BUG_ON(ret);
2675 }
2676
2677 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2678                                struct btrfs_ioctl_balance_args *bargs);
2679
2680 /*
2681  * Should be called with both balance and volume mutexes held
2682  */
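/*
 * Main balance entry point: validates the requested target profiles
 * against the current device count, persists the balance item (or picks
 * up an existing one on resume), runs __btrfs_balance() with the balance
 * mutex dropped, and tears the state down afterwards unless the
 * operation was paused.
 */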
2683 int btrfs_balance(struct btrfs_balance_control *bctl,
2684                   struct btrfs_ioctl_balance_args *bargs)
2685 {
2686         struct btrfs_fs_info *fs_info = bctl->fs_info;
2687         u64 allowed;
2688         int mixed = 0;
2689         int ret;
2690
2691         if (btrfs_fs_closing(fs_info) ||
2692             atomic_read(&fs_info->balance_pause_req) ||
2693             atomic_read(&fs_info->balance_cancel_req)) {
2694                 ret = -EINVAL;
2695                 goto out;
2696         }
2697
2698         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2699         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
2700                 mixed = 1;
2701
2702         /*
2703          * In the case of mixed block groups, both data and metadata must
2704          * be picked, and identical options must be given for both of them.
2705          */
2706         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
2707         if (mixed && (bctl->flags & allowed)) {
2708                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2709                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2710                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2711                         printk(KERN_ERR "btrfs: with mixed groups data and "
2712                                "metadata balance options must be the same\n");
2713                         ret = -EINVAL;
2714                         goto out;
2715                 }
2716         }
2717
2718         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2719         if (fs_info->fs_devices->num_devices == 1)
2720                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2721         else if (fs_info->fs_devices->num_devices < 4)
2722                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2723         else
2724                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2725                                 BTRFS_BLOCK_GROUP_RAID10);
2726
2727         if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2728             (!alloc_profile_is_valid(bctl->data.target, 1) ||
2729              (bctl->data.target & ~allowed))) {
2730                 printk(KERN_ERR "btrfs: unable to start balance with target "
2731                        "data profile %llu\n",
2732                        (unsigned long long)bctl->data.target);
2733                 ret = -EINVAL;
2734                 goto out;
2735         }
2736         if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2737             (!alloc_profile_is_valid(bctl->meta.target, 1) ||
2738              (bctl->meta.target & ~allowed))) {
2739                 printk(KERN_ERR "btrfs: unable to start balance with target "
2740                        "metadata profile %llu\n",
2741                        (unsigned long long)bctl->meta.target);
2742                 ret = -EINVAL;
2743                 goto out;
2744         }
2745         if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2746             (!alloc_profile_is_valid(bctl->sys.target, 1) ||
2747              (bctl->sys.target & ~allowed))) {
2748                 printk(KERN_ERR "btrfs: unable to start balance with target "
2749                        "system profile %llu\n",
2750                        (unsigned long long)bctl->sys.target);
2751                 ret = -EINVAL;
2752                 goto out;
2753         }
2754
2755         /* allow dup'ed data chunks only in mixed mode */
2756         if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2757             (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
2758                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2759                 ret = -EINVAL;
2760                 goto out;
2761         }
2762
2763         /* allow to reduce meta or sys integrity only if force set */
2764         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2765                         BTRFS_BLOCK_GROUP_RAID10;
2766         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2767              (fs_info->avail_system_alloc_bits & allowed) &&
2768              !(bctl->sys.target & allowed)) ||
2769             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2770              (fs_info->avail_metadata_alloc_bits & allowed) &&
2771              !(bctl->meta.target & allowed))) {
2772                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2773                         printk(KERN_INFO "btrfs: force reducing metadata "
2774                                "integrity\n");
2775                 } else {
2776                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2777                                "integrity, use force if you want this\n");
2778                         ret = -EINVAL;
2779                         goto out;
2780                 }
2781         }
2782
2783         ret = insert_balance_item(fs_info->tree_root, bctl);
2784         if (ret && ret != -EEXIST)
2785                 goto out;
2786
2787         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2788                 BUG_ON(ret == -EEXIST);
2789                 set_balance_control(bctl);
2790         } else {
2791                 BUG_ON(ret != -EEXIST);
2792                 spin_lock(&fs_info->balance_lock);
2793                 update_balance_args(bctl);
2794                 spin_unlock(&fs_info->balance_lock);
2795         }
2796
2797         atomic_inc(&fs_info->balance_running);
2798         mutex_unlock(&fs_info->balance_mutex);
2799
2800         ret = __btrfs_balance(fs_info);
2801
2802         mutex_lock(&fs_info->balance_mutex);
2803         atomic_dec(&fs_info->balance_running);
2804
2805         if (bargs) {
2806                 memset(bargs, 0, sizeof(*bargs));
2807                 update_ioctl_balance_args(fs_info, 0, bargs);
2808         }
2809
2810         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2811             balance_need_close(fs_info)) {
2812                 __cancel_balance(fs_info);
2813         }
2814
2815         wake_up(&fs_info->balance_wait_q);
2816
2817         return ret;
2818 out:
2819         if (bctl->flags & BTRFS_BALANCE_RESUME)
2820                 __cancel_balance(fs_info);
2821         else
2822                 kfree(bctl);
2823         return ret;
2824 }
2825
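/*
 * Kthread body used by btrfs_recover_balance() to resume an interrupted
 * balance in the background at mount time, unless resuming is disabled
 * with the skip_balance mount option.
 */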
2826 static int balance_kthread(void *data)
2827 {
2828         struct btrfs_balance_control *bctl =
2829                         (struct btrfs_balance_control *)data;
2830         struct btrfs_fs_info *fs_info = bctl->fs_info;
2831         int ret = 0;
2832
2833         mutex_lock(&fs_info->volume_mutex);
2834         mutex_lock(&fs_info->balance_mutex);
2835
2836         set_balance_control(bctl);
2837
2838         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2839                 printk(KERN_INFO "btrfs: force skipping balance\n");
2840         } else {
2841                 printk(KERN_INFO "btrfs: continuing balance\n");
2842                 ret = btrfs_balance(bctl, NULL);
2843         }
2844
2845         mutex_unlock(&fs_info->balance_mutex);
2846         mutex_unlock(&fs_info->volume_mutex);
2847         return ret;
2848 }
2849
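/*
 * Look up the balance item at mount time and, if one exists, rebuild the
 * balance control from it and start balance_kthread to continue the
 * interrupted operation.
 */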
2850 int btrfs_recover_balance(struct btrfs_root *tree_root)
2851 {
2852         struct task_struct *tsk;
2853         struct btrfs_balance_control *bctl;
2854         struct btrfs_balance_item *item;
2855         struct btrfs_disk_balance_args disk_bargs;
2856         struct btrfs_path *path;
2857         struct extent_buffer *leaf;
2858         struct btrfs_key key;
2859         int ret;
2860
2861         path = btrfs_alloc_path();
2862         if (!path)
2863                 return -ENOMEM;
2864
2865         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2866         if (!bctl) {
2867                 ret = -ENOMEM;
2868                 goto out;
2869         }
2870
2871         key.objectid = BTRFS_BALANCE_OBJECTID;
2872         key.type = BTRFS_BALANCE_ITEM_KEY;
2873         key.offset = 0;
2874
2875         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2876         if (ret < 0)
2877                 goto out_bctl;
2878         if (ret > 0) { /* ret = -ENOENT; */
2879                 ret = 0;
2880                 goto out_bctl;
2881         }
2882
2883         leaf = path->nodes[0];
2884         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2885
2886         bctl->fs_info = tree_root->fs_info;
2887         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2888
2889         btrfs_balance_data(leaf, item, &disk_bargs);
2890         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2891         btrfs_balance_meta(leaf, item, &disk_bargs);
2892         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2893         btrfs_balance_sys(leaf, item, &disk_bargs);
2894         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2895
2896         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2897         if (IS_ERR(tsk))
2898                 ret = PTR_ERR(tsk);
2899         else
2900                 goto out;
2901
2902 out_bctl:
2903         kfree(bctl);
2904 out:
2905         btrfs_free_path(path);
2906         return ret;
2907 }
2908
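/*
 * Pause a running balance: raise balance_pause_req and wait for the
 * relocation loop to notice it and drop balance_running to zero.  The
 * balance item stays on disk, so the operation can be resumed later.
 */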
2909 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2910 {
2911         int ret = 0;
2912
2913         mutex_lock(&fs_info->balance_mutex);
2914         if (!fs_info->balance_ctl) {
2915                 mutex_unlock(&fs_info->balance_mutex);
2916                 return -ENOTCONN;
2917         }
2918
2919         if (atomic_read(&fs_info->balance_running)) {
2920                 atomic_inc(&fs_info->balance_pause_req);
2921                 mutex_unlock(&fs_info->balance_mutex);
2922
2923                 wait_event(fs_info->balance_wait_q,
2924                            atomic_read(&fs_info->balance_running) == 0);
2925
2926                 mutex_lock(&fs_info->balance_mutex);
2927                 /* we are good with balance_ctl ripped off from under us */
2928                 BUG_ON(atomic_read(&fs_info->balance_running));
2929                 atomic_dec(&fs_info->balance_pause_req);
2930         } else {
2931                 ret = -ENOTCONN;
2932         }
2933
2934         mutex_unlock(&fs_info->balance_mutex);
2935         return ret;
2936 }
2937
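/*
 * Cancel a balance whether it is currently running or merely paused; in
 * both cases the balance item is deleted and the control struct freed.
 */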
2938 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2939 {
2940         mutex_lock(&fs_info->balance_mutex);
2941         if (!fs_info->balance_ctl) {
2942                 mutex_unlock(&fs_info->balance_mutex);
2943                 return -ENOTCONN;
2944         }
2945
2946         atomic_inc(&fs_info->balance_cancel_req);
2947         /*
2948          * if we are running, just wait and return; the balance item is
2949          * deleted in btrfs_balance in this case
2950          */
2951         if (atomic_read(&fs_info->balance_running)) {
2952                 mutex_unlock(&fs_info->balance_mutex);
2953                 wait_event(fs_info->balance_wait_q,
2954                            atomic_read(&fs_info->balance_running) == 0);
2955                 mutex_lock(&fs_info->balance_mutex);
2956         } else {
2957                 /* __cancel_balance needs volume_mutex */
2958                 mutex_unlock(&fs_info->balance_mutex);
2959                 mutex_lock(&fs_info->volume_mutex);
2960                 mutex_lock(&fs_info->balance_mutex);
2961
2962                 if (fs_info->balance_ctl)
2963                         __cancel_balance(fs_info);
2964
2965                 mutex_unlock(&fs_info->volume_mutex);
2966         }
2967
2968         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2969         atomic_dec(&fs_info->balance_cancel_req);
2970         mutex_unlock(&fs_info->balance_mutex);
2971         return 0;
2972 }
2973
2974 /*
2975  * shrinking a device means finding all of the device extents past
2976  * the new size, and then following the back refs to the chunks.
2977  * The chunk relocation code actually frees the device extents.
2978  */
2979 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2980 {
2981         struct btrfs_trans_handle *trans;
2982         struct btrfs_root *root = device->dev_root;
2983         struct btrfs_dev_extent *dev_extent = NULL;
2984         struct btrfs_path *path;
2985         u64 length;
2986         u64 chunk_tree;
2987         u64 chunk_objectid;
2988         u64 chunk_offset;
2989         int ret;
2990         int slot;
2991         int failed = 0;
2992         bool retried = false;
2993         struct extent_buffer *l;
2994         struct btrfs_key key;
2995         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2996         u64 old_total = btrfs_super_total_bytes(super_copy);
2997         u64 old_size = device->total_bytes;
2998         u64 diff = device->total_bytes - new_size;
2999
3000         if (new_size >= device->total_bytes)
3001                 return -EINVAL;
3002
3003         path = btrfs_alloc_path();
3004         if (!path)
3005                 return -ENOMEM;
3006
3007         path->reada = 2;
3008
3009         lock_chunks(root);
3010
3011         device->total_bytes = new_size;
3012         if (device->writeable) {
3013                 device->fs_devices->total_rw_bytes -= diff;
3014                 spin_lock(&root->fs_info->free_chunk_lock);
3015                 root->fs_info->free_chunk_space -= diff;
3016                 spin_unlock(&root->fs_info->free_chunk_lock);
3017         }
3018         unlock_chunks(root);
3019
3020 again:
3021         key.objectid = device->devid;
3022         key.offset = (u64)-1;
3023         key.type = BTRFS_DEV_EXTENT_KEY;
3024
3025         do {
3026                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3027                 if (ret < 0)
3028                         goto done;
3029
3030                 ret = btrfs_previous_item(root, path, 0, key.type);
3031                 if (ret < 0)
3032                         goto done;
3033                 if (ret) {
3034                         ret = 0;
3035                         btrfs_release_path(path);
3036                         break;
3037                 }
3038
3039                 l = path->nodes[0];
3040                 slot = path->slots[0];
3041                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
3042
3043                 if (key.objectid != device->devid) {
3044                         btrfs_release_path(path);
3045                         break;
3046                 }
3047
3048                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3049                 length = btrfs_dev_extent_length(l, dev_extent);
3050
3051                 if (key.offset + length <= new_size) {
3052                         btrfs_release_path(path);
3053                         break;
3054                 }
3055
3056                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3057                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3058                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3059                 btrfs_release_path(path);
3060
3061                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
3062                                            chunk_offset);
3063                 if (ret && ret != -ENOSPC)
3064                         goto done;
3065                 if (ret == -ENOSPC)
3066                         failed++;
3067         } while (key.offset-- > 0);
3068
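        /*
         * Relocating a chunk can transiently fail with -ENOSPC: space
         * freed by relocating other chunks may cure it, so every failed
         * extent gets exactly one retry before the shrink is rolled
         * back for good.
         */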
3069         if (failed && !retried) {
3070                 failed = 0;
3071                 retried = true;
3072                 goto again;
3073         } else if (failed && retried) {
3074                 ret = -ENOSPC;
3075                 lock_chunks(root);
3076
3077                 device->total_bytes = old_size;
3078                 if (device->writeable)
3079                         device->fs_devices->total_rw_bytes += diff;
3080                 spin_lock(&root->fs_info->free_chunk_lock);
3081                 root->fs_info->free_chunk_space += diff;
3082                 spin_unlock(&root->fs_info->free_chunk_lock);
3083                 unlock_chunks(root);
3084                 goto done;
3085         }
3086
3087         /* Shrinking succeeded, else we would be at "done". */
3088         trans = btrfs_start_transaction(root, 0);
3089         if (IS_ERR(trans)) {
3090                 ret = PTR_ERR(trans);
3091                 goto done;
3092         }
3093
3094         lock_chunks(root);
3095
3096         device->disk_total_bytes = new_size;
3097         /* Now btrfs_update_device() will change the on-disk size. */
3098         ret = btrfs_update_device(trans, device);
3099         if (ret) {
3100                 unlock_chunks(root);
3101                 btrfs_end_transaction(trans, root);
3102                 goto done;
3103         }
3104         WARN_ON(diff > old_total);
3105         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3106         unlock_chunks(root);
3107         btrfs_end_transaction(trans, root);
3108 done:
3109         btrfs_free_path(path);
3110         return ret;
3111 }
3112
3113 static int btrfs_add_system_chunk(struct btrfs_root *root,
3114                            struct btrfs_key *key,
3115                            struct btrfs_chunk *chunk, int item_size)
3116 {
3117         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3118         struct btrfs_disk_key disk_key;
3119         u32 array_size;
3120         u8 *ptr;
3121
3122         array_size = btrfs_super_sys_array_size(super_copy);
3123         if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3124                 return -EFBIG;
3125
3126         ptr = super_copy->sys_chunk_array + array_size;
3127         btrfs_cpu_key_to_disk(&disk_key, key);
3128         memcpy(ptr, &disk_key, sizeof(disk_key));
3129         ptr += sizeof(disk_key);
3130         memcpy(ptr, chunk, item_size);
3131         item_size += sizeof(disk_key);
3132         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3133         return 0;
3134 }
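/*
 * Layout of sys_chunk_array after the append above (sizes not to
 * scale): each system chunk is stored as a packed (disk_key, chunk
 * item) pair,
 *
 *      [disk_key 0][chunk 0][disk_key 1][chunk 1]...
 *
 * and btrfs_super_sys_array_size() always points just past the last
 * pair.
 */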
3135
3136 /*
3137  * sort the devices in descending order by max_avail, total_avail
3138  */
3139 static int btrfs_cmp_device_info(const void *a, const void *b)
3140 {
3141         const struct btrfs_device_info *di_a = a;
3142         const struct btrfs_device_info *di_b = b;
3143
3144         if (di_a->max_avail > di_b->max_avail)
3145                 return -1;
3146         if (di_a->max_avail < di_b->max_avail)
3147                 return 1;
3148         if (di_a->total_avail > di_b->total_avail)
3149                 return -1;
3150         if (di_a->total_avail < di_b->total_avail)
3151                 return 1;
3152         return 0;
3153 }
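/*
 * Example: max_avail values { 10G, 4G, 10G } with total_avail values
 * { 20G, 8G, 50G } sort to { 10G/50G, 10G/20G, 4G/8G } -- biggest
 * hole first, ties broken by total free space.
 */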
3154
3155 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3156                                struct btrfs_root *extent_root,
3157                                struct map_lookup **map_ret,
3158                                u64 *num_bytes_out, u64 *stripe_size_out,
3159                                u64 start, u64 type)
3160 {
3161         struct btrfs_fs_info *info = extent_root->fs_info;
3162         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3163         struct list_head *cur;
3164         struct map_lookup *map = NULL;
3165         struct extent_map_tree *em_tree;
3166         struct extent_map *em;
3167         struct btrfs_device_info *devices_info = NULL;
3168         u64 total_avail;
3169         int num_stripes;        /* total number of stripes to allocate */
3170         int sub_stripes;        /* sub_stripes info for map */
3171         int dev_stripes;        /* stripes per dev */
3172         int devs_max;           /* max devs to use */
3173         int devs_min;           /* min devs needed */
3174         int devs_increment;     /* ndevs has to be a multiple of this */
3175         int ncopies;            /* how many copies of the data we have */
3176         int ret;
3177         u64 max_stripe_size;
3178         u64 max_chunk_size;
3179         u64 stripe_size;
3180         u64 num_bytes;
3181         int ndevs;
3182         int i;
3183         int j;
3184
3185         BUG_ON(!alloc_profile_is_valid(type, 0));
3186
3187         if (list_empty(&fs_devices->alloc_list))
3188                 return -ENOSPC;
3189
3190         sub_stripes = 1;
3191         dev_stripes = 1;
3192         devs_increment = 1;
3193         ncopies = 1;
3194         devs_max = 0;   /* 0 == as many as possible */
3195         devs_min = 1;
3196
3197         /*
3198          * define the properties of each RAID type.
3199          * FIXME: move this to a global table and use it in all RAID
3200          * calculation code
3201          */
3202         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3203                 dev_stripes = 2;
3204                 ncopies = 2;
3205                 devs_max = 1;
3206         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3207                 devs_min = 2;
3208         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3209                 devs_increment = 2;
3210                 ncopies = 2;
3211                 devs_max = 2;
3212                 devs_min = 2;
3213         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3214                 sub_stripes = 2;
3215                 devs_increment = 2;
3216                 ncopies = 2;
3217                 devs_min = 4;
3218         } else {
3219                 devs_max = 1;
3220         }
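        /*
         * One possible shape for the global table the FIXME above asks
         * for (illustrative only, field names are not part of this
         * file):
         *
         *      struct btrfs_raid_attr {
         *              int sub_stripes;
         *              int dev_stripes;
         *              int devs_max;
         *              int devs_min;
         *              int devs_increment;
         *              int ncopies;
         *      };
         *
         * indexed by RAID type, turning the if/else chain above into a
         * single table lookup.
         */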
3221
3222         if (type & BTRFS_BLOCK_GROUP_DATA) {
3223                 max_stripe_size = 1024 * 1024 * 1024;
3224                 max_chunk_size = 10 * max_stripe_size;
3225         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3226                 /* for larger filesystems, use larger metadata chunks */
3227                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3228                         max_stripe_size = 1024 * 1024 * 1024;
3229                 else
3230                         max_stripe_size = 256 * 1024 * 1024;
3231                 max_chunk_size = max_stripe_size;
3232         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3233                 max_stripe_size = 32 * 1024 * 1024;
3234                 max_chunk_size = 2 * max_stripe_size;
3235         } else {
3236                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3237                        type);
3238                 BUG_ON(1);
3239         }
3240
3241         /* we don't want a chunk larger than 10% of writeable space */
3242         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3243                              max_chunk_size);
3244
3245         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3246                                GFP_NOFS);
3247         if (!devices_info)
3248                 return -ENOMEM;
3249
3250         cur = fs_devices->alloc_list.next;
3251
3252         /*
3253          * in the first pass through the devices list, we gather information
3254          * about the available holes on each device.
3255          */
3256         ndevs = 0;
3257         while (cur != &fs_devices->alloc_list) {
3258                 struct btrfs_device *device;
3259                 u64 max_avail;
3260                 u64 dev_offset;
3261
3262                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3263
3264                 cur = cur->next;
3265
3266                 if (!device->writeable) {
3267                         printk(KERN_ERR
3268                                "btrfs: read-only device in alloc_list\n");
3269                         WARN_ON(1);
3270                         continue;
3271                 }
3272
3273                 if (!device->in_fs_metadata)
3274                         continue;
3275
3276                 if (device->total_bytes > device->bytes_used)
3277                         total_avail = device->total_bytes - device->bytes_used;
3278                 else
3279                         total_avail = 0;
3280
3281                 /* If there is no space on this device, skip it. */
3282                 if (total_avail == 0)
3283                         continue;
3284
3285                 ret = find_free_dev_extent(device,
3286                                            max_stripe_size * dev_stripes,
3287                                            &dev_offset, &max_avail);
3288                 if (ret && ret != -ENOSPC)
3289                         goto error;
3290
3291                 if (ret == 0)
3292                         max_avail = max_stripe_size * dev_stripes;
3293
3294                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3295                         continue;
3296
3297                 devices_info[ndevs].dev_offset = dev_offset;
3298                 devices_info[ndevs].max_avail = max_avail;
3299                 devices_info[ndevs].total_avail = total_avail;
3300                 devices_info[ndevs].dev = device;
3301                 ++ndevs;
3302         }
3303
3304         /*
3305          * now sort the devices by hole size / available space
3306          */
3307         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3308              btrfs_cmp_device_info, NULL);
3309
3310         /* round down to number of usable stripes */
3311         ndevs -= ndevs % devs_increment;
3312
3313         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3314                 ret = -ENOSPC;
3315                 goto error;
3316         }
3317
3318         if (devs_max && ndevs > devs_max)
3319                 ndevs = devs_max;
3320         /*
3321          * the primary goal is to maximize the number of stripes, so use as many
3322          * devices as possible, even if the stripes are not maximum sized.
3323          */
3324         stripe_size = devices_info[ndevs-1].max_avail;
3325         num_stripes = ndevs * dev_stripes;
3326
3327         if (stripe_size * ndevs > max_chunk_size * ncopies) {
3328                 stripe_size = max_chunk_size * ncopies;
3329                 do_div(stripe_size, ndevs);
3330         }
3331
3332         do_div(stripe_size, dev_stripes);
3333
3334         /* align to BTRFS_STRIPE_LEN */
3335         do_div(stripe_size, BTRFS_STRIPE_LEN);
3336         stripe_size *= BTRFS_STRIPE_LEN;
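        /*
         * Worked example (illustrative numbers): RAID10 data chunk,
         * ndevs = 4, smallest hole 900M, dev_stripes = 1, ncopies = 2,
         * max_chunk_size = 10G.  900M * 4 < 10G * 2, so no capping;
         * stripe_size is rounded down to a multiple of BTRFS_STRIPE_LEN
         * (64K) and the chunk holds stripe_size * num_stripes / ncopies
         * bytes of data, here roughly 1.8G.
         */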
3337
3338         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3339         if (!map) {
3340                 ret = -ENOMEM;
3341                 goto error;
3342         }
3343         map->num_stripes = num_stripes;
3344
3345         for (i = 0; i < ndevs; ++i) {
3346                 for (j = 0; j < dev_stripes; ++j) {
3347                         int s = i * dev_stripes + j;
3348                         map->stripes[s].dev = devices_info[i].dev;
3349                         map->stripes[s].physical = devices_info[i].dev_offset +
3350                                                    j * stripe_size;
3351                 }
3352         }
3353         map->sector_size = extent_root->sectorsize;
3354         map->stripe_len = BTRFS_STRIPE_LEN;
3355         map->io_align = BTRFS_STRIPE_LEN;
3356         map->io_width = BTRFS_STRIPE_LEN;
3357         map->type = type;
3358         map->sub_stripes = sub_stripes;
3359
3360         *map_ret = map;
3361         num_bytes = stripe_size * (num_stripes / ncopies);
3362
3363         *stripe_size_out = stripe_size;
3364         *num_bytes_out = num_bytes;
3365
3366         trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
3367
3368         em = alloc_extent_map();
3369         if (!em) {
3370                 ret = -ENOMEM;
3371                 goto error;
3372         }
3373         em->bdev = (struct block_device *)map;
3374         em->start = start;
3375         em->len = num_bytes;
3376         em->block_start = 0;
3377         em->block_len = em->len;
3378
3379         em_tree = &extent_root->fs_info->mapping_tree.map_tree;
3380         write_lock(&em_tree->lock);
3381         ret = add_extent_mapping(em_tree, em);
3382         write_unlock(&em_tree->lock);
3383         free_extent_map(em);
3384         if (ret)
3385                 goto error;
3386
3387         ret = btrfs_make_block_group(trans, extent_root, 0, type,
3388                                      BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3389                                      start, num_bytes);
3390         if (ret)
3391                 goto error;
3392
3393         for (i = 0; i < map->num_stripes; ++i) {
3394                 struct btrfs_device *device;
3395                 u64 dev_offset;
3396
3397                 device = map->stripes[i].dev;
3398                 dev_offset = map->stripes[i].physical;
3399
3400                 ret = btrfs_alloc_dev_extent(trans, device,
3401                                 info->chunk_root->root_key.objectid,
3402                                 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3403                                 start, dev_offset, stripe_size);
3404                 if (ret) {
3405                         btrfs_abort_transaction(trans, extent_root, ret);
3406                         goto error;
3407                 }
3408         }
3409
3410         kfree(devices_info);
3411         return 0;
3412
3413 error:
3414         kfree(map);
3415         kfree(devices_info);
3416         return ret;
3417 }
3418
3419 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
3420                                 struct btrfs_root *extent_root,
3421                                 struct map_lookup *map, u64 chunk_offset,
3422                                 u64 chunk_size, u64 stripe_size)
3423 {
3424         u64 dev_offset;
3425         struct btrfs_key key;
3426         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3427         struct btrfs_device *device;
3428         struct btrfs_chunk *chunk;
3429         struct btrfs_stripe *stripe;
3430         size_t item_size = btrfs_chunk_item_size(map->num_stripes);
3431         int index = 0;
3432         int ret;
3433
3434         chunk = kzalloc(item_size, GFP_NOFS);
3435         if (!chunk)
3436                 return -ENOMEM;
3437
3438         index = 0;
3439         while (index < map->num_stripes) {
3440                 device = map->stripes[index].dev;
3441                 device->bytes_used += stripe_size;
3442                 ret = btrfs_update_device(trans, device);
3443                 if (ret)
3444                         goto out_free;
3445                 index++;
3446         }
3447
3448         spin_lock(&extent_root->fs_info->free_chunk_lock);
3449         extent_root->fs_info->free_chunk_space -= (stripe_size *
3450                                                    map->num_stripes);
3451         spin_unlock(&extent_root->fs_info->free_chunk_lock);
3452
3453         index = 0;
3454         stripe = &chunk->stripe;
3455         while (index < map->num_stripes) {
3456                 device = map->stripes[index].dev;
3457                 dev_offset = map->stripes[index].physical;
3458
3459                 btrfs_set_stack_stripe_devid(stripe, device->devid);
3460                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
3461                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
3462                 stripe++;
3463                 index++;
3464         }
3465
3466         btrfs_set_stack_chunk_length(chunk, chunk_size);
3467         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
3468         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
3469         btrfs_set_stack_chunk_type(chunk, map->type);
3470         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
3471         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
3472         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
3473         btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
3474         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
3475
3476         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3477         key.type = BTRFS_CHUNK_ITEM_KEY;
3478         key.offset = chunk_offset;
3479
3480         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
3481
3482         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3483                 /*
3484                  * TODO: Cleanup of inserted chunk root in case of
3485                  * failure.
3486                  */
3487                 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
3488                                              item_size);
3489         }
3490
3491 out_free:
3492         kfree(chunk);
3493         return ret;
3494 }
3495
3496 /*
3497  * Chunk allocation falls into two parts. The first part does the work
3498  * that makes the newly allocated chunk usable, but does not do any
3499  * operation that modifies the chunk tree. The second part does the work
3500  * that requires modifying the chunk tree. This division is important for the
3501  * bootstrap process of adding storage to a seed btrfs.
3502  */
3503 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3504                       struct btrfs_root *extent_root, u64 type)
3505 {
3506         u64 chunk_offset;
3507         u64 chunk_size;
3508         u64 stripe_size;
3509         struct map_lookup *map;
3510         struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
3511         int ret;
3512
3513         ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
3514                               &chunk_offset);
3515         if (ret)
3516                 return ret;
3517
3518         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3519                                   &stripe_size, chunk_offset, type);
3520         if (ret)
3521                 return ret;
3522
3523         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3524                                    chunk_size, stripe_size);
3525         if (ret)
3526                 return ret;
3527         return 0;
3528 }
3529
3530 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
3531                                          struct btrfs_root *root,
3532                                          struct btrfs_device *device)
3533 {
3534         u64 chunk_offset;
3535         u64 sys_chunk_offset;
3536         u64 chunk_size;
3537         u64 sys_chunk_size;
3538         u64 stripe_size;
3539         u64 sys_stripe_size;
3540         u64 alloc_profile;
3541         struct map_lookup *map;
3542         struct map_lookup *sys_map;
3543         struct btrfs_fs_info *fs_info = root->fs_info;
3544         struct btrfs_root *extent_root = fs_info->extent_root;
3545         int ret;
3546
3547         ret = find_next_chunk(fs_info->chunk_root,
3548                               BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
3549         if (ret)
3550                 return ret;
3551
3552         alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
3553                                 fs_info->avail_metadata_alloc_bits;
3554         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3555
3556         ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
3557                                   &stripe_size, chunk_offset, alloc_profile);
3558         if (ret)
3559                 return ret;
3560
3561         sys_chunk_offset = chunk_offset + chunk_size;
3562
3563         alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
3564                                 fs_info->avail_system_alloc_bits;
3565         alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
3566
3567         ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
3568                                   &sys_chunk_size, &sys_stripe_size,
3569                                   sys_chunk_offset, alloc_profile);
3570         if (ret)
3571                 goto abort;
3572
3573         ret = btrfs_add_device(trans, fs_info->chunk_root, device);
3574         if (ret)
3575                 goto abort;
3576
3577         /*
3578          * Modifying the chunk tree requires allocating new blocks from both
3579          * the system block group and the metadata block group. So we can
3580          * only do operations that require modifying the chunk tree after
3581          * both block groups have been created.
3582          */
3583         ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
3584                                    chunk_size, stripe_size);
3585         if (ret)
3586                 goto abort;
3587
3588         ret = __finish_chunk_alloc(trans, extent_root, sys_map,
3589                                    sys_chunk_offset, sys_chunk_size,
3590                                    sys_stripe_size);
3591         if (ret)
3592                 goto abort;
3593
3594         return 0;
3595
3596 abort:
3597         btrfs_abort_transaction(trans, root, ret);
3598         return ret;
3599 }
3600
3601 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
3602 {
3603         struct extent_map *em;
3604         struct map_lookup *map;
3605         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3606         int readonly = 0;
3607         int i;
3608
3609         read_lock(&map_tree->map_tree.lock);
3610         em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3611         read_unlock(&map_tree->map_tree.lock);
3612         if (!em)
3613                 return 1;
3614
3615         if (btrfs_test_opt(root, DEGRADED)) {
3616                 free_extent_map(em);
3617                 return 0;
3618         }
3619
3620         map = (struct map_lookup *)em->bdev;
3621         for (i = 0; i < map->num_stripes; i++) {
3622                 if (!map->stripes[i].dev->writeable) {
3623                         readonly = 1;
3624                         break;
3625                 }
3626         }
3627         free_extent_map(em);
3628         return readonly;
3629 }
3630
3631 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
3632 {
3633         extent_map_tree_init(&tree->map_tree);
3634 }
3635
3636 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
3637 {
3638         struct extent_map *em;
3639
3640         while (1) {
3641                 write_lock(&tree->map_tree.lock);
3642                 em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
3643                 if (em)
3644                         remove_extent_mapping(&tree->map_tree, em);
3645                 write_unlock(&tree->map_tree.lock);
3646                 if (!em)
3647                         break;
3648                 kfree(em->bdev);
3649                 /* once for us */
3650                 free_extent_map(em);
3651                 /* once for the tree */
3652                 free_extent_map(em);
3653         }
3654 }
3655
3656 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
3657 {
3658         struct extent_map *em;
3659         struct map_lookup *map;
3660         struct extent_map_tree *em_tree = &map_tree->map_tree;
3661         int ret;
3662
3663         read_lock(&em_tree->lock);
3664         em = lookup_extent_mapping(em_tree, logical, len);
3665         read_unlock(&em_tree->lock);
3666         BUG_ON(!em);
3667
3668         BUG_ON(em->start > logical || em->start + em->len < logical);
3669         map = (struct map_lookup *)em->bdev;
3670         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
3671                 ret = map->num_stripes;
3672         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3673                 ret = map->sub_stripes;
3674         else
3675                 ret = 1;
3676         free_extent_map(em);
3677         return ret;
3678 }
3679
3680 static int find_live_mirror(struct map_lookup *map, int first, int num,
3681                             int optimal)
3682 {
3683         int i;
3684         if (map->stripes[optimal].dev->bdev)
3685                 return optimal;
3686         for (i = first; i < first + num; i++) {
3687                 if (map->stripes[i].dev->bdev)
3688                         return i;
3689         }
3690         /* we couldn't find one that doesn't fail.  Just return something
3691          * and the io error handling code will clean up eventually
3692          */
3693         return optimal;
3694 }
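/*
 * Callers pass current->pid % num_stripes as the optimal mirror, which
 * spreads concurrent readers across copies: on a two-mirror RAID1 an
 * even pid prefers stripe 0 and an odd pid stripe 1, falling back to
 * any stripe whose device is still present.
 */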
3695
3696 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3697                              u64 logical, u64 *length,
3698                              struct btrfs_bio **bbio_ret,
3699                              int mirror_num)
3700 {
3701         struct extent_map *em;
3702         struct map_lookup *map;
3703         struct extent_map_tree *em_tree = &map_tree->map_tree;
3704         u64 offset;
3705         u64 stripe_offset;
3706         u64 stripe_end_offset;
3707         u64 stripe_nr;
3708         u64 stripe_nr_orig;
3709         u64 stripe_nr_end;
3710         int stripe_index;
3711         int i;
3712         int ret = 0;
3713         int num_stripes;
3714         int max_errors = 0;
3715         struct btrfs_bio *bbio = NULL;
3716
3717         read_lock(&em_tree->lock);
3718         em = lookup_extent_mapping(em_tree, logical, *length);
3719         read_unlock(&em_tree->lock);
3720
3721         if (!em) {
3722                 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
3723                        (unsigned long long)logical,
3724                        (unsigned long long)*length);
3725                 BUG();
3726         }
3727
3728         BUG_ON(em->start > logical || em->start + em->len < logical);
3729         map = (struct map_lookup *)em->bdev;
3730         offset = logical - em->start;
3731
3732         if (mirror_num > map->num_stripes)
3733                 mirror_num = 0;
3734
3735         stripe_nr = offset;
3736         /*
3737          * stripe_nr counts the total number of stripes we have to stride
3738          * to get to this block
3739          */
3740         do_div(stripe_nr, map->stripe_len);
3741
3742         stripe_offset = stripe_nr * map->stripe_len;
3743         BUG_ON(offset < stripe_offset);
3744
3745         /* stripe_offset is the offset of this block in its stripe */
3746         stripe_offset = offset - stripe_offset;
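        /*
         * Example with stripe_len = 64K: a logical offset of 130K into
         * the chunk gives stripe_nr = 2 (two full stripes strided past)
         * and stripe_offset = 130K - 128K = 2K into the third stripe.
         */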
3747
3748         if (rw & REQ_DISCARD)
3749                 *length = min_t(u64, em->len - offset, *length);
3750         else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
3751                 /* we limit the length of each bio to what fits in a stripe */
3752                 *length = min_t(u64, em->len - offset,
3753                                 map->stripe_len - stripe_offset);
3754         } else {
3755                 *length = em->len - offset;
3756         }
3757
3758         if (!bbio_ret)
3759                 goto out;
3760
3761         num_stripes = 1;
3762         stripe_index = 0;
3763         stripe_nr_orig = stripe_nr;
3764         stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3765                         (~(map->stripe_len - 1));
3766         do_div(stripe_nr_end, map->stripe_len);
3767         stripe_end_offset = stripe_nr_end * map->stripe_len -
3768                             (offset + *length);
3769         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3770                 if (rw & REQ_DISCARD)
3771                         num_stripes = min_t(u64, map->num_stripes,
3772                                             stripe_nr_end - stripe_nr_orig);
3773                 stripe_index = do_div(stripe_nr, map->num_stripes);
3774         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3775                 if (rw & (REQ_WRITE | REQ_DISCARD))
3776                         num_stripes = map->num_stripes;
3777                 else if (mirror_num)
3778                         stripe_index = mirror_num - 1;
3779                 else {
3780                         stripe_index = find_live_mirror(map, 0,
3781                                             map->num_stripes,
3782                                             current->pid % map->num_stripes);
3783                         mirror_num = stripe_index + 1;
3784                 }
3785
3786         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3787                 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3788                         num_stripes = map->num_stripes;
3789                 } else if (mirror_num) {
3790                         stripe_index = mirror_num - 1;
3791                 } else {
3792                         mirror_num = 1;
3793                 }
3794
3795         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3796                 int factor = map->num_stripes / map->sub_stripes;
3797
3798                 stripe_index = do_div(stripe_nr, factor);
3799                 stripe_index *= map->sub_stripes;
3800
3801                 if (rw & REQ_WRITE)
3802                         num_stripes = map->sub_stripes;
3803                 else if (rw & REQ_DISCARD)
3804                         num_stripes = min_t(u64, map->sub_stripes *
3805                                             (stripe_nr_end - stripe_nr_orig),
3806                                             map->num_stripes);
3807                 else if (mirror_num)
3808                         stripe_index += mirror_num - 1;
3809                 else {
3810                         int old_stripe_index = stripe_index;
3811                         stripe_index = find_live_mirror(map, stripe_index,
3812                                               map->sub_stripes, stripe_index +
3813                                               current->pid % map->sub_stripes);
3814                         mirror_num = stripe_index - old_stripe_index + 1;
3815                 }
3816         } else {
3817                 /*
3818                  * after this do_div call, stripe_nr is the number of stripes
3819                  * on this device we have to walk to find the data, and
3820                  * stripe_index is the number of our device in the stripe array
3821                  */
3822                 stripe_index = do_div(stripe_nr, map->num_stripes);
3823                 mirror_num = stripe_index + 1;
3824         }
3825         BUG_ON(stripe_index >= map->num_stripes);
3826
3827         bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
3828         if (!bbio) {
3829                 ret = -ENOMEM;
3830                 goto out;
3831         }
3832         atomic_set(&bbio->error, 0);
3833
3834         if (rw & REQ_DISCARD) {
3835                 int factor = 0;
3836                 int sub_stripes = 0;
3837                 u64 stripes_per_dev = 0;
3838                 u32 remaining_stripes = 0;
3839                 u32 last_stripe = 0;
3840
3841                 if (map->type &
3842                     (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3843                         if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3844                                 sub_stripes = 1;
3845                         else
3846                                 sub_stripes = map->sub_stripes;
3847
3848                         factor = map->num_stripes / sub_stripes;
3849                         stripes_per_dev = div_u64_rem(stripe_nr_end -
3850                                                       stripe_nr_orig,
3851                                                       factor,
3852                                                       &remaining_stripes);
3853                         div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
3854                         last_stripe *= sub_stripes;
3855                 }
3856
3857                 for (i = 0; i < num_stripes; i++) {
3858                         bbio->stripes[i].physical =
3859                                 map->stripes[stripe_index].physical +
3860                                 stripe_offset + stripe_nr * map->stripe_len;
3861                         bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3862
3863                         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3864                                          BTRFS_BLOCK_GROUP_RAID10)) {
3865                                 bbio->stripes[i].length = stripes_per_dev *
3866                                                           map->stripe_len;
3867
3868                                 if (i / sub_stripes < remaining_stripes)
3869                                         bbio->stripes[i].length +=
3870                                                 map->stripe_len;
3871
3872                                 /*
3873                                  * Special for the first stripe and
3874                                  * the last stripe:
3875                                  *
3876                                  * |-------|...|-------|
3877                                  *     |----------|
3878                                  *    off     end_off
3879                                  */
3880                                 if (i < sub_stripes)
3881                                         bbio->stripes[i].length -=
3882                                                 stripe_offset;
3883
3884                                 if (stripe_index >= last_stripe &&
3885                                     stripe_index <= (last_stripe +
3886                                                      sub_stripes - 1))
3887                                         bbio->stripes[i].length -=
3888                                                 stripe_end_offset;
3889
3890                                 if (i == sub_stripes - 1)
3891                                         stripe_offset = 0;
3892                         } else
3893                                 bbio->stripes[i].length = *length;
3894
3895                         stripe_index++;
3896                         if (stripe_index == map->num_stripes) {
3897                                 /* This could only happen for RAID0/10 */
3898                                 stripe_index = 0;
3899                                 stripe_nr++;
3900                         }
3901                 }
3902         } else {
3903                 for (i = 0; i < num_stripes; i++) {
3904                         bbio->stripes[i].physical =
3905                                 map->stripes[stripe_index].physical +
3906                                 stripe_offset +
3907                                 stripe_nr * map->stripe_len;
3908                         bbio->stripes[i].dev =
3909                                 map->stripes[stripe_index].dev;
3910                         stripe_index++;
3911                 }
3912         }
3913
3914         if (rw & REQ_WRITE) {
3915                 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
3916                                  BTRFS_BLOCK_GROUP_RAID10 |
3917                                  BTRFS_BLOCK_GROUP_DUP)) {
3918                         max_errors = 1;
3919                 }
3920         }
3921
3922         *bbio_ret = bbio;
3923         bbio->num_stripes = num_stripes;
3924         bbio->max_errors = max_errors;
3925         bbio->mirror_num = mirror_num;
3926 out:
3927         free_extent_map(em);
3928         return ret;
3929 }
3930
3931 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3932                       u64 logical, u64 *length,
3933                       struct btrfs_bio **bbio_ret, int mirror_num)
3934 {
3935         return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3936                                  mirror_num);
3937 }
3938
3939 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3940                      u64 chunk_start, u64 physical, u64 devid,
3941                      u64 **logical, int *naddrs, int *stripe_len)
3942 {
3943         struct extent_map_tree *em_tree = &map_tree->map_tree;
3944         struct extent_map *em;
3945         struct map_lookup *map;
3946         u64 *buf;
3947         u64 bytenr;
3948         u64 length;
3949         u64 stripe_nr;
3950         int i, j, nr = 0;
3951
3952         read_lock(&em_tree->lock);
3953         em = lookup_extent_mapping(em_tree, chunk_start, 1);
3954         read_unlock(&em_tree->lock);
3955
3956         BUG_ON(!em || em->start != chunk_start);
3957         map = (struct map_lookup *)em->bdev;
3958
3959         length = em->len;
3960         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3961                 do_div(length, map->num_stripes / map->sub_stripes);
3962         else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3963                 do_div(length, map->num_stripes);
3964
3965         buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3966         BUG_ON(!buf); /* -ENOMEM */
3967
3968         for (i = 0; i < map->num_stripes; i++) {
3969                 if (devid && map->stripes[i].dev->devid != devid)
3970                         continue;
3971                 if (map->stripes[i].physical > physical ||
3972                     map->stripes[i].physical + length <= physical)
3973                         continue;
3974
3975                 stripe_nr = physical - map->stripes[i].physical;
3976                 do_div(stripe_nr, map->stripe_len);
3977
3978                 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3979                         stripe_nr = stripe_nr * map->num_stripes + i;
3980                         do_div(stripe_nr, map->sub_stripes);
3981                 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3982                         stripe_nr = stripe_nr * map->num_stripes + i;
3983                 }
3984                 bytenr = chunk_start + stripe_nr * map->stripe_len;
3985                 WARN_ON(nr >= map->num_stripes);
3986                 for (j = 0; j < nr; j++) {
3987                         if (buf[j] == bytenr)
3988                                 break;
3989                 }
3990                 if (j == nr) {
3991                         WARN_ON(nr >= map->num_stripes);
3992                         buf[nr++] = bytenr;
3993                 }
3994         }
3995
3996         *logical = buf;
3997         *naddrs = nr;
3998         *stripe_len = map->stripe_len;
3999
4000         free_extent_map(em);
4001         return 0;
4002 }
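/*
 * Reverse-mapping example (illustrative): for a RAID0 chunk across two
 * devices with stripe_len = 64K, a physical address one full stripe
 * into device 1 yields stripe_nr = 1 * 2 + 1 = 3, i.e. logical address
 * chunk_start + 3 * 64K -- the fourth 64K stripe of the chunk.
 */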
4003
4004 static void btrfs_end_bio(struct bio *bio, int err)
4005 {
4006         struct btrfs_bio *bbio = bio->bi_private;
4007         int is_orig_bio = 0;
4008
4009         if (err)
4010                 atomic_inc(&bbio->error);
4011
4012         if (bio == bbio->orig_bio)
4013                 is_orig_bio = 1;
4014
4015         if (atomic_dec_and_test(&bbio->stripes_pending)) {
4016                 if (!is_orig_bio) {
4017                         bio_put(bio);
4018                         bio = bbio->orig_bio;
4019                 }
4020                 bio->bi_private = bbio->private;
4021                 bio->bi_end_io = bbio->end_io;
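                /*
                 * bi_bdev is overloaded to smuggle the mirror number
                 * back to the original end_io handler, which casts it
                 * back to an integer.
                 */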
4022                 bio->bi_bdev = (struct block_device *)
4023                                         (unsigned long)bbio->mirror_num;
4024                 /* only send an error to the higher layers if it is
4025                  * beyond the tolerance of the multi-bio
4026                  */
4027                 if (atomic_read(&bbio->error) > bbio->max_errors) {
4028                         err = -EIO;
4029                 } else {
4030                         /*
4031                          * this bio is actually up to date, we didn't
4032                          * go over the max number of errors
4033                          */
4034                         set_bit(BIO_UPTODATE, &bio->bi_flags);
4035                         err = 0;
4036                 }
4037                 kfree(bbio);
4038
4039                 bio_endio(bio, err);
4040         } else if (!is_orig_bio) {
4041                 bio_put(bio);
4042         }
4043 }
4044
4045 struct async_sched {
4046         struct bio *bio;
4047         int rw;
4048         struct btrfs_fs_info *info;
4049         struct btrfs_work work;
4050 };
4051
4052 /*
4053  * see run_scheduled_bios for a description of why bios are collected for
4054  * async submit.
4055  *
4056  * This will add one bio to the pending list for a device and make sure
4057  * the work struct is scheduled.
4058  */
4059 static noinline void schedule_bio(struct btrfs_root *root,
4060                                  struct btrfs_device *device,
4061                                  int rw, struct bio *bio)
4062 {
4063         int should_queue = 1;
4064         struct btrfs_pending_bios *pending_bios;
4065
4066         /* don't bother with additional async steps for reads, right now */
4067         if (!(rw & REQ_WRITE)) {
4068                 bio_get(bio);
4069                 btrfsic_submit_bio(rw, bio);
4070                 bio_put(bio);
4071                 return;
4072         }
4073
4074         /*
4075          * nr_async_bios allows us to reliably return congestion to the
4076          * higher layers.  Otherwise, the async bio makes it appear we have
4077          * made progress against dirty pages when we've really just put it
4078          * on a queue for later
4079          */
4080         atomic_inc(&root->fs_info->nr_async_bios);
4081         WARN_ON(bio->bi_next);
4082         bio->bi_next = NULL;
4083         bio->bi_rw |= rw;
4084
4085         spin_lock(&device->io_lock);
4086         if (bio->bi_rw & REQ_SYNC)
4087                 pending_bios = &device->pending_sync_bios;
4088         else
4089                 pending_bios = &device->pending_bios;
4090
4091         if (pending_bios->tail)
4092                 pending_bios->tail->bi_next = bio;
4093
4094         pending_bios->tail = bio;
4095         if (!pending_bios->head)
4096                 pending_bios->head = bio;
4097         if (device->running_pending)
4098                 should_queue = 0;
4099
4100         spin_unlock(&device->io_lock);
4101
4102         if (should_queue)
4103                 btrfs_queue_worker(&root->fs_info->submit_workers,
4104                                    &device->work);
4105 }
4106
4107 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
4108                   int mirror_num, int async_submit)
4109 {
4110         struct btrfs_mapping_tree *map_tree;
4111         struct btrfs_device *dev;
4112         struct bio *first_bio = bio;
4113         u64 logical = (u64)bio->bi_sector << 9;
4114         u64 length = 0;
4115         u64 map_length;
4116         int ret;
4117         int dev_nr = 0;
4118         int total_devs = 1;
4119         struct btrfs_bio *bbio = NULL;
4120
4121         length = bio->bi_size;
4122         map_tree = &root->fs_info->mapping_tree;
4123         map_length = length;
4124
4125         ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
4126                               mirror_num);
4127         if (ret) /* -ENOMEM */
4128                 return ret;
4129
4130         total_devs = bbio->num_stripes;
4131         if (map_length < length) {
4132                 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
4133                        "len %llu\n", (unsigned long long)logical,
4134                        (unsigned long long)length,
4135                        (unsigned long long)map_length);
4136                 BUG();
4137         }
4138
4139         bbio->orig_bio = first_bio;
4140         bbio->private = first_bio->bi_private;
4141         bbio->end_io = first_bio->bi_end_io;
4142         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
4143
4144         while (dev_nr < total_devs) {
4145                 if (dev_nr < total_devs - 1) {
4146                         bio = bio_clone(first_bio, GFP_NOFS);
4147                         BUG_ON(!bio); /* -ENOMEM */
4148                 } else {
4149                         bio = first_bio;
4150                 }
4151                 bio->bi_private = bbio;
4152                 bio->bi_end_io = btrfs_end_bio;
4153                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
4154                 dev = bbio->stripes[dev_nr].dev;
4155                 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
4156                          pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
4157                                  "(%s id %llu), size=%u\n", rw,
4158                                  (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
4159                                  dev->name, dev->devid, bio->bi_size);
4160                         bio->bi_bdev = dev->bdev;
4161                         if (async_submit)
4162                                 schedule_bio(root, dev, rw, bio);
4163                         else
4164                                 btrfsic_submit_bio(rw, bio);
4165                 } else {
4166                         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
4167                         bio->bi_sector = logical >> 9;
4168                         bio_endio(bio, -EIO);
4169                 }
4170                 dev_nr++;
4171         }
4172         return 0;
4173 }
4174
4175 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
4176                                        u8 *uuid, u8 *fsid)
4177 {
4178         struct btrfs_device *device;
4179         struct btrfs_fs_devices *cur_devices;
4180
4181         cur_devices = root->fs_info->fs_devices;
4182         while (cur_devices) {
4183                 if (!fsid ||
4184                     !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
4185                         device = __find_device(&cur_devices->devices,
4186                                                devid, uuid);
4187                         if (device)
4188                                 return device;
4189                 }
4190                 cur_devices = cur_devices->seed;
4191         }
4192         return NULL;
4193 }
4194
4195 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
4196                                             u64 devid, u8 *dev_uuid)
4197 {
4198         struct btrfs_device *device;
4199         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
4200
4201         device = kzalloc(sizeof(*device), GFP_NOFS);
4202         if (!device)
4203                 return NULL;
4204         list_add(&device->dev_list,
4205                  &fs_devices->devices);
4206         device->dev_root = root->fs_info->dev_root;
4207         device->devid = devid;
4208         device->work.func = pending_bios_fn;
4209         device->fs_devices = fs_devices;
4210         device->missing = 1;
4211         fs_devices->num_devices++;
4212         fs_devices->missing_devices++;
4213         spin_lock_init(&device->io_lock);
4214         INIT_LIST_HEAD(&device->dev_alloc_list);
4215         memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
4216         return device;
4217 }
4218
4219 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
4220                           struct extent_buffer *leaf,
4221                           struct btrfs_chunk *chunk)
4222 {
4223         struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4224         struct map_lookup *map;
4225         struct extent_map *em;
4226         u64 logical;
4227         u64 length;
4228         u64 devid;
4229         u8 uuid[BTRFS_UUID_SIZE];
4230         int num_stripes;
4231         int ret;
4232         int i;
4233
4234         logical = key->offset;
4235         length = btrfs_chunk_length(leaf, chunk);
4236
4237         read_lock(&map_tree->map_tree.lock);
4238         em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
4239         read_unlock(&map_tree->map_tree.lock);
4240
4241         /* already mapped? */
4242         if (em && em->start <= logical && em->start + em->len > logical) {
4243                 free_extent_map(em);
4244                 return 0;
4245         } else if (em) {
4246                 free_extent_map(em);
4247         }
4248
4249         em = alloc_extent_map();
4250         if (!em)
4251                 return -ENOMEM;
4252         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
4253         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4254         if (!map) {
4255                 free_extent_map(em);
4256                 return -ENOMEM;
4257         }
4258
4259         em->bdev = (struct block_device *)map;
4260         em->start = logical;
4261         em->len = length;
4262         em->block_start = 0;
4263         em->block_len = em->len;
4264
4265         map->num_stripes = num_stripes;
4266         map->io_width = btrfs_chunk_io_width(leaf, chunk);
4267         map->io_align = btrfs_chunk_io_align(leaf, chunk);
4268         map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
4269         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
4270         map->type = btrfs_chunk_type(leaf, chunk);
4271         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
4272         for (i = 0; i < num_stripes; i++) {
4273                 map->stripes[i].physical =
4274                         btrfs_stripe_offset_nr(leaf, chunk, i);
4275                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
4276                 read_extent_buffer(leaf, uuid, (unsigned long)
4277                                    btrfs_stripe_dev_uuid_nr(chunk, i),
4278                                    BTRFS_UUID_SIZE);
4279                 map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
4280                                                         NULL);
4281                 if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
4282                         kfree(map);
4283                         free_extent_map(em);
4284                         return -EIO;
4285                 }
4286                 if (!map->stripes[i].dev) {
4287                         map->stripes[i].dev =
4288                                 add_missing_dev(root, devid, uuid);
4289                         if (!map->stripes[i].dev) {
4290                                 kfree(map);
4291                                 free_extent_map(em);
4292                                 return -EIO;
4293                         }
4294                 }
4295                 map->stripes[i].dev->in_fs_metadata = 1;
4296         }
4297
4298         write_lock(&map_tree->map_tree.lock);
4299         ret = add_extent_mapping(&map_tree->map_tree, em);
4300         write_unlock(&map_tree->map_tree.lock);
4301         BUG_ON(ret); /* Tree corruption */
4302         free_extent_map(em);
4303
4304         return 0;
4305 }
4306
4307 static void fill_device_from_item(struct extent_buffer *leaf,
4308                                  struct btrfs_dev_item *dev_item,
4309                                  struct btrfs_device *device)
4310 {
4311         unsigned long ptr;
4312
4313         device->devid = btrfs_device_id(leaf, dev_item);
4314         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
4315         device->total_bytes = device->disk_total_bytes;
4316         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
4317         device->type = btrfs_device_type(leaf, dev_item);
4318         device->io_align = btrfs_device_io_align(leaf, dev_item);
4319         device->io_width = btrfs_device_io_width(leaf, dev_item);
4320         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
4321
4322         ptr = (unsigned long)btrfs_device_uuid(dev_item);
4323         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
4324 }

static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;
        int ret;

        BUG_ON(!mutex_is_locked(&uuid_mutex));

        fs_devices = root->fs_info->fs_devices->seed;
        while (fs_devices) {
                if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
                        ret = 0;
                        goto out;
                }
                fs_devices = fs_devices->seed;
        }

        fs_devices = find_fsid(fsid);
        if (!fs_devices) {
                ret = -ENOENT;
                goto out;
        }

        fs_devices = clone_fs_devices(fs_devices);
        if (IS_ERR(fs_devices)) {
                ret = PTR_ERR(fs_devices);
                goto out;
        }

        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
                                   root->fs_info->bdev_holder);
        if (ret) {
                free_fs_devices(fs_devices);
                goto out;
        }

        if (!fs_devices->seeding) {
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
                ret = -EINVAL;
                goto out;
        }

        fs_devices->seed = root->fs_info->fs_devices->seed;
        root->fs_info->fs_devices->seed = fs_devices;
out:
        return ret;
}
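
/*
 * Sketch for illustration (a hypothetical helper, not part of the
 * original file): seed filesystems found by open_seed_devices() are
 * chained off fs_devices->seed, so code that needs every device,
 * sprouted or seed, walks the chain like this.
 */
static u64 __maybe_unused example_count_devices(struct btrfs_fs_devices *fs_devices)
{
        u64 num = 0;

        while (fs_devices) {
                num += fs_devices->num_devices;
                fs_devices = fs_devices->seed;
        }
        return num;
}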

static int read_one_dev(struct btrfs_root *root,
                        struct extent_buffer *leaf,
                        struct btrfs_dev_item *dev_item)
{
        struct btrfs_device *device;
        u64 devid;
        int ret;
        u8 fs_uuid[BTRFS_UUID_SIZE];
        u8 dev_uuid[BTRFS_UUID_SIZE];

        devid = btrfs_device_id(leaf, dev_item);
        read_extent_buffer(leaf, dev_uuid,
                           (unsigned long)btrfs_device_uuid(dev_item),
                           BTRFS_UUID_SIZE);
        read_extent_buffer(leaf, fs_uuid,
                           (unsigned long)btrfs_device_fsid(dev_item),
                           BTRFS_UUID_SIZE);

        if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
                ret = open_seed_devices(root, fs_uuid);
                if (ret && !btrfs_test_opt(root, DEGRADED))
                        return ret;
        }

        device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
        if (!device || !device->bdev) {
                if (!btrfs_test_opt(root, DEGRADED))
                        return -EIO;

                if (!device) {
                        printk(KERN_WARNING "btrfs: devid %llu missing\n",
                               (unsigned long long)devid);
                        device = add_missing_dev(root, devid, dev_uuid);
                        if (!device)
                                return -ENOMEM;
                } else if (!device->missing) {
                        /*
                         * This happens when a device that was properly set up
                         * in the device info lists suddenly goes bad.
                         * device->bdev is NULL, so we have to set
                         * device->missing to 1 here.
                         */
                        root->fs_info->fs_devices->missing_devices++;
                        device->missing = 1;
                }
        }

        if (device->fs_devices != root->fs_info->fs_devices) {
                BUG_ON(device->writeable);
                if (device->generation !=
                    btrfs_device_generation(leaf, dev_item))
                        return -EINVAL;
        }

        fill_device_from_item(leaf, dev_item, device);
        device->dev_root = root->fs_info->dev_root;
        device->in_fs_metadata = 1;
        if (device->writeable) {
                device->fs_devices->total_rw_bytes += device->total_bytes;
                spin_lock(&root->fs_info->free_chunk_lock);
                root->fs_info->free_chunk_space += device->total_bytes -
                        device->bytes_used;
                spin_unlock(&root->fs_info->free_chunk_lock);
        }
        return 0;
}

int btrfs_read_sys_array(struct btrfs_root *root)
{
        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
        struct extent_buffer *sb;
        struct btrfs_disk_key *disk_key;
        struct btrfs_chunk *chunk;
        u8 *ptr;
        unsigned long sb_ptr;
        int ret = 0;
        u32 num_stripes;
        u32 array_size;
        u32 len = 0;
        u32 cur;
        struct btrfs_key key;

        sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
                                          BTRFS_SUPER_INFO_SIZE);
        if (!sb)
                return -ENOMEM;
        btrfs_set_buffer_uptodate(sb);
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
        /*
         * The sb extent buffer is artificial and is used only to read the
         * system array.  The btrfs_set_buffer_uptodate() call does not
         * properly mark all of its pages up-to-date when the page is larger:
         * the extent does not cover the whole page and consequently
         * check_page_uptodate does not find all the page's extents
         * up-to-date (the hole beyond sb), and write_extent_buffer then
         * triggers a WARN_ON.
         *
         * Regular short extents go through a mark_extent_buffer_dirty/
         * writeback cycle, but sb spans only this function.  Add an explicit
         * SetPageUptodate call to silence the warning, e.g. on PowerPC 64.
         */
        if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
                SetPageUptodate(sb->pages[0]);

        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
        array_size = btrfs_super_sys_array_size(super_copy);

        ptr = super_copy->sys_chunk_array;
        sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
        cur = 0;

        while (cur < array_size) {
                disk_key = (struct btrfs_disk_key *)ptr;
                btrfs_disk_key_to_cpu(&key, disk_key);

                len = sizeof(*disk_key);
                ptr += len;
                sb_ptr += len;
                cur += len;

                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
                        chunk = (struct btrfs_chunk *)sb_ptr;
                        ret = read_one_chunk(root, &key, sb, chunk);
                        if (ret)
                                break;
                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
                        len = btrfs_chunk_item_size(num_stripes);
                } else {
                        ret = -EIO;
                        break;
                }
                ptr += len;
                sb_ptr += len;
                cur += len;
        }
        free_extent_buffer(sb);
        return ret;
}
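
/*
 * For reference, a sketch of the arithmetic behind btrfs_chunk_item_size()
 * as used above: chunk items in the sys_chunk_array are variable-sized
 * because the struct btrfs_stripe entries are embedded after the fixed
 * header, and struct btrfs_chunk already ends with one stripe.  This is a
 * hypothetical helper, shown only to make the on-disk layout explicit.
 */
static unsigned long __maybe_unused example_chunk_item_size(int num_stripes)
{
        /* fixed header (including the first stripe) plus the extra stripes */
        return sizeof(struct btrfs_chunk) +
               sizeof(struct btrfs_stripe) * (num_stripes - 1);
}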

int btrfs_read_chunk_tree(struct btrfs_root *root)
{
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        int slot;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        mutex_lock(&uuid_mutex);
        lock_chunks(root);

        /* first we search for all of the device items, and then we
         * read in all of the chunk items.  This way we can create chunk
         * mappings that reference all of the devices that are found.
         */
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.offset = 0;
        key.type = 0;
again:
        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto error;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto error;
                        break;
                }
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
                        if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
                                break;
                        if (found_key.type == BTRFS_DEV_ITEM_KEY) {
                                struct btrfs_dev_item *dev_item;
                                dev_item = btrfs_item_ptr(leaf, slot,
                                                  struct btrfs_dev_item);
                                ret = read_one_dev(root, leaf, dev_item);
                                if (ret)
                                        goto error;
                        }
                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
                        struct btrfs_chunk *chunk;
                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
                        ret = read_one_chunk(root, &found_key, leaf, chunk);
                        if (ret)
                                goto error;
                }
                path->slots[0]++;
        }
        if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
                key.objectid = 0;
                btrfs_release_path(path);
                goto again;
        }
        ret = 0;
error:
        unlock_chunks(root);
        mutex_unlock(&uuid_mutex);

        btrfs_free_path(path);
        return ret;
}
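
/*
 * Usage sketch (the real mount-time caller lives in disk-io.c; this
 * condensed, hypothetical version is illustrative only): the system
 * array must be parsed before the chunk tree is read, because the chunk
 * tree's own blocks live in chunks that only the system array describes.
 */
static int __maybe_unused example_read_chunk_metadata(struct btrfs_root *root)
{
        int ret;

        /* bootstrap the chunk mappings from the superblock's array */
        ret = btrfs_read_sys_array(root);
        if (ret)
                return ret;
        /* now the chunk tree itself can be located and read */
        return btrfs_read_chunk_tree(root);
}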