4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 #include <linux/precache.h>
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
48 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
51 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
53 bh->b_end_io = handler;
54 bh->b_private = private;
56 EXPORT_SYMBOL(init_buffer);
58 static int sleep_on_buffer(void *word)
64 void __lock_buffer(struct buffer_head *bh)
66 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
67 TASK_UNINTERRUPTIBLE);
69 EXPORT_SYMBOL(__lock_buffer);
71 void unlock_buffer(struct buffer_head *bh)
73 clear_bit_unlock(BH_Lock, &bh->b_state);
74 smp_mb__after_clear_bit();
75 wake_up_bit(&bh->b_state, BH_Lock);
77 EXPORT_SYMBOL(unlock_buffer);
80 * Block until a buffer comes unlocked. This doesn't stop it
81 * from becoming locked again - you have to lock it yourself
82 * if you want to preserve its state.
84 void __wait_on_buffer(struct buffer_head * bh)
86 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
88 EXPORT_SYMBOL(__wait_on_buffer);
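/*
 * Illustrative sketch, not part of the original file: because waiting alone
 * does not stop the buffer from being locked again, a caller that needs a
 * stable view must take the lock itself.  "example_copy_buffer" and its
 * destination pointer are hypothetical.
 */
static inline void example_copy_buffer(struct buffer_head *bh, void *dst)
{
	lock_buffer(bh);			/* may sleep in __lock_buffer() */
	if (buffer_uptodate(bh))
		memcpy(dst, bh->b_data, bh->b_size);
	unlock_buffer(bh);			/* clears BH_Lock, wakes waiters */
}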
91 __clear_page_buffers(struct page *page)
93 ClearPagePrivate(page);
94 set_page_private(page, 0);
95 page_cache_release(page);
99 static int quiet_error(struct buffer_head *bh)
101 if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
107 static void buffer_io_error(struct buffer_head *bh)
109 char b[BDEVNAME_SIZE];
110 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
111 bdevname(bh->b_bdev, b),
112 (unsigned long long)bh->b_blocknr);
116 * End-of-IO handler helper function which does not touch the bh after
118 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
119 * a race there is benign: unlock_buffer() only uses the bh's address for
120 * hashing after unlocking the buffer, so it doesn't actually touch the bh
123 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
126 set_buffer_uptodate(bh);
128 /* This happens, due to failed READA attempts. */
129 clear_buffer_uptodate(bh);
135 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
136 * unlock the buffer. This is what ll_rw_block uses too.
138 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
140 __end_buffer_read_notouch(bh, uptodate);
143 EXPORT_SYMBOL(end_buffer_read_sync);
145 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
147 char b[BDEVNAME_SIZE];
150 set_buffer_uptodate(bh);
152 if (!quiet_error(bh)) {
154 printk(KERN_WARNING "lost page write due to "
156 bdevname(bh->b_bdev, b));
158 set_buffer_write_io_error(bh);
159 clear_buffer_uptodate(bh);
164 EXPORT_SYMBOL(end_buffer_write_sync);
167 * Various filesystems appear to want __find_get_block to be non-blocking.
168 * But it's the page lock which protects the buffers. To get around this,
169 * we get exclusion from try_to_free_buffers with the blockdev mapping's
172 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
173 * may be quite high. This code could TryLock the page, and if that
174 * succeeds, there is no need to take private_lock. (But if
175 * private_lock is contended then so is mapping->tree_lock).
177 static struct buffer_head *
178 __find_get_block_slow(struct block_device *bdev, sector_t block)
180 struct inode *bd_inode = bdev->bd_inode;
181 struct address_space *bd_mapping = bd_inode->i_mapping;
182 struct buffer_head *ret = NULL;
184 struct buffer_head *bh;
185 struct buffer_head *head;
189 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
190 page = find_get_page(bd_mapping, index);
194 spin_lock(&bd_mapping->private_lock);
195 if (!page_has_buffers(page))
197 head = page_buffers(page);
200 if (!buffer_mapped(bh))
202 else if (bh->b_blocknr == block) {
207 bh = bh->b_this_page;
208 } while (bh != head);
210 /* we might be here because some of the buffers on this page are
211 * not mapped. This is due to various races between
212 * file io on the block device and getblk. It gets dealt with
213 * elsewhere, don't buffer_error if we had some unmapped buffers
216 printk("__find_get_block_slow() failed. "
217 "block=%llu, b_blocknr=%llu\n",
218 (unsigned long long)block,
219 (unsigned long long)bh->b_blocknr);
220 printk("b_state=0x%08lx, b_size=%zu\n",
221 bh->b_state, bh->b_size);
222 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
225 spin_unlock(&bd_mapping->private_lock);
226 page_cache_release(page);
231 /* If invalidate_buffers() will trash dirty buffers, it means some kind
232 of fs corruption is going on. Trashing dirty data always implies losing
233 information that was supposed to be just stored on the physical layer
236 Thus invalidate_buffers in general usage is not allowed to trash
237 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
238 be preserved. These buffers are simply skipped.
240 We also skip buffers which are still in use. For example this can
241 happen if a userspace program is reading the block device.
243 NOTE: if the user removes a removable-media disk while there is still
244 dirty data not yet synced to disk (due to a bug in the device driver or
245 to a user error), then by not destroying the dirty buffers we could also
246 corrupt the next media inserted; a parameter is therefore necessary to
247 handle this case as safely as possible (trying not to corrupt the newly
248 inserted disk with data belonging to the old, now corrupted, disk). For
249 the ramdisk, on the other hand, the natural way to release the ramdisk
250 memory is to destroy its dirty buffers.
252 These are two special cases. Normal usage implies that the device driver
253 issues a sync on the device (without waiting for I/O completion) and
254 then calls invalidate_buffers in a way that doesn't trash dirty buffers.
256 For handling cache coherency with the blkdev pagecache, the 'update' case
257 has been introduced. It is needed to re-read from disk any pinned
258 buffer. NOTE: re-reading from disk is destructive so we can do it only
259 when we assume nobody is changing the buffercache under our I/O and when
260 we think the disk contains more recent information than the buffercache.
261 The update == 1 pass marks the buffers we need to update, the update == 2
262 pass does the actual I/O. */
263 void invalidate_bdev(struct block_device *bdev)
265 struct address_space *mapping = bdev->bd_inode->i_mapping;
267 if (mapping->nrpages == 0)
270 invalidate_bh_lrus();
271 lru_add_drain_all(); /* make sure all lru add caches are flushed */
272 invalidate_mapping_pages(mapping, 0, -1);
274 /* 99% of the time, we don't need to flush the precache on the bdev.
275 * But, for the strange corners, let's be cautious
277 precache_flush_inode(mapping);
279 EXPORT_SYMBOL(invalidate_bdev);
282 * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
284 static void free_more_memory(void)
289 wakeup_flusher_threads(1024);
292 for_each_online_node(nid) {
293 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
294 gfp_zone(GFP_NOFS), NULL,
297 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
303 * I/O completion handler for block_read_full_page() - pages
304 * which come unlocked at the end of I/O.
306 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
309 struct buffer_head *first;
310 struct buffer_head *tmp;
312 int page_uptodate = 1;
314 BUG_ON(!buffer_async_read(bh));
318 set_buffer_uptodate(bh);
320 clear_buffer_uptodate(bh);
321 if (!quiet_error(bh))
327 * Be _very_ careful from here on. Bad things can happen if
328 * two buffer heads end IO at almost the same time and both
329 * decide that the page is now completely done.
331 first = page_buffers(page);
332 local_irq_save(flags);
333 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
334 clear_buffer_async_read(bh);
338 if (!buffer_uptodate(tmp))
340 if (buffer_async_read(tmp)) {
341 BUG_ON(!buffer_locked(tmp));
344 tmp = tmp->b_this_page;
346 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
347 local_irq_restore(flags);
350 * If none of the buffers had errors and they are all
351 * uptodate then we can set the page uptodate.
353 if (page_uptodate && !PageError(page))
354 SetPageUptodate(page);
359 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
360 local_irq_restore(flags);
365 * Completion handler for block_write_full_page() - pages which are unlocked
366 * during I/O, and which have PageWriteback cleared upon I/O completion.
368 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
370 char b[BDEVNAME_SIZE];
372 struct buffer_head *first;
373 struct buffer_head *tmp;
376 BUG_ON(!buffer_async_write(bh));
380 set_buffer_uptodate(bh);
382 if (!quiet_error(bh)) {
384 printk(KERN_WARNING "lost page write due to "
386 bdevname(bh->b_bdev, b));
388 set_bit(AS_EIO, &page->mapping->flags);
389 set_buffer_write_io_error(bh);
390 clear_buffer_uptodate(bh);
394 first = page_buffers(page);
395 local_irq_save(flags);
396 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
398 clear_buffer_async_write(bh);
400 tmp = bh->b_this_page;
402 if (buffer_async_write(tmp)) {
403 BUG_ON(!buffer_locked(tmp));
406 tmp = tmp->b_this_page;
408 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
409 local_irq_restore(flags);
410 end_page_writeback(page);
414 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
415 local_irq_restore(flags);
418 EXPORT_SYMBOL(end_buffer_async_write);
421 * If a page's buffers are under async readin (end_buffer_async_read
422 * completion) then there is a possibility that another thread of
423 * control could lock one of the buffers after it has completed
424 * but while some of the other buffers have not completed. This
425 * locked buffer would confuse end_buffer_async_read() into not unlocking
426 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
427 * that this buffer is not under async I/O.
429 * The page comes unlocked when it has no locked buffer_async buffers
432 * PageLocked prevents anyone from starting new async I/O against any of
435 * PageWriteback is used to prevent simultaneous writeout of the same
438 * PageLocked prevents anyone from starting writeback of a page which is
439 * under read I/O (PageWriteback is only ever set against a locked page).
441 static void mark_buffer_async_read(struct buffer_head *bh)
443 bh->b_end_io = end_buffer_async_read;
444 set_buffer_async_read(bh);
447 static void mark_buffer_async_write_endio(struct buffer_head *bh,
448 bh_end_io_t *handler)
450 bh->b_end_io = handler;
451 set_buffer_async_write(bh);
454 void mark_buffer_async_write(struct buffer_head *bh)
456 mark_buffer_async_write_endio(bh, end_buffer_async_write);
458 EXPORT_SYMBOL(mark_buffer_async_write);
462 * fs/buffer.c contains helper functions for buffer-backed address space's
463 * fsync functions. A common requirement for buffer-based filesystems is
464 * that certain data from the backing blockdev needs to be written out for
465 * a successful fsync(). For example, ext2 indirect blocks need to be
466 * written back and waited upon before fsync() returns.
468 * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
469 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
470 * management of a list of dependent buffers at ->i_mapping->private_list.
472 * Locking is a little subtle: try_to_free_buffers() will remove buffers
473 * from their controlling inode's queue when they are being freed. But
474 * try_to_free_buffers() will be operating against the *blockdev* mapping
475 * at the time, not against the S_ISREG file which depends on those buffers.
476 * So the locking for private_list is via the private_lock in the address_space
477 * which backs the buffers. Which is different from the address_space
478 * against which the buffers are listed. So for a particular address_space,
479 * mapping->private_lock does *not* protect mapping->private_list! In fact,
480 * mapping->private_list will always be protected by the backing blockdev's
483 * Which introduces a requirement: all buffers on an address_space's
484 * ->private_list must be from the same address_space: the blockdev's.
486 * address_spaces which do not place buffers at ->private_list via these
487 * utility functions are free to use private_lock and private_list for
488 * whatever they want. The only requirement is that list_empty(private_list)
489 * be true at clear_inode() time.
491 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
492 * filesystems should do that. invalidate_inode_buffers() should just go
493 * BUG_ON(!list_empty).
495 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
496 * take an address_space, not an inode. And it should be called
497 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
500 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
501 * list if it is already on a list. Because if the buffer is on a list,
502 * it *must* already be on the right one. If not, the filesystem is being
503 * silly. This will save a ton of locking. But first we have to ensure
504 * that buffers are taken *off* the old inode's list when they are freed
505 * (presumably in truncate). That requires careful auditing of all
506 * filesystems (do it inside bforget()). It could also be done by bringing
511 * The buffer's backing address_space's private_lock must be held
513 static void __remove_assoc_queue(struct buffer_head *bh)
515 list_del_init(&bh->b_assoc_buffers);
516 WARN_ON(!bh->b_assoc_map);
517 if (buffer_write_io_error(bh))
518 set_bit(AS_EIO, &bh->b_assoc_map->flags);
519 bh->b_assoc_map = NULL;
522 int inode_has_buffers(struct inode *inode)
524 return !list_empty(&inode->i_data.private_list);
528 * osync is designed to support O_SYNC io. It waits synchronously for
529 * all already-submitted IO to complete, but does not queue any new
530 * writes to the disk.
532 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
533 * you dirty the buffers, and then use osync_inode_buffers to wait for
534 * completion. Any other dirty buffers which are not yet queued for
535 * write will not be flushed to disk by the osync.
537 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
539 struct buffer_head *bh;
545 list_for_each_prev(p, list) {
547 if (buffer_locked(bh)) {
551 if (!buffer_uptodate(bh))
562 static void do_thaw_one(struct super_block *sb, void *unused)
564 char b[BDEVNAME_SIZE];
565 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
566 printk(KERN_WARNING "Emergency Thaw on %s\n",
567 bdevname(sb->s_bdev, b));
570 static void do_thaw_all(struct work_struct *work)
572 iterate_supers(do_thaw_one, NULL);
574 printk(KERN_WARNING "Emergency Thaw complete\n");
578 * emergency_thaw_all -- forcibly thaw every frozen filesystem
580 * Used for emergency unfreeze of all filesystems via SysRq
582 void emergency_thaw_all(void)
584 struct work_struct *work;
586 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 INIT_WORK(work, do_thaw_all);
594 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
595 * @mapping: the mapping which wants those buffers written
597 * Starts I/O against the buffers at mapping->private_list, and waits upon
600 * Basically, this is a convenience function for fsync().
601 * @mapping is a file or directory which needs those buffers to be written for
602 * a successful fsync().
604 int sync_mapping_buffers(struct address_space *mapping)
606 struct address_space *buffer_mapping = mapping->assoc_mapping;
608 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
611 return fsync_buffers_list(&buffer_mapping->private_lock,
612 &mapping->private_list);
614 EXPORT_SYMBOL(sync_mapping_buffers);
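/*
 * Illustrative sketch, not part of the original file: a simple filesystem's
 * ->fsync can write and wait on the data pages and then use
 * sync_mapping_buffers() for the associated metadata buffers that were put
 * on ->private_list via mark_buffer_dirty_inode() (sync_mapping_buffers()
 * is also what generic_file_fsync() uses for this step).  "example_fsync"
 * and its error handling are simplified assumptions.
 */
static inline int example_fsync(struct file *file, int datasync)
{
	struct address_space *mapping = file->f_mapping;
	int err, ret;

	ret = filemap_write_and_wait(mapping);		/* data pages */
	err = sync_mapping_buffers(mapping);		/* ->private_list buffers */
	if (!ret)
		ret = err;
	return ret;
}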
617 * Called when we've recently written block `bblock', and it is known that
618 * `bblock' was for a buffer_boundary() buffer. This means that the block at
619 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
620 * dirty, schedule it for IO. So that indirects merge nicely with their data.
622 void write_boundary_block(struct block_device *bdev,
623 sector_t bblock, unsigned blocksize)
625 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
627 if (buffer_dirty(bh))
628 ll_rw_block(WRITE, 1, &bh);
633 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
635 struct address_space *mapping = inode->i_mapping;
636 struct address_space *buffer_mapping = bh->b_page->mapping;
638 mark_buffer_dirty(bh);
639 if (!mapping->assoc_mapping) {
640 mapping->assoc_mapping = buffer_mapping;
642 BUG_ON(mapping->assoc_mapping != buffer_mapping);
644 if (!bh->b_assoc_map) {
645 spin_lock(&buffer_mapping->private_lock);
646 list_move_tail(&bh->b_assoc_buffers,
647 &mapping->private_list);
648 bh->b_assoc_map = mapping;
649 spin_unlock(&buffer_mapping->private_lock);
652 EXPORT_SYMBOL(mark_buffer_dirty_inode);
655 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
658 * If warn is true, then emit a warning if the page is not uptodate and has
659 * not been truncated.
661 static void __set_page_dirty(struct page *page,
662 struct address_space *mapping, int warn)
664 spin_lock_irq(&mapping->tree_lock);
665 if (page->mapping) { /* Race with truncate? */
666 WARN_ON_ONCE(warn && !PageUptodate(page));
667 account_page_dirtied(page, mapping);
668 radix_tree_tag_set(&mapping->page_tree,
669 page_index(page), PAGECACHE_TAG_DIRTY);
671 spin_unlock_irq(&mapping->tree_lock);
672 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
676 * Add a page to the dirty page list.
678 * It is a sad fact of life that this function is called from several places
679 * deeply under spinlocking. It may not sleep.
681 * If the page has buffers, the uptodate buffers are set dirty, to preserve
682 * dirty-state coherency between the page and the buffers. If the page does
683 * not have buffers then when they are later attached they will all be set
686 * The buffers are dirtied before the page is dirtied. There's a small race
687 * window in which a writepage caller may see the page cleanness but not the
688 * buffer dirtiness. That's fine. If this code were to set the page dirty
689 * before the buffers, a concurrent writepage caller could clear the page dirty
690 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
691 * page on the dirty page list.
693 * We use private_lock to lock against try_to_free_buffers while using the
694 * page's buffer list. Also use this to protect against clean buffers being
695 * added to the page after it was set dirty.
697 * FIXME: may need to call ->reservepage here as well. That's rather up to the
698 * address_space though.
700 int __set_page_dirty_buffers(struct page *page)
703 struct address_space *mapping = page_mapping(page);
705 if (unlikely(!mapping))
706 return !TestSetPageDirty(page);
708 spin_lock(&mapping->private_lock);
709 if (page_has_buffers(page)) {
710 struct buffer_head *head = page_buffers(page);
711 struct buffer_head *bh = head;
714 set_buffer_dirty(bh);
715 bh = bh->b_this_page;
716 } while (bh != head);
718 newly_dirty = !TestSetPageDirty(page);
719 spin_unlock(&mapping->private_lock);
722 __set_page_dirty(page, mapping, 1);
725 EXPORT_SYMBOL(__set_page_dirty_buffers);
728 * Write out and wait upon a list of buffers.
730 * We have conflicting pressures: we want to make sure that all
731 * initially dirty buffers get waited on, but that any subsequently
732 * dirtied buffers don't. After all, we don't want fsync to last
733 * forever if somebody is actively writing to the file.
735 * Do this in two main stages: first we copy dirty buffers to a
736 * temporary inode list, queueing the writes as we go. Then we clean
737 * up, waiting for those writes to complete.
739 * During this second stage, any subsequent updates to the file may end
740 * up refiling the buffer on the original inode's dirty list again, so
741 * there is a chance we will end up with a buffer queued for write but
742 * not yet completed on that list. So, as a final cleanup we go through
743 * the osync code to catch these locked, dirty buffers without requeuing
744 * any newly dirty buffers for write.
746 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
748 struct buffer_head *bh;
749 struct list_head tmp;
750 struct address_space *mapping;
752 struct blk_plug plug;
754 INIT_LIST_HEAD(&tmp);
755 blk_start_plug(&plug);
758 while (!list_empty(list)) {
759 bh = BH_ENTRY(list->next);
760 mapping = bh->b_assoc_map;
761 __remove_assoc_queue(bh);
762 /* Avoid race with mark_buffer_dirty_inode() which does
763 * a lockless check, so we rely on seeing the dirty bit */
765 if (buffer_dirty(bh) || buffer_locked(bh)) {
766 list_add(&bh->b_assoc_buffers, &tmp);
767 bh->b_assoc_map = mapping;
768 if (buffer_dirty(bh)) {
772 * Ensure any pending I/O completes so that
773 * write_dirty_buffer() actually writes the
774 * current contents - it is a noop if I/O is
775 * still in flight on potentially older
778 write_dirty_buffer(bh, WRITE_SYNC);
781 * Kick off IO for the previous mapping. Note
782 * that we will not run the very last mapping,
783 * wait_on_buffer() will do that for us
784 * through sync_buffer().
793 blk_finish_plug(&plug);
796 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev);
799 mapping = bh->b_assoc_map;
800 __remove_assoc_queue(bh);
801 /* Avoid race with mark_buffer_dirty_inode() which does
802 * a lockless check, so we rely on seeing the dirty bit */
804 if (buffer_dirty(bh)) {
805 list_add(&bh->b_assoc_buffers,
806 &mapping->private_list);
807 bh->b_assoc_map = mapping;
811 if (!buffer_uptodate(bh))
818 err2 = osync_buffers_list(lock, list);
826 * Invalidate any and all dirty buffers on a given inode. We are
827 * probably unmounting the fs, but that doesn't mean we have already
828 * done a sync(). Just drop the buffers from the inode list.
830 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
831 * assumes that all the buffers are against the blockdev. Not true
834 void invalidate_inode_buffers(struct inode *inode)
836 if (inode_has_buffers(inode)) {
837 struct address_space *mapping = &inode->i_data;
838 struct list_head *list = &mapping->private_list;
839 struct address_space *buffer_mapping = mapping->assoc_mapping;
841 spin_lock(&buffer_mapping->private_lock);
842 while (!list_empty(list))
843 __remove_assoc_queue(BH_ENTRY(list->next));
844 spin_unlock(&buffer_mapping->private_lock);
847 EXPORT_SYMBOL(invalidate_inode_buffers);
850 * Remove any clean buffers from the inode's buffer list. This is called
851 * when we're trying to free the inode itself. Those buffers can pin it.
853 * Returns true if all buffers were removed.
855 int remove_inode_buffers(struct inode *inode)
859 if (inode_has_buffers(inode)) {
860 struct address_space *mapping = &inode->i_data;
861 struct list_head *list = &mapping->private_list;
862 struct address_space *buffer_mapping = mapping->assoc_mapping;
864 spin_lock(&buffer_mapping->private_lock);
865 while (!list_empty(list)) {
866 struct buffer_head *bh = BH_ENTRY(list->next);
867 if (buffer_dirty(bh)) {
871 __remove_assoc_queue(bh);
873 spin_unlock(&buffer_mapping->private_lock);
879 * Create the appropriate buffers when given a page for the data area and
880 * the size of each buffer. Use the bh->b_this_page linked list to
881 * follow the buffers created. Return NULL if unable to create more
884 * The retry flag is used to differentiate async IO (paging, swapping)
885 * which may not fail from ordinary buffer allocations.
887 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
890 struct buffer_head *bh, *head;
896 while ((offset -= size) >= 0) {
897 bh = alloc_buffer_head(GFP_NOFS);
902 bh->b_this_page = head;
907 atomic_set(&bh->b_count, 0);
910 /* Link the buffer to its page */
911 set_bh_page(bh, page, offset);
913 init_buffer(bh, NULL, NULL);
917 * In case anything failed, we just free everything we got.
923 head = head->b_this_page;
924 free_buffer_head(bh);
929 * Return failure for non-async IO requests. Async IO requests
930 * are not allowed to fail, so we have to wait until buffer heads
931 * become available. But we don't want tasks sleeping with
932 * partially complete buffers, so all were released above.
937 /* We're _really_ low on memory. Now we just
938 * wait for old buffer heads to become free due to
939 * finishing IO. Since this is an async request and
940 * the reserve list is empty, we're sure there are
941 * async buffer heads in use.
946 EXPORT_SYMBOL_GPL(alloc_page_buffers);
949 link_dev_buffers(struct page *page, struct buffer_head *head)
951 struct buffer_head *bh, *tail;
956 bh = bh->b_this_page;
958 tail->b_this_page = head;
959 attach_page_buffers(page, head);
963 * Initialise the state of a blockdev page's buffers.
966 init_page_buffers(struct page *page, struct block_device *bdev,
967 sector_t block, int size)
969 struct buffer_head *head = page_buffers(page);
970 struct buffer_head *bh = head;
971 int uptodate = PageUptodate(page);
974 if (!buffer_mapped(bh)) {
975 init_buffer(bh, NULL, NULL);
977 bh->b_blocknr = block;
979 set_buffer_uptodate(bh);
980 set_buffer_mapped(bh);
983 bh = bh->b_this_page;
984 } while (bh != head);
988 * Create the page-cache page that contains the requested block.
990 * This is used purely for blockdev mappings.
993 grow_dev_page(struct block_device *bdev, sector_t block,
994 pgoff_t index, int size)
996 struct inode *inode = bdev->bd_inode;
998 struct buffer_head *bh;
1000 page = find_or_create_page(inode->i_mapping, index,
1001 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1005 BUG_ON(!PageLocked(page));
1007 if (page_has_buffers(page)) {
1008 bh = page_buffers(page);
1009 if (bh->b_size == size) {
1010 init_page_buffers(page, bdev, block, size);
1013 if (!try_to_free_buffers(page))
1018 * Allocate some buffers for this page
1020 bh = alloc_page_buffers(page, size, 0);
1025 * Link the page to the buffers and initialise them. Take the
1026 * lock to be atomic wrt __find_get_block(), which does not
1027 * run under the page lock.
1029 spin_lock(&inode->i_mapping->private_lock);
1030 link_dev_buffers(page, bh);
1031 init_page_buffers(page, bdev, block, size);
1032 spin_unlock(&inode->i_mapping->private_lock);
1038 page_cache_release(page);
1043 * Create buffers for the specified block device block's page. If
1044 * that page was dirty, the buffers are set dirty also.
1047 grow_buffers(struct block_device *bdev, sector_t block, int size)
1056 } while ((size << sizebits) < PAGE_SIZE);
1058 index = block >> sizebits;
1061 * Check for a block which wants to lie outside our maximum possible
1062 * pagecache index. (this comparison is done using sector_t types).
1064 if (unlikely(index != block >> sizebits)) {
1065 char b[BDEVNAME_SIZE];
1067 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1069 __func__, (unsigned long long)block,
1073 block = index << sizebits;
1074 /* Create a page with the proper size buffers.. */
1075 page = grow_dev_page(bdev, block, index, size);
1079 page_cache_release(page);
1083 static struct buffer_head *
1084 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1086 /* Size must be a multiple of the hard sector size */
1087 if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1088 (size < 512 || size > PAGE_SIZE))) {
1089 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1091 printk(KERN_ERR "logical block size: %d\n",
1092 bdev_logical_block_size(bdev));
1099 struct buffer_head * bh;
1102 bh = __find_get_block(bdev, block, size);
1106 ret = grow_buffers(bdev, block, size);
1115 * The relationship between dirty buffers and dirty pages:
1117 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1118 * the page is tagged dirty in its radix tree.
1120 * At all times, the dirtiness of the buffers represents the dirtiness of
1121 * subsections of the page. If the page has buffers, the page dirty bit is
1122 * merely a hint about the true dirty state.
1124 * When a page is set dirty in its entirety, all its buffers are marked dirty
1125 * (if the page has buffers).
1127 * When a buffer is marked dirty, its page is dirtied, but the page's other
1130 * Also. When blockdev buffers are explicitly read with bread(), they
1131 * individually become uptodate. But their backing page remains not
1132 * uptodate - even if all of its buffers are uptodate. A subsequent
1133 * block_read_full_page() against that page will discover all the uptodate
1134 * buffers, will set the page uptodate and will perform no I/O.
1138 * mark_buffer_dirty - mark a buffer_head as needing writeout
1139 * @bh: the buffer_head to mark dirty
1141 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1142 * backing page dirty, then tag the page as dirty in its address_space's radix
1143 * tree and then attach the address_space's inode to its superblock's dirty
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and mapping->host->i_lock.
1149 void mark_buffer_dirty(struct buffer_head *bh)
1151 WARN_ON_ONCE(!buffer_uptodate(bh));
1154 * Very *carefully* optimize the it-is-already-dirty case.
1156 * Don't let the final "is it dirty" escape to before we
1157 * perhaps modified the buffer.
1159 if (buffer_dirty(bh)) {
1161 if (buffer_dirty(bh))
1165 if (!test_set_buffer_dirty(bh)) {
1166 struct page *page = bh->b_page;
1167 if (!TestSetPageDirty(page)) {
1168 struct address_space *mapping = page_mapping(page);
1170 __set_page_dirty(page, mapping, 0);
1174 EXPORT_SYMBOL(mark_buffer_dirty);
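/*
 * Illustrative sketch, not part of the original file: the usual pattern for
 * updating a metadata block is to read it, modify it under the buffer lock,
 * and let mark_buffer_dirty() propagate the dirtiness to the page and inode.
 * "example_update_block" is hypothetical and assumes len <= blocksize.
 */
static inline int example_update_block(struct super_block *sb, sector_t block,
				       const void *data, size_t len)
{
	struct buffer_head *bh = sb_bread(sb, block);

	if (!bh)
		return -EIO;
	lock_buffer(bh);
	memcpy(bh->b_data, data, len);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);		/* dirty bit, page dirty, inode dirty */
	brelse(bh);
	return 0;
}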
1177 * Decrement a buffer_head's reference count. If all buffers against a page
1178 * have zero reference count, are clean and unlocked, and if the page is clean
1179 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1180 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1181 * a page but it ends up not being freed, and buffers may later be reattached).
1183 void __brelse(struct buffer_head * buf)
1185 if (atomic_read(&buf->b_count)) {
1189 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1191 EXPORT_SYMBOL(__brelse);
1194 * bforget() is like brelse(), except it discards any
1195 * potentially dirty data.
1197 void __bforget(struct buffer_head *bh)
1199 clear_buffer_dirty(bh);
1200 if (bh->b_assoc_map) {
1201 struct address_space *buffer_mapping = bh->b_page->mapping;
1203 spin_lock(&buffer_mapping->private_lock);
1204 list_del_init(&bh->b_assoc_buffers);
1205 bh->b_assoc_map = NULL;
1206 spin_unlock(&buffer_mapping->private_lock);
1210 EXPORT_SYMBOL(__bforget);
1212 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1215 if (buffer_uptodate(bh)) {
1220 bh->b_end_io = end_buffer_read_sync;
1221 submit_bh(READ, bh);
1223 if (buffer_uptodate(bh))
1231 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1232 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1233 * refcount elevated by one when they're in an LRU. A buffer can only appear
1234 * once in a particular CPU's LRU. A single buffer can be present in multiple
1235 * CPU's LRUs at the same time.
1237 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1238 * sb_find_get_block().
1240 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1241 * a local interrupt disable for that.
1244 #define BH_LRU_SIZE 8
1247 struct buffer_head *bhs[BH_LRU_SIZE];
1250 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1253 #define bh_lru_lock() local_irq_disable()
1254 #define bh_lru_unlock() local_irq_enable()
1256 #define bh_lru_lock() preempt_disable()
1257 #define bh_lru_unlock() preempt_enable()
1260 static inline void check_irqs_on(void)
1262 #ifdef irqs_disabled
1263 BUG_ON(irqs_disabled());
1268 * The LRU management algorithm is dopey-but-simple. Sorry.
1270 static void bh_lru_install(struct buffer_head *bh)
1272 struct buffer_head *evictee = NULL;
1276 if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1277 struct buffer_head *bhs[BH_LRU_SIZE];
1283 for (in = 0; in < BH_LRU_SIZE; in++) {
1284 struct buffer_head *bh2 =
1285 __this_cpu_read(bh_lrus.bhs[in]);
1290 if (out >= BH_LRU_SIZE) {
1291 BUG_ON(evictee != NULL);
1298 while (out < BH_LRU_SIZE)
1300 memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1309 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1311 static struct buffer_head *
1312 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1314 struct buffer_head *ret = NULL;
1319 for (i = 0; i < BH_LRU_SIZE; i++) {
1320 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1322 if (bh && bh->b_bdev == bdev &&
1323 bh->b_blocknr == block && bh->b_size == size) {
1326 __this_cpu_write(bh_lrus.bhs[i],
1327 __this_cpu_read(bh_lrus.bhs[i - 1]));
1330 __this_cpu_write(bh_lrus.bhs[0], bh);
1342 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1343 * it in the LRU and mark it as accessed. If it is not present then return
1346 struct buffer_head *
1347 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1349 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1352 bh = __find_get_block_slow(bdev, block);
1360 EXPORT_SYMBOL(__find_get_block);
1363 * __getblk will locate (and, if necessary, create) the buffer_head
1364 * which corresponds to the passed block_device, block and size. The
1365 * returned buffer has its reference count incremented.
1367 * __getblk() cannot fail - it just keeps trying. If you pass it an
1368 * illegal block number, __getblk() will happily return a buffer_head
1369 * which represents the non-existent block. Very weird.
1371 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1372 * attempt is failing. FIXME, perhaps?
1374 struct buffer_head *
1375 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1377 struct buffer_head *bh = __find_get_block(bdev, block, size);
1381 bh = __getblk_slow(bdev, block, size);
1384 EXPORT_SYMBOL(__getblk);
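/*
 * Illustrative sketch, not part of the original file: when a block will be
 * completely overwritten there is no need to read it first, so __getblk()
 * (or sb_getblk()) is enough; the caller fills the buffer and marks it
 * uptodate and dirty.  "example_overwrite_block" is hypothetical.
 */
static inline void example_overwrite_block(struct block_device *bdev,
					   sector_t block, unsigned size,
					   const void *data)
{
	struct buffer_head *bh = __getblk(bdev, block, size);

	lock_buffer(bh);			/* __getblk() does not fail */
	memcpy(bh->b_data, data, size);
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty(bh);
	brelse(bh);
}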
1387 * Do async read-ahead on a buffer..
1389 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1391 struct buffer_head *bh = __getblk(bdev, block, size);
1393 ll_rw_block(READA, 1, &bh);
1397 EXPORT_SYMBOL(__breadahead);
1400 * __bread() - reads a specified block and returns the bh
1401 * @bdev: the block_device to read from
1402 * @block: number of block
1403 * @size: size (in bytes) to read
1405 * Reads a specified block, and returns buffer head that contains it.
1406 * It returns NULL if the block was unreadable.
1408 struct buffer_head *
1409 __bread(struct block_device *bdev, sector_t block, unsigned size)
1411 struct buffer_head *bh = __getblk(bdev, block, size);
1413 if (likely(bh) && !buffer_uptodate(bh))
1414 bh = __bread_slow(bh);
1417 EXPORT_SYMBOL(__bread);
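/*
 * Illustrative sketch, not part of the original file: reading one metadata
 * block with __bread() and copying it out.  The "example_read_block" name is
 * an arbitrary assumption; a NULL return means the block was unreadable.
 */
static inline int example_read_block(struct block_device *bdev, sector_t block,
				     unsigned size, void *out)
{
	struct buffer_head *bh = __bread(bdev, block, size);

	if (!bh)
		return -EIO;
	memcpy(out, bh->b_data, size);
	brelse(bh);			/* drop the reference __bread() took */
	return 0;
}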
1420 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1421 * This doesn't race because it runs in each cpu either in irq
1422 * or with preempt disabled.
1424 static void invalidate_bh_lru(void *arg)
1426 struct bh_lru *b = &get_cpu_var(bh_lrus);
1429 for (i = 0; i < BH_LRU_SIZE; i++) {
1433 put_cpu_var(bh_lrus);
1436 void invalidate_bh_lrus(void)
1438 on_each_cpu(invalidate_bh_lru, NULL, 1);
1440 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1442 void set_bh_page(struct buffer_head *bh,
1443 struct page *page, unsigned long offset)
1446 BUG_ON(offset >= PAGE_SIZE);
1447 if (PageHighMem(page))
1449 * This catches illegal uses and preserves the offset:
1451 bh->b_data = (char *)(0 + offset);
1453 bh->b_data = page_address(page) + offset;
1455 EXPORT_SYMBOL(set_bh_page);
1458 * Called when truncating a buffer on a page completely.
1460 static void discard_buffer(struct buffer_head * bh)
1463 clear_buffer_dirty(bh);
1465 clear_buffer_mapped(bh);
1466 clear_buffer_req(bh);
1467 clear_buffer_new(bh);
1468 clear_buffer_delay(bh);
1469 clear_buffer_unwritten(bh);
1474 * block_invalidatepage - invalidate part or all of a buffer-backed page
1476 * @page: the page which is affected
1477 * @offset: the index of the truncation point
1479 * block_invalidatepage() is called when all or part of the page has become
1480 * invalidated by a truncate operation.
1482 * block_invalidatepage() does not have to release all buffers, but it must
1483 * ensure that no dirty buffer is left outside @offset and that no I/O
1484 * is underway against any of the blocks which are outside the truncation
1485 * point. Because the caller is about to free (and possibly reuse) those
1488 void block_invalidatepage(struct page *page, unsigned long offset)
1490 struct buffer_head *head, *bh, *next;
1491 unsigned int curr_off = 0;
1493 BUG_ON(!PageLocked(page));
1494 if (!page_has_buffers(page))
1497 head = page_buffers(page);
1500 unsigned int next_off = curr_off + bh->b_size;
1501 next = bh->b_this_page;
1504 * is this block fully invalidated?
1506 if (offset <= curr_off)
1508 curr_off = next_off;
1510 } while (bh != head);
1513 * We release buffers only if the entire page is being invalidated.
1514 * The get_block cached value has been unconditionally invalidated,
1515 * so real IO is not possible anymore.
1518 try_to_release_page(page, 0);
1522 EXPORT_SYMBOL(block_invalidatepage);
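/*
 * Illustrative note, not part of the original file: a filesystem whose pages
 * carry no private state beyond ordinary buffers can normally leave
 * ->invalidatepage unset (the truncate path then falls back to
 * block_invalidatepage()) or point it there explicitly in its
 * address_space_operations, e.g.:
 *
 *	.invalidatepage	= block_invalidatepage,
 *
 * Filesystems with extra bookkeeping (journaling, delalloc state) do their
 * own cleanup before or instead of calling it.
 */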
1525 * We attach and possibly dirty the buffers atomically wrt
1526 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1527 * is already excluded via the page lock.
1529 void create_empty_buffers(struct page *page,
1530 unsigned long blocksize, unsigned long b_state)
1532 struct buffer_head *bh, *head, *tail;
1534 head = alloc_page_buffers(page, blocksize, 1);
1537 bh->b_state |= b_state;
1539 bh = bh->b_this_page;
1541 tail->b_this_page = head;
1543 spin_lock(&page->mapping->private_lock);
1544 if (PageUptodate(page) || PageDirty(page)) {
1547 if (PageDirty(page))
1548 set_buffer_dirty(bh);
1549 if (PageUptodate(page))
1550 set_buffer_uptodate(bh);
1551 bh = bh->b_this_page;
1552 } while (bh != head);
1554 attach_page_buffers(page, head);
1555 spin_unlock(&page->mapping->private_lock);
1557 EXPORT_SYMBOL(create_empty_buffers);
1560 * We are taking a block for data and we don't want any output from any
1561 * buffer-cache aliases starting from return from that function and
1562 * until the moment when something will explicitly mark the buffer
1563 * dirty (hopefully that will not happen until we will free that block ;-)
1564 * We don't even need to mark it not-uptodate - nobody can expect
1565 * anything from a newly allocated buffer anyway. We used to use
1566 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1567 * don't want to mark the alias unmapped, for example - it would confuse
1568 * anyone who might pick it with bread() afterwards...
1570 * Also.. Note that bforget() doesn't lock the buffer. So there can
1571 * be writeout I/O going on against recently-freed buffers. We don't
1572 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1573 * only if we really need to. That happens here.
1575 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1577 struct buffer_head *old_bh;
1581 old_bh = __find_get_block_slow(bdev, block);
1583 clear_buffer_dirty(old_bh);
1584 wait_on_buffer(old_bh);
1585 clear_buffer_req(old_bh);
1589 EXPORT_SYMBOL(unmap_underlying_metadata);
1592 * NOTE! All mapped/uptodate combinations are valid:
1594 * Mapped Uptodate Meaning
1596 * No No "unknown" - must do get_block()
1597 * No Yes "hole" - zero-filled
1598 * Yes No "allocated" - allocated on disk, not read in
1599 * Yes Yes "valid" - allocated and up-to-date in memory.
1601 * "Dirty" is valid only with the last case (mapped+uptodate).
1605 * While block_write_full_page is writing back the dirty buffers under
1606 * the page lock, whoever dirtied the buffers may decide to clean them
1607 * again at any time. We handle that by only looking at the buffer
1608 * state inside lock_buffer().
1610 * If block_write_full_page() is called for regular writeback
1611 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1612 * locked buffer. This only can happen if someone has written the buffer
1613 * directly, with submit_bh(). At the address_space level PageWriteback
1614 * prevents this contention from occurring.
1616 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes.
1620 static int __block_write_full_page(struct inode *inode, struct page *page,
1621 get_block_t *get_block, struct writeback_control *wbc,
1622 bh_end_io_t *handler)
1626 sector_t last_block;
1627 struct buffer_head *bh, *head;
1628 const unsigned blocksize = 1 << inode->i_blkbits;
1629 int nr_underway = 0;
1630 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1631 WRITE_SYNC : WRITE);
1633 BUG_ON(!PageLocked(page));
1635 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1637 if (!page_has_buffers(page)) {
1638 create_empty_buffers(page, blocksize,
1639 (1 << BH_Dirty)|(1 << BH_Uptodate));
1643 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1644 * here, and the (potentially unmapped) buffers may become dirty at
1645 * any time. If a buffer becomes dirty here after we've inspected it
1646 * then we just miss that fact, and the page stays dirty.
1648 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1649 * handle that here by just cleaning them.
1652 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1653 head = page_buffers(page);
1657 * Get all the dirty buffers mapped to disk addresses and
1658 * handle any aliases from the underlying blockdev's mapping.
1661 if (block > last_block) {
1663 * mapped buffers outside i_size will occur, because
1664 * this page can be outside i_size when there is a
1665 * truncate in progress.
1668 * The buffer was zeroed by block_write_full_page()
1670 clear_buffer_dirty(bh);
1671 set_buffer_uptodate(bh);
1672 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1674 WARN_ON(bh->b_size != blocksize);
1675 err = get_block(inode, block, bh, 1);
1678 clear_buffer_delay(bh);
1679 if (buffer_new(bh)) {
1680 /* blockdev mappings never come here */
1681 clear_buffer_new(bh);
1682 unmap_underlying_metadata(bh->b_bdev,
1686 bh = bh->b_this_page;
1688 } while (bh != head);
1691 if (!buffer_mapped(bh))
1694 * If it's a fully non-blocking write attempt and we cannot
1695 * lock the buffer then redirty the page. Note that this can
1696 * potentially cause a busy-wait loop from writeback threads
1697 * and kswapd activity, but those code paths have their own
1698 * higher-level throttling.
1700 if (wbc->sync_mode != WB_SYNC_NONE) {
1702 } else if (!trylock_buffer(bh)) {
1703 redirty_page_for_writepage(wbc, page);
1706 if (test_clear_buffer_dirty(bh)) {
1707 mark_buffer_async_write_endio(bh, handler);
1711 } while ((bh = bh->b_this_page) != head);
1714 * The page and its buffers are protected by PageWriteback(), so we can
1715 * drop the bh refcounts early.
1717 BUG_ON(PageWriteback(page));
1718 set_page_writeback(page);
1721 struct buffer_head *next = bh->b_this_page;
1722 if (buffer_async_write(bh)) {
1723 submit_bh(write_op, bh);
1727 } while (bh != head);
1732 if (nr_underway == 0) {
1734 * The page was marked dirty, but the buffers were
1735 * clean. Someone wrote them back by hand with
1736 * ll_rw_block/submit_bh. A rare case.
1738 end_page_writeback(page);
1741 * The page and buffer_heads can be released at any time from
1749 * ENOSPC, or some other error. We may already have added some
1750 * blocks to the file, so we need to write these out to avoid
1751 * exposing stale data.
1752 * The page is currently locked and not marked for writeback
1755 /* Recovery: lock and submit the mapped buffers */
1757 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1758 !buffer_delay(bh)) {
1760 mark_buffer_async_write_endio(bh, handler);
1763 * The buffer may have been set dirty during
1764 * attachment to a dirty page.
1766 clear_buffer_dirty(bh);
1768 } while ((bh = bh->b_this_page) != head);
1770 BUG_ON(PageWriteback(page));
1771 mapping_set_error(page->mapping, err);
1772 set_page_writeback(page);
1774 struct buffer_head *next = bh->b_this_page;
1775 if (buffer_async_write(bh)) {
1776 clear_buffer_dirty(bh);
1777 submit_bh(write_op, bh);
1781 } while (bh != head);
1787 * If a page has any new buffers, zero them out here, and mark them uptodate
1788 * and dirty so they'll be written out (in order to prevent uninitialised
1789 * block data from leaking). And clear the new bit.
1791 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1793 unsigned int block_start, block_end;
1794 struct buffer_head *head, *bh;
1796 BUG_ON(!PageLocked(page));
1797 if (!page_has_buffers(page))
1800 bh = head = page_buffers(page);
1803 block_end = block_start + bh->b_size;
1805 if (buffer_new(bh)) {
1806 if (block_end > from && block_start < to) {
1807 if (!PageUptodate(page)) {
1808 unsigned start, size;
1810 start = max(from, block_start);
1811 size = min(to, block_end) - start;
1813 zero_user(page, start, size);
1814 set_buffer_uptodate(bh);
1817 clear_buffer_new(bh);
1818 mark_buffer_dirty(bh);
1822 block_start = block_end;
1823 bh = bh->b_this_page;
1824 } while (bh != head);
1826 EXPORT_SYMBOL(page_zero_new_buffers);
1828 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1829 get_block_t *get_block)
1831 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1832 unsigned to = from + len;
1833 struct inode *inode = page->mapping->host;
1834 unsigned block_start, block_end;
1837 unsigned blocksize, bbits;
1838 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1840 BUG_ON(!PageLocked(page));
1841 BUG_ON(from > PAGE_CACHE_SIZE);
1842 BUG_ON(to > PAGE_CACHE_SIZE);
1845 blocksize = 1 << inode->i_blkbits;
1846 if (!page_has_buffers(page))
1847 create_empty_buffers(page, blocksize, 0);
1848 head = page_buffers(page);
1850 bbits = inode->i_blkbits;
1851 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1853 for(bh = head, block_start = 0; bh != head || !block_start;
1854 block++, block_start=block_end, bh = bh->b_this_page) {
1855 block_end = block_start + blocksize;
1856 if (block_end <= from || block_start >= to) {
1857 if (PageUptodate(page)) {
1858 if (!buffer_uptodate(bh))
1859 set_buffer_uptodate(bh);
1864 clear_buffer_new(bh);
1865 if (!buffer_mapped(bh)) {
1866 WARN_ON(bh->b_size != blocksize);
1867 err = get_block(inode, block, bh, 1);
1870 if (buffer_new(bh)) {
1871 unmap_underlying_metadata(bh->b_bdev,
1873 if (PageUptodate(page)) {
1874 clear_buffer_new(bh);
1875 set_buffer_uptodate(bh);
1876 mark_buffer_dirty(bh);
1879 if (block_end > to || block_start < from)
1880 zero_user_segments(page,
1886 if (PageUptodate(page)) {
1887 if (!buffer_uptodate(bh))
1888 set_buffer_uptodate(bh);
1891 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1892 !buffer_unwritten(bh) &&
1893 (block_start < from || block_end > to)) {
1894 ll_rw_block(READ, 1, &bh);
1899 * If we issued read requests - let them complete.
1901 while(wait_bh > wait) {
1902 wait_on_buffer(*--wait_bh);
1903 if (!buffer_uptodate(*wait_bh))
1906 if (unlikely(err)) {
1907 page_zero_new_buffers(page, from, to);
1908 ClearPageUptodate(page);
1912 EXPORT_SYMBOL(__block_write_begin);
1914 static int __block_commit_write(struct inode *inode, struct page *page,
1915 unsigned from, unsigned to)
1917 unsigned block_start, block_end;
1920 struct buffer_head *bh, *head;
1922 blocksize = 1 << inode->i_blkbits;
1924 for(bh = head = page_buffers(page), block_start = 0;
1925 bh != head || !block_start;
1926 block_start=block_end, bh = bh->b_this_page) {
1927 block_end = block_start + blocksize;
1928 if (block_end <= from || block_start >= to) {
1929 if (!buffer_uptodate(bh))
1932 set_buffer_uptodate(bh);
1933 mark_buffer_dirty(bh);
1935 clear_buffer_new(bh);
1939 * If this is a partial write which happened to make all buffers
1940 * uptodate then we can optimize away a bogus readpage() for
1941 * the next read(). Here we 'discover' whether the page went
1942 * uptodate as a result of this (potentially partial) write.
1945 SetPageUptodate(page);
1950 * block_write_begin takes care of the basic task of block allocation and
1951 * bringing partial write blocks uptodate first.
1953 * The filesystem needs to handle block truncation upon failure.
1955 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1956 unsigned flags, struct page **pagep, get_block_t *get_block)
1958 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1962 page = grab_cache_page_write_begin(mapping, index, flags);
1966 status = __block_write_begin(page, pos, len, get_block);
1967 if (unlikely(status)) {
1969 page_cache_release(page);
1976 EXPORT_SYMBOL(block_write_begin);
1978 int block_write_end(struct file *file, struct address_space *mapping,
1979 loff_t pos, unsigned len, unsigned copied,
1980 struct page *page, void *fsdata)
1982 struct inode *inode = mapping->host;
1985 start = pos & (PAGE_CACHE_SIZE - 1);
1987 if (unlikely(copied < len)) {
1989 * The buffers that were written will now be uptodate, so we
1990 * don't have to worry about a readpage reading them and
1991 * overwriting a partial write. However if we have encountered
1992 * a short write and only partially written into a buffer, it
1993 * will not be marked uptodate, so a readpage might come in and
1994 * destroy our partial write.
1996 * Do the simplest thing, and just treat any short write to a
1997 * non uptodate page as a zero-length write, and force the
1998 * caller to redo the whole thing.
2000 if (!PageUptodate(page))
2003 page_zero_new_buffers(page, start+copied, start+len);
2005 flush_dcache_page(page);
2007 /* This could be a short (even 0-length) commit */
2008 __block_commit_write(inode, page, start, start+copied);
2012 EXPORT_SYMBOL(block_write_end);
2014 int generic_write_end(struct file *file, struct address_space *mapping,
2015 loff_t pos, unsigned len, unsigned copied,
2016 struct page *page, void *fsdata)
2018 struct inode *inode = mapping->host;
2019 int i_size_changed = 0;
2021 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2024 * No need to use i_size_read() here, the i_size
2025 * cannot change under us because we hold i_mutex.
2027 * But it's important to update i_size while still holding page lock:
2028 * page writeout could otherwise come in and zero beyond i_size.
2030 if (pos+copied > inode->i_size) {
2031 i_size_write(inode, pos+copied);
2036 page_cache_release(page);
2039 * Don't mark the inode dirty under page lock. First, it unnecessarily
2040 * makes the holding time of page lock longer. Second, it forces lock
2041 * ordering of page lock and transaction start for journaling
2045 mark_inode_dirty(inode);
2049 EXPORT_SYMBOL(generic_write_end);
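/*
 * Illustrative sketch, not part of the original file: a minimal ->write_begin
 * for a buffer-backed filesystem forwards to block_write_begin() with the
 * filesystem's block-mapping callback and pairs it with generic_write_end as
 * ->write_end.  "example_get_block" is a hypothetical stand-in that simply
 * maps file blocks 1:1 onto disk blocks; a real filesystem would do its
 * block lookup/allocation here.
 */
static int example_get_block(struct inode *inode, sector_t iblock,
			     struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, iblock);	/* pretend-contiguous mapping */
	return 0;
}

static inline int example_write_begin(struct file *file,
				      struct address_space *mapping,
				      loff_t pos, unsigned len, unsigned flags,
				      struct page **pagep, void **fsdata)
{
	return block_write_begin(mapping, pos, len, flags, pagep,
				 example_get_block);
}
/*
 * Wired together in the filesystem's address_space_operations this would
 * typically read:
 *
 *	.write_begin	= example_write_begin,
 *	.write_end	= generic_write_end,
 */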
2052 * block_is_partially_uptodate checks whether buffers within a page are
2055 * Returns true if all buffers which correspond to a file portion
2056 * we want to read are uptodate.
2058 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2061 struct inode *inode = page->mapping->host;
2062 unsigned block_start, block_end, blocksize;
2064 struct buffer_head *bh, *head;
2067 if (!page_has_buffers(page))
2070 blocksize = 1 << inode->i_blkbits;
2071 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2073 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2076 head = page_buffers(page);
2080 block_end = block_start + blocksize;
2081 if (block_end > from && block_start < to) {
2082 if (!buffer_uptodate(bh)) {
2086 if (block_end >= to)
2089 block_start = block_end;
2090 bh = bh->b_this_page;
2091 } while (bh != head);
2095 EXPORT_SYMBOL(block_is_partially_uptodate);
2098 * Generic "read page" function for block devices that have the normal
2099 * get_block functionality. This is most of the block device filesystems.
2100 * Reads the page asynchronously --- the unlock_buffer() and
2101 * set/clear_buffer_uptodate() functions propagate buffer state into the
2102 * page struct once IO has completed.
2104 int block_read_full_page(struct page *page, get_block_t *get_block)
2106 struct inode *inode = page->mapping->host;
2107 sector_t iblock, lblock;
2108 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2109 unsigned int blocksize;
2111 int fully_mapped = 1;
2113 BUG_ON(!PageLocked(page));
2114 blocksize = 1 << inode->i_blkbits;
2115 if (!page_has_buffers(page))
2116 create_empty_buffers(page, blocksize, 0);
2117 head = page_buffers(page);
2119 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2120 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2126 if (buffer_uptodate(bh))
2129 if (!buffer_mapped(bh)) {
2133 if (iblock < lblock) {
2134 WARN_ON(bh->b_size != blocksize);
2135 err = get_block(inode, iblock, bh, 0);
2139 if (!buffer_mapped(bh)) {
2140 zero_user(page, i * blocksize, blocksize);
2142 set_buffer_uptodate(bh);
2146 * get_block() might have updated the buffer
2149 if (buffer_uptodate(bh))
2153 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2156 SetPageMappedToDisk(page);
2160 * All buffers are uptodate - we can set the page uptodate
2161 * as well. But not if get_block() returned an error.
2163 if (!PageError(page))
2164 SetPageUptodate(page);
2169 /* Stage two: lock the buffers */
2170 for (i = 0; i < nr; i++) {
2173 mark_buffer_async_read(bh);
2177 * Stage 3: start the IO. Check for uptodateness
2178 * inside the buffer lock in case another process reading
2179 * the underlying blockdev brought it uptodate (the sct fix).
2181 for (i = 0; i < nr; i++) {
2183 if (buffer_uptodate(bh))
2184 end_buffer_async_read(bh, 1);
2186 submit_bh(READ, bh);
2190 EXPORT_SYMBOL(block_read_full_page);
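/*
 * Illustrative sketch, not part of the original file: a buffer-backed
 * filesystem's ->readpage typically just forwards to block_read_full_page()
 * with its block-mapping callback; example_get_block is the hypothetical
 * mapping routine from the sketch following generic_write_end() above.
 */
static inline int example_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, example_get_block);
}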
2192 /* utility function for filesystems that need to do work on expanding
2193 * truncates. Uses filesystem pagecache writes to allow the filesystem to
2194 * deal with the hole.
2196 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2198 struct address_space *mapping = inode->i_mapping;
2203 err = inode_newsize_ok(inode, size);
2207 err = pagecache_write_begin(NULL, mapping, size, 0,
2208 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2213 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2219 EXPORT_SYMBOL(generic_cont_expand_simple);
2221 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2222 loff_t pos, loff_t *bytes)
2224 struct inode *inode = mapping->host;
2225 unsigned blocksize = 1 << inode->i_blkbits;
2228 pgoff_t index, curidx;
2230 unsigned zerofrom, offset, len;
2233 index = pos >> PAGE_CACHE_SHIFT;
2234 offset = pos & ~PAGE_CACHE_MASK;
2236 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2237 zerofrom = curpos & ~PAGE_CACHE_MASK;
2238 if (zerofrom & (blocksize-1)) {
2239 *bytes |= (blocksize-1);
2242 len = PAGE_CACHE_SIZE - zerofrom;
2244 err = pagecache_write_begin(file, mapping, curpos, len,
2245 AOP_FLAG_UNINTERRUPTIBLE,
2249 zero_user(page, zerofrom, len);
2250 err = pagecache_write_end(file, mapping, curpos, len, len,
2257 balance_dirty_pages_ratelimited(mapping);
2260 /* page covers the boundary, find the boundary offset */
2261 if (index == curidx) {
2262 zerofrom = curpos & ~PAGE_CACHE_MASK;
2263 /* if we are expanding the file, the last block will be filled */
2264 if (offset <= zerofrom) {
2267 if (zerofrom & (blocksize-1)) {
2268 *bytes |= (blocksize-1);
2271 len = offset - zerofrom;
2273 err = pagecache_write_begin(file, mapping, curpos, len,
2274 AOP_FLAG_UNINTERRUPTIBLE,
2278 zero_user(page, zerofrom, len);
2279 err = pagecache_write_end(file, mapping, curpos, len, len,
2291 * For moronic filesystems that do not allow holes in files.
2292 * We may have to extend the file.
2294 int cont_write_begin(struct file *file, struct address_space *mapping,
2295 loff_t pos, unsigned len, unsigned flags,
2296 struct page **pagep, void **fsdata,
2297 get_block_t *get_block, loff_t *bytes)
2299 struct inode *inode = mapping->host;
2300 unsigned blocksize = 1 << inode->i_blkbits;
2304 err = cont_expand_zero(file, mapping, pos, bytes);
2308 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2309 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2310 *bytes |= (blocksize-1);
2314 return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2316 EXPORT_SYMBOL(cont_write_begin);
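/*
 * Illustrative sketch (not part of the original file): cont_write_begin()
 * is meant to be called from a filesystem's ->write_begin, passing a
 * per-inode "bytes" cursor that records how far the file has been zeroed.
 * myfs_get_block, MYFS_I() and the myfs_zeroed_size field are hypothetical.
 */
#if 0
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(mapping->host)->myfs_zeroed_size);
}
#endif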
2318 int block_commit_write(struct page *page, unsigned from, unsigned to)
2320 struct inode *inode = page->mapping->host;
2321 __block_commit_write(inode,page,from,to);
2324 EXPORT_SYMBOL(block_commit_write);
2327 * block_page_mkwrite() is not allowed to change the file size as it gets
2328 * called from a page fault handler when a page is first dirtied. Hence we must
2329 * be careful to check for EOF conditions here. We set the page up correctly
2330 * for a written page which means we get ENOSPC checking when writing into
2331 * holes and correct delalloc and unwritten extent mapping on filesystems that
2332 * support these features.
2334 * We are not allowed to take the i_mutex here so we have to play games to
2335 * protect against truncate races as the page could now be beyond EOF. Because
2336 * truncate writes the inode size before removing pages, once we have the
2337 * page lock we can determine safely if the page is beyond EOF. If it is not
2338 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
2342 block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2343 get_block_t get_block)
2345 struct page *page = vmf->page;
2346 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2349 int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2352 size = i_size_read(inode);
2353 if ((page->mapping != inode->i_mapping) ||
2354 (page_offset(page) > size)) {
2355 /* page got truncated out from underneath us */
2360 /* page is wholly or partially inside EOF */
2361 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2362 end = size & ~PAGE_CACHE_MASK;
2364 end = PAGE_CACHE_SIZE;
2366 ret = __block_write_begin(page, 0, end, get_block);
2368 ret = block_commit_write(page, 0, end);
2370 if (unlikely(ret)) {
2374 else /* -ENOSPC, -EIO, etc */
2375 ret = VM_FAULT_SIGBUS;
2377 ret = VM_FAULT_LOCKED;
2382 EXPORT_SYMBOL(block_page_mkwrite);
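/*
 * Illustrative sketch (not part of the original file): block_page_mkwrite()
 * is typically wrapped by a filesystem's ->page_mkwrite handler in its
 * vm_operations_struct.  myfs_page_mkwrite() and myfs_get_block are
 * hypothetical names.
 */
#if 0
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= myfs_page_mkwrite,
};
#endif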
2385 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2386 * immediately, while under the page lock. So it needs a special end_io
2387 * handler which does not touch the bh after unlocking it.
2389 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2391 __end_buffer_read_notouch(bh, uptodate);
2395 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2396 * the page (converting it to a circular linked list and taking care of page
 * dirty races).
2399 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2401 struct buffer_head *bh;
2403 BUG_ON(!PageLocked(page));
2405 spin_lock(&page->mapping->private_lock);
2408 if (PageDirty(page))
2409 set_buffer_dirty(bh);
2410 if (!bh->b_this_page)
2411 bh->b_this_page = head;
2412 bh = bh->b_this_page;
2413 } while (bh != head);
2414 attach_page_buffers(page, head);
2415 spin_unlock(&page->mapping->private_lock);
2419 * On entry, the page is fully not uptodate.
2420 * On exit the page is fully uptodate in the areas outside (from,to).
2421 * The filesystem needs to handle block truncation upon failure.
2423 int nobh_write_begin(struct address_space *mapping,
2424 loff_t pos, unsigned len, unsigned flags,
2425 struct page **pagep, void **fsdata,
2426 get_block_t *get_block)
2428 struct inode *inode = mapping->host;
2429 const unsigned blkbits = inode->i_blkbits;
2430 const unsigned blocksize = 1 << blkbits;
2431 struct buffer_head *head, *bh;
2435 unsigned block_in_page;
2436 unsigned block_start, block_end;
2437 sector_t block_in_file;
2440 int is_mapped_to_disk = 1;
2442 index = pos >> PAGE_CACHE_SHIFT;
2443 from = pos & (PAGE_CACHE_SIZE - 1);
2446 page = grab_cache_page_write_begin(mapping, index, flags);
2452 if (page_has_buffers(page)) {
2453 ret = __block_write_begin(page, pos, len, get_block);
2459 if (PageMappedToDisk(page))
2463 * Allocate buffers so that we can keep track of state, and potentially
2464 * attach them to the page if an error occurs. In the common case of
2465 * no error, they will just be freed again without ever being attached
2466 * to the page (which is all OK, because we're under the page lock).
2468 * Be careful: the buffer linked list is a NULL terminated one, rather
2469 * than the circular one we're used to.
2471 head = alloc_page_buffers(page, blocksize, 0);
2477 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2480 * We loop across all blocks in the page, whether or not they are
2481 * part of the affected region. This is so we can discover if the
2482 * page is fully mapped-to-disk.
2484 for (block_start = 0, block_in_page = 0, bh = head;
2485 block_start < PAGE_CACHE_SIZE;
2486 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2489 block_end = block_start + blocksize;
2492 if (block_start >= to)
2494 ret = get_block(inode, block_in_file + block_in_page,
2498 if (!buffer_mapped(bh))
2499 is_mapped_to_disk = 0;
2501 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2502 if (PageUptodate(page)) {
2503 set_buffer_uptodate(bh);
2506 if (buffer_new(bh) || !buffer_mapped(bh)) {
2507 zero_user_segments(page, block_start, from,
2511 if (buffer_uptodate(bh))
2512 continue; /* reiserfs does this */
2513 if (block_start < from || block_end > to) {
2515 bh->b_end_io = end_buffer_read_nobh;
2516 submit_bh(READ, bh);
2523 * The page is locked, so these buffers are protected from
2524 * any VM or truncate activity. Hence we don't need to care
2525 * for the buffer_head refcounts.
2527 for (bh = head; bh; bh = bh->b_this_page) {
2529 if (!buffer_uptodate(bh))
2536 if (is_mapped_to_disk)
2537 SetPageMappedToDisk(page);
2539 *fsdata = head; /* to be released by nobh_write_end */
2546 * Error recovery is a bit difficult. We need to zero out blocks that
2547 * were newly allocated, and dirty them to ensure they get written out.
2548 * Buffers need to be attached to the page at this point, otherwise
2549 * the handling of potential IO errors during writeout would be hard
2550 * (could try doing synchronous writeout, but what if that fails too?)
2552 attach_nobh_buffers(page, head);
2553 page_zero_new_buffers(page, from, to);
2557 page_cache_release(page);
2562 EXPORT_SYMBOL(nobh_write_begin);
2564 int nobh_write_end(struct file *file, struct address_space *mapping,
2565 loff_t pos, unsigned len, unsigned copied,
2566 struct page *page, void *fsdata)
2568 struct inode *inode = page->mapping->host;
2569 struct buffer_head *head = fsdata;
2570 struct buffer_head *bh;
2571 BUG_ON(fsdata != NULL && page_has_buffers(page));
2573 if (unlikely(copied < len) && head)
2574 attach_nobh_buffers(page, head);
2575 if (page_has_buffers(page))
2576 return generic_write_end(file, mapping, pos, len,
2577 copied, page, fsdata);
2579 SetPageUptodate(page);
2580 set_page_dirty(page);
2581 if (pos+copied > inode->i_size) {
2582 i_size_write(inode, pos+copied);
2583 mark_inode_dirty(inode);
2587 page_cache_release(page);
2591 head = head->b_this_page;
2592 free_buffer_head(bh);
2597 EXPORT_SYMBOL(nobh_write_end);
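/*
 * Illustrative sketch (not part of the original file): a filesystem opting
 * into the "nobh" path typically wraps nobh_write_begin() like this and
 * wires nobh_write_end() straight into its address_space_operations (see
 * the sketch after nobh_writepage() below).  Hypothetical names.
 */
#if 0
static int myfs_nobh_write_begin(struct file *file,
			struct address_space *mapping, loff_t pos,
			unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block);
}
#endif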
2600 * nobh_writepage() - based on block_write_full_page() except
2601 * that it tries to operate without attaching bufferheads to the page.
2604 int nobh_writepage(struct page *page, get_block_t *get_block,
2605 struct writeback_control *wbc)
2607 struct inode * const inode = page->mapping->host;
2608 loff_t i_size = i_size_read(inode);
2609 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2613 /* Is the page fully inside i_size? */
2614 if (page->index < end_index)
2617 /* Is the page fully outside i_size? (truncate in progress) */
2618 offset = i_size & (PAGE_CACHE_SIZE-1);
2619 if (page->index >= end_index+1 || !offset) {
2621 * The page may have dirty, unmapped buffers. For example,
2622 * they may have been added in ext3_writepage(). Make them
2623 * freeable here, so the page does not leak.
2626 /* Not really sure about this - do we need this ? */
2627 if (page->mapping->a_ops->invalidatepage)
2628 page->mapping->a_ops->invalidatepage(page, offset);
2631 return 0; /* don't care */
2635 * The page straddles i_size. It must be zeroed out on each and every
2636 * writepage invocation because it may be mmapped. "A file is mapped
2637 * in multiples of the page size. For a file that is not a multiple of
2638 * the page size, the remaining memory is zeroed when mapped, and
2639 * writes to that region are not written out to the file."
2641 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2643 ret = mpage_writepage(page, get_block, wbc);
2645 ret = __block_write_full_page(inode, page, get_block, wbc,
2646 end_buffer_async_write);
2649 EXPORT_SYMBOL(nobh_writepage);
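/*
 * Illustrative sketch (not part of the original file): the matching
 * ->writepage wrapper for the nobh path forwards to nobh_writepage(), and
 * the address_space_operations then tie the nobh helpers together with the
 * write_begin wrapper sketched above.  All myfs_* names are hypothetical.
 */
#if 0
static int myfs_nobh_writepage(struct page *page,
			struct writeback_control *wbc)
{
	return nobh_writepage(page, myfs_get_block, wbc);
}

static const struct address_space_operations myfs_nobh_aops = {
	.readpage	= myfs_readpage,
	.writepage	= myfs_nobh_writepage,
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};
#endif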
2651 int nobh_truncate_page(struct address_space *mapping,
2652 loff_t from, get_block_t *get_block)
2654 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2655 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2658 unsigned length, pos;
2659 struct inode *inode = mapping->host;
2661 struct buffer_head map_bh;
2664 blocksize = 1 << inode->i_blkbits;
2665 length = offset & (blocksize - 1);
2667 /* Block boundary? Nothing to do */
2671 length = blocksize - length;
2672 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2674 page = grab_cache_page(mapping, index);
2679 if (page_has_buffers(page)) {
2682 page_cache_release(page);
2683 return block_truncate_page(mapping, from, get_block);
2686 /* Find the buffer that contains "offset" */
2688 while (offset >= pos) {
2693 map_bh.b_size = blocksize;
2695 err = get_block(inode, iblock, &map_bh, 0);
2698 /* unmapped? It's a hole - nothing to do */
2699 if (!buffer_mapped(&map_bh))
2702 /* Ok, it's mapped. Make sure it's up-to-date */
2703 if (!PageUptodate(page)) {
2704 err = mapping->a_ops->readpage(NULL, page);
2706 page_cache_release(page);
2710 if (!PageUptodate(page)) {
2714 if (page_has_buffers(page))
2717 zero_user(page, offset, length);
2718 set_page_dirty(page);
2723 page_cache_release(page);
2727 EXPORT_SYMBOL(nobh_truncate_page);
2729 int block_truncate_page(struct address_space *mapping,
2730 loff_t from, get_block_t *get_block)
2732 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2733 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2736 unsigned length, pos;
2737 struct inode *inode = mapping->host;
2739 struct buffer_head *bh;
2742 blocksize = 1 << inode->i_blkbits;
2743 length = offset & (blocksize - 1);
2745 /* Block boundary? Nothing to do */
2749 length = blocksize - length;
2750 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2752 page = grab_cache_page(mapping, index);
2757 if (!page_has_buffers(page))
2758 create_empty_buffers(page, blocksize, 0);
2760 /* Find the buffer that contains "offset" */
2761 bh = page_buffers(page);
2763 while (offset >= pos) {
2764 bh = bh->b_this_page;
2770 if (!buffer_mapped(bh)) {
2771 WARN_ON(bh->b_size != blocksize);
2772 err = get_block(inode, iblock, bh, 0);
2775 /* unmapped? It's a hole - nothing to do */
2776 if (!buffer_mapped(bh))
2780 /* Ok, it's mapped. Make sure it's up-to-date */
2781 if (PageUptodate(page))
2782 set_buffer_uptodate(bh);
2784 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2786 ll_rw_block(READ, 1, &bh);
2788 /* Uhhuh. Read error. Complain and punt. */
2789 if (!buffer_uptodate(bh))
2793 zero_user(page, offset, length);
2794 mark_buffer_dirty(bh);
2799 page_cache_release(page);
2803 EXPORT_SYMBOL(block_truncate_page);
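/*
 * Illustrative sketch (not part of the original file): block_truncate_page()
 * is usually called from a filesystem's truncate path to zero the tail of
 * the last partial block, so that stale data is not exposed if the file
 * later grows.  myfs_truncate() and myfs_get_block are hypothetical.
 */
#if 0
static void myfs_truncate(struct inode *inode)
{
	block_truncate_page(inode->i_mapping, inode->i_size, myfs_get_block);
	/* ...filesystem-specific freeing of truncated blocks would follow... */
}
#endif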
2806 * The generic ->writepage function for buffer-backed address_spaces;
2807 * this form passes in the end_io handler used to finish the IO.
2809 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2810 struct writeback_control *wbc, bh_end_io_t *handler)
2812 struct inode * const inode = page->mapping->host;
2813 loff_t i_size = i_size_read(inode);
2814 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2817 /* Is the page fully inside i_size? */
2818 if (page->index < end_index)
2819 return __block_write_full_page(inode, page, get_block, wbc,
2822 /* Is the page fully outside i_size? (truncate in progress) */
2823 offset = i_size & (PAGE_CACHE_SIZE-1);
2824 if (page->index >= end_index+1 || !offset) {
2826 * The page may have dirty, unmapped buffers. For example,
2827 * they may have been added in ext3_writepage(). Make them
2828 * freeable here, so the page does not leak.
2830 do_invalidatepage(page, 0);
2832 return 0; /* don't care */
2836 * The page straddles i_size. It must be zeroed out on each and every
2837 * writepage invocation because it may be mmapped. "A file is mapped
2838 * in multiples of the page size. For a file that is not a multiple of
2839 * the page size, the remaining memory is zeroed when mapped, and
2840 * writes to that region are not written out to the file."
2842 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2843 return __block_write_full_page(inode, page, get_block, wbc, handler);
2845 EXPORT_SYMBOL(block_write_full_page_endio);
2848 * The generic ->writepage function for buffer-backed address_spaces
2850 int block_write_full_page(struct page *page, get_block_t *get_block,
2851 struct writeback_control *wbc)
2853 return block_write_full_page_endio(page, get_block, wbc,
2854 end_buffer_async_write);
2856 EXPORT_SYMBOL(block_write_full_page);
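/*
 * Illustrative sketch (not part of the original file): most buffer-head
 * based filesystems implement ->writepage by forwarding to
 * block_write_full_page() with their own get_block callback.
 * Hypothetical names again.
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
#endif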
2858 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2859 get_block_t *get_block)
2861 struct buffer_head tmp;
2862 struct inode *inode = mapping->host;
2865 tmp.b_size = 1 << inode->i_blkbits;
2866 get_block(inode, block, &tmp, 0);
2867 return tmp.b_blocknr;
2869 EXPORT_SYMBOL(generic_block_bmap);
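/*
 * Illustrative sketch (not part of the original file): ->bmap, used by the
 * FIBMAP ioctl and legacy boot-loader/swap users, can usually be implemented
 * directly on top of generic_block_bmap().  Hypothetical names.
 */
#if 0
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif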
2871 static void end_bio_bh_io_sync(struct bio *bio, int err)
2873 struct buffer_head *bh = bio->bi_private;
2875 if (err == -EOPNOTSUPP) {
2876 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2879 if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2880 set_bit(BH_Quiet, &bh->b_state);
2882 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2886 int submit_bh(int rw, struct buffer_head * bh)
2891 BUG_ON(!buffer_locked(bh));
2892 BUG_ON(!buffer_mapped(bh));
2893 BUG_ON(!bh->b_end_io);
2894 BUG_ON(buffer_delay(bh));
2895 BUG_ON(buffer_unwritten(bh));
2898 * Only clear out a write error when rewriting
2900 if (test_set_buffer_req(bh) && (rw & WRITE))
2901 clear_buffer_write_io_error(bh);
2904 * from here on down, it's all bio -- do the initial mapping,
2905 * submit_bio -> generic_make_request may further map this bio around
2907 bio = bio_alloc(GFP_NOIO, 1);
2909 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2910 bio->bi_bdev = bh->b_bdev;
2911 bio->bi_io_vec[0].bv_page = bh->b_page;
2912 bio->bi_io_vec[0].bv_len = bh->b_size;
2913 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2917 bio->bi_size = bh->b_size;
2919 bio->bi_end_io = end_bio_bh_io_sync;
2920 bio->bi_private = bh;
2923 submit_bio(rw, bio);
2925 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2931 EXPORT_SYMBOL(submit_bh);
2934 * ll_rw_block: low-level access to block devices (DEPRECATED)
2935 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2936 * @nr: number of &struct buffer_heads in the array
2937 * @bhs: array of pointers to &struct buffer_head
2939 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2940 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2941 * %READA option is described in the documentation for generic_make_request()
2942 * which ll_rw_block() calls.
2944 * This function drops any buffer that it cannot get a lock on (with the
2945 * BH_Lock state bit), any buffer that appears to be clean when doing a write
2946 request, and any buffer that appears to be up-to-date when doing a read
2947 * request. Further it marks as clean buffers that are processed for
2948 * writing (the buffer cache won't assume that they are actually clean
2949 * until the buffer gets unlocked).
2951 * ll_rw_block sets b_end_io to a simple completion handler that marks
2952 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
2955 * All of the buffers must be for the same device, and must also be a
2956 * multiple of the current approved size for the device.
2958 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2962 for (i = 0; i < nr; i++) {
2963 struct buffer_head *bh = bhs[i];
2965 if (!trylock_buffer(bh))
2968 if (test_clear_buffer_dirty(bh)) {
2969 bh->b_end_io = end_buffer_write_sync;
2971 submit_bh(WRITE, bh);
2975 if (!buffer_uptodate(bh)) {
2976 bh->b_end_io = end_buffer_read_sync;
2985 EXPORT_SYMBOL(ll_rw_block);
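/*
 * Illustrative sketch (not part of the original file): the classic
 * ll_rw_block() pattern is to kick off reads for a batch of buffers and
 * then wait on each one, re-checking the uptodate state afterwards because
 * ll_rw_block() silently skips buffers it could not lock.  Hypothetical
 * helper name.
 */
#if 0
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(READ, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}
#endif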
2987 void write_dirty_buffer(struct buffer_head *bh, int rw)
2990 if (!test_clear_buffer_dirty(bh)) {
2994 bh->b_end_io = end_buffer_write_sync;
2998 EXPORT_SYMBOL(write_dirty_buffer);
3001 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3002 * and then start new I/O and then wait upon it. The caller must have a ref on
 * the buffer.
3005 int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3009 WARN_ON(atomic_read(&bh->b_count) < 1);
3011 if (test_clear_buffer_dirty(bh)) {
3013 bh->b_end_io = end_buffer_write_sync;
3014 ret = submit_bh(rw, bh);
3016 if (!ret && !buffer_uptodate(bh))
3023 EXPORT_SYMBOL(__sync_dirty_buffer);
3025 int sync_dirty_buffer(struct buffer_head *bh)
3027 return __sync_dirty_buffer(bh, WRITE_SYNC);
3029 EXPORT_SYMBOL(sync_dirty_buffer);
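/*
 * Illustrative sketch (not part of the original file): a typical
 * metadata-update sequence reads a block with sb_bread(), modifies it,
 * marks it dirty and, when integrity matters, synchronously writes it back
 * with sync_dirty_buffer().  The function name and block number are
 * placeholders.
 */
#if 0
static int myfs_update_metadata_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);
	int err;

	if (!bh)
		return -EIO;
	/* ...modify bh->b_data here... */
	mark_buffer_dirty(bh);
	err = sync_dirty_buffer(bh);
	brelse(bh);
	return err;
}
#endif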
3032 * try_to_free_buffers() checks if all the buffers on this particular page
3033 * are unused, and releases them if so.
3035 * Exclusion against try_to_free_buffers may be obtained by either
3036 * locking the page or by holding its mapping's private_lock.
3038 * If the page is dirty but all the buffers are clean then we need to
3039 * be sure to mark the page clean as well. This is because the page
3040 * may be against a block device, and a later reattachment of buffers
3041 * to a dirty page will set *all* buffers dirty, which would corrupt
3042 * filesystem data on the same device.
3044 * The same applies to regular filesystem pages: if all the buffers are
3045 * clean then we set the page clean and proceed. To do that, we require
3046 * total exclusion from __set_page_dirty_buffers(). That is obtained with
 * the mapping's private_lock.
3049 * try_to_free_buffers() is non-blocking.
3051 static inline int buffer_busy(struct buffer_head *bh)
3053 return atomic_read(&bh->b_count) |
3054 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3058 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3060 struct buffer_head *head = page_buffers(page);
3061 struct buffer_head *bh;
3065 if (buffer_write_io_error(bh) && page->mapping)
3066 set_bit(AS_EIO, &page->mapping->flags);
3067 if (buffer_busy(bh))
3069 bh = bh->b_this_page;
3070 } while (bh != head);
3073 struct buffer_head *next = bh->b_this_page;
3075 if (bh->b_assoc_map)
3076 __remove_assoc_queue(bh);
3078 } while (bh != head);
3079 *buffers_to_free = head;
3080 __clear_page_buffers(page);
3086 int try_to_free_buffers(struct page *page)
3088 struct address_space * const mapping = page->mapping;
3089 struct buffer_head *buffers_to_free = NULL;
3092 BUG_ON(!PageLocked(page));
3093 if (PageWriteback(page))
3096 if (mapping == NULL) { /* can this still happen? */
3097 ret = drop_buffers(page, &buffers_to_free);
3101 spin_lock(&mapping->private_lock);
3102 ret = drop_buffers(page, &buffers_to_free);
3105 * If the filesystem writes its buffers by hand (eg ext3)
3106 * then we can have clean buffers against a dirty page. We
3107 * clean the page here; otherwise the VM will never notice
3108 * that the filesystem did any IO at all.
3110 * Also, during truncate, discard_buffer will have marked all
3111 * the page's buffers clean. We discover that here and clean
3114 * private_lock must be held over this entire operation in order
3115 * to synchronise against __set_page_dirty_buffers and prevent the
3116 * dirty bit from being lost.
3119 cancel_dirty_page(page, PAGE_CACHE_SIZE);
3120 spin_unlock(&mapping->private_lock);
3122 if (buffers_to_free) {
3123 struct buffer_head *bh = buffers_to_free;
3126 struct buffer_head *next = bh->b_this_page;
3127 free_buffer_head(bh);
3129 } while (bh != buffers_to_free);
3133 EXPORT_SYMBOL(try_to_free_buffers);
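/*
 * Illustrative sketch (not part of the original file): filesystems with no
 * page-private state of their own commonly point ->releasepage at a thin
 * wrapper around try_to_free_buffers() (or rely on the VM's default path,
 * which calls it when no ->releasepage is set).  Hypothetical wrapper.
 */
#if 0
static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
{
	/* no filesystem-private page state to check; just drop the buffers */
	return try_to_free_buffers(page);
}
#endif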
3136 * There are no bdflush tunables left. But distributions are
3137 * still running obsolete flush daemons, so we terminate them here.
3139 * Use of bdflush() is deprecated and will be removed in a future kernel.
3140 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3142 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3144 static int msg_count;
3146 if (!capable(CAP_SYS_ADMIN))
3149 if (msg_count < 5) {
3152 "warning: process `%s' used the obsolete bdflush"
3153 " system call\n", current->comm);
3154 printk(KERN_INFO "Fix your initscripts?\n");
3163 * Buffer-head allocation
3165 static struct kmem_cache *bh_cachep;
3168 * Once the number of bh's in the machine exceeds this level, we start
3169 * stripping them in writeback.
3171 static int max_buffer_heads;
3173 int buffer_heads_over_limit;
3175 struct bh_accounting {
3176 int nr; /* Number of live bh's */
3177 int ratelimit; /* Limit cacheline bouncing */
3180 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3182 static void recalc_bh_state(void)
3187 if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3189 __this_cpu_write(bh_accounting.ratelimit, 0);
3190 for_each_online_cpu(i)
3191 tot += per_cpu(bh_accounting, i).nr;
3192 buffer_heads_over_limit = (tot > max_buffer_heads);
3195 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3197 struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3199 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3201 __this_cpu_inc(bh_accounting.nr);
3207 EXPORT_SYMBOL(alloc_buffer_head);
3209 void free_buffer_head(struct buffer_head *bh)
3211 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3212 kmem_cache_free(bh_cachep, bh);
3214 __this_cpu_dec(bh_accounting.nr);
3218 EXPORT_SYMBOL(free_buffer_head);
3220 static void buffer_exit_cpu(int cpu)
3223 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3225 for (i = 0; i < BH_LRU_SIZE; i++) {
3229 this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3230 per_cpu(bh_accounting, cpu).nr = 0;
3233 static int buffer_cpu_notify(struct notifier_block *self,
3234 unsigned long action, void *hcpu)
3236 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3237 buffer_exit_cpu((unsigned long)hcpu);
3242 * bh_uptodate_or_lock - Test whether the buffer is uptodate
3243 * @bh: struct buffer_head
3245 * Return true if the buffer is up-to-date and false,
3246 * with the buffer locked, if not.
3248 int bh_uptodate_or_lock(struct buffer_head *bh)
3250 if (!buffer_uptodate(bh)) {
3252 if (!buffer_uptodate(bh))
3258 EXPORT_SYMBOL(bh_uptodate_or_lock);
3261 * bh_submit_read - Submit a locked buffer for reading
3262 * @bh: struct buffer_head
3264 * Returns zero on success and -EIO on error.
3266 int bh_submit_read(struct buffer_head *bh)
3268 BUG_ON(!buffer_locked(bh));
3270 if (buffer_uptodate(bh)) {
3276 bh->b_end_io = end_buffer_read_sync;
3277 submit_bh(READ, bh);
3279 if (buffer_uptodate(bh))
3283 EXPORT_SYMBOL(bh_submit_read);
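/*
 * Illustrative sketch (not part of the original file): bh_uptodate_or_lock()
 * and bh_submit_read() pair up to read a buffer only when needed, similar to
 * how ext4's bitmap-reading code uses them.  Hypothetical wrapper name.
 */
#if 0
static int myfs_read_buffer_if_needed(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;		/* already uptodate, nothing to do */
	return bh_submit_read(bh);	/* bh came back locked; read it now */
}
#endif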
3285 void __init buffer_init(void)
3289 bh_cachep = kmem_cache_create("buffer_head",
3290 sizeof(struct buffer_head), 0,
3291 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3296 * Limit the bh occupancy to 10% of ZONE_NORMAL
3298 nrpages = (nr_free_buffer_pages() * 10) / 100;
3299 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3300 hotcpu_notifier(buffer_cpu_notify, 0);