4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
16 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
21 #include <linux/config.h>
22 #include <linux/kernel.h>
23 #include <linux/syscalls.h>
26 #include <linux/percpu.h>
27 #include <linux/slab.h>
28 #include <linux/smp_lock.h>
29 #include <linux/capability.h>
30 #include <linux/blkdev.h>
31 #include <linux/file.h>
32 #include <linux/quotaops.h>
33 #include <linux/highmem.h>
34 #include <linux/module.h>
35 #include <linux/writeback.h>
36 #include <linux/hash.h>
37 #include <linux/suspend.h>
38 #include <linux/buffer_head.h>
39 #include <linux/bio.h>
40 #include <linux/notifier.h>
41 #include <linux/cpu.h>
42 #include <linux/bitops.h>
43 #include <linux/mpage.h>
44 #include <linux/bit_spinlock.h>
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47 static void invalidate_bh_lrus(void);
49 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
52 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54 bh->b_end_io = handler;
55 bh->b_private = private;
58 static int sync_buffer(void *word)
60 struct block_device *bd;
61 struct buffer_head *bh
62 = container_of(word, struct buffer_head, b_state);
67 blk_run_address_space(bd->bd_inode->i_mapping);
72 void fastcall __lock_buffer(struct buffer_head *bh)
74 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
75 TASK_UNINTERRUPTIBLE);
77 EXPORT_SYMBOL(__lock_buffer);
79 void fastcall unlock_buffer(struct buffer_head *bh)
81 clear_buffer_locked(bh);
82 smp_mb__after_clear_bit();
83 wake_up_bit(&bh->b_state, BH_Lock);
87 * Block until a buffer comes unlocked. This doesn't stop it
88 * from becoming locked again - you have to lock it yourself
89 * if you want to preserve its state.
91 void __wait_on_buffer(struct buffer_head * bh)
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
97 __clear_page_buffers(struct page *page)
99 ClearPagePrivate(page);
100 set_page_private(page, 0);
101 page_cache_release(page);
104 static void buffer_io_error(struct buffer_head *bh)
106 char b[BDEVNAME_SIZE];
108 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
109 bdevname(bh->b_bdev, b),
110 (unsigned long long)bh->b_blocknr);
114 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
115 * unlock the buffer. This is what ll_rw_block uses too.
117 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
120 set_buffer_uptodate(bh);
122 /* This happens, due to failed READA attempts. */
123 clear_buffer_uptodate(bh);
129 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
131 char b[BDEVNAME_SIZE];
134 set_buffer_uptodate(bh);
136 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
138 printk(KERN_WARNING "lost page write due to "
140 bdevname(bh->b_bdev, b));
142 set_buffer_write_io_error(bh);
143 clear_buffer_uptodate(bh);
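/*
 * A minimal illustrative sketch (not part of this file) of the synchronous
 * read pattern that end_buffer_read_sync() supports; __bread_slow() below
 * does essentially the same thing.
 */
static int example_read_bh_sync(struct buffer_head *bh)
{
	lock_buffer(bh);
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}
	get_bh(bh);			/* end_buffer_read_sync() drops this ref */
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);		/* sleep until the end_io handler runs */
	return buffer_uptodate(bh) ? 0 : -EIO;
}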
150 * Write out and wait upon all the dirty data associated with a block
151 * device via its mapping. Does not take the superblock lock.
153 int sync_blockdev(struct block_device *bdev)
158 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
161 EXPORT_SYMBOL(sync_blockdev);
163 static void __fsync_super(struct super_block *sb)
165 sync_inodes_sb(sb, 0);
168 if (sb->s_dirt && sb->s_op->write_super)
169 sb->s_op->write_super(sb);
171 if (sb->s_op->sync_fs)
172 sb->s_op->sync_fs(sb, 1);
173 sync_blockdev(sb->s_bdev);
174 sync_inodes_sb(sb, 1);
178 * Write out and wait upon all dirty data associated with this
179 * superblock. Filesystem data as well as the underlying block
180 * device. Takes the superblock lock.
182 int fsync_super(struct super_block *sb)
185 return sync_blockdev(sb->s_bdev);
189 * Write out and wait upon all dirty data associated with this
190 * device. Filesystem data as well as the underlying block
191 * device. Takes the superblock lock.
193 int fsync_bdev(struct block_device *bdev)
195 struct super_block *sb = get_super(bdev);
197 int res = fsync_super(sb);
201 return sync_blockdev(bdev);
205 * freeze_bdev -- lock a filesystem and force it into a consistent state
206 * @bdev: blockdevice to lock
208 * This takes the block device bd_mount_mutex to make sure no new mounts
209 * happen on bdev until thaw_bdev() is called.
210 * If a superblock is found on this device, we take the s_umount semaphore
211 * on it to make sure nobody unmounts until the snapshot creation is done.
213 struct super_block *freeze_bdev(struct block_device *bdev)
215 struct super_block *sb;
217 mutex_lock(&bdev->bd_mount_mutex);
218 sb = get_super(bdev);
219 if (sb && !(sb->s_flags & MS_RDONLY)) {
220 sb->s_frozen = SB_FREEZE_WRITE;
225 sb->s_frozen = SB_FREEZE_TRANS;
228 sync_blockdev(sb->s_bdev);
230 if (sb->s_op->write_super_lockfs)
231 sb->s_op->write_super_lockfs(sb);
235 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
237 EXPORT_SYMBOL(freeze_bdev);
240 * thaw_bdev -- unlock filesystem
241 * @bdev: blockdevice to unlock
242 * @sb: associated superblock
244 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
246 void thaw_bdev(struct block_device *bdev, struct super_block *sb)
249 BUG_ON(sb->s_bdev != bdev);
251 if (sb->s_op->unlockfs)
252 sb->s_op->unlockfs(sb);
253 sb->s_frozen = SB_UNFROZEN;
255 wake_up(&sb->s_wait_unfrozen);
259 mutex_unlock(&bdev->bd_mount_mutex);
261 EXPORT_SYMBOL(thaw_bdev);
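/*
 * Hedged sketch of how a snapshot/backup driver might use the pair above;
 * example_snapshot_bdev() is hypothetical, not an in-tree user.
 */
static void example_snapshot_bdev(struct block_device *bdev)
{
	/* NULL is returned when nothing writable is mounted on bdev */
	struct super_block *sb = freeze_bdev(bdev);

	/* ... copy the now-consistent device while new writes are blocked ... */

	thaw_bdev(bdev, sb);		/* also correct when sb is NULL */
}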
264 * sync everything. Start out by waking pdflush, because that writes back
265 * all queues in parallel.
267 static void do_sync(unsigned long wait)
270 sync_inodes(0); /* All mappings, inodes and their blockdevs */
272 sync_supers(); /* Write the superblocks */
273 sync_filesystems(0); /* Start syncing the filesystems */
274 sync_filesystems(wait); /* Waitingly sync the filesystems */
275 sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */
277 printk("Emergency Sync complete\n");
278 if (unlikely(laptop_mode))
279 laptop_sync_completion();
282 asmlinkage long sys_sync(void)
288 void emergency_sync(void)
290 pdflush_operation(do_sync, 0);
294 * Generic function to fsync a file.
296 * filp may be NULL if called via the msync of a vma.
299 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
301 struct inode * inode = dentry->d_inode;
302 struct super_block * sb;
305 /* sync the inode to buffers */
306 ret = write_inode_now(inode, 0);
308 /* sync the superblock to buffers */
311 if (sb->s_op->write_super)
312 sb->s_op->write_super(sb);
315 /* .. finally sync the buffers to disk */
316 err = sync_blockdev(sb->s_bdev);
322 long do_fsync(struct file *file, int datasync)
326 struct address_space *mapping = file->f_mapping;
328 if (!file->f_op || !file->f_op->fsync) {
329 /* Why? We can still call filemap_fdatawrite */
334 current->flags |= PF_SYNCWRITE;
335 ret = filemap_fdatawrite(mapping);
338 * We need to protect against concurrent writers, which could cause
339 * livelocks in fsync_buffers_list().
341 mutex_lock(&mapping->host->i_mutex);
342 err = file->f_op->fsync(file, file->f_dentry, datasync);
345 mutex_unlock(&mapping->host->i_mutex);
346 err = filemap_fdatawait(mapping);
349 current->flags &= ~PF_SYNCWRITE;
354 static long __do_fsync(unsigned int fd, int datasync)
361 ret = do_fsync(file, datasync);
367 asmlinkage long sys_fsync(unsigned int fd)
369 return __do_fsync(fd, 0);
372 asmlinkage long sys_fdatasync(unsigned int fd)
374 return __do_fsync(fd, 1);
378 * Various filesystems appear to want __find_get_block to be non-blocking.
379 * But it's the page lock which protects the buffers. To get around this,
380 * we get exclusion from try_to_free_buffers with the blockdev mapping's
383 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
384 * may be quite high. This code could TryLock the page, and if that
385 * succeeds, there is no need to take private_lock. (But if
386 * private_lock is contended then so is mapping->tree_lock).
388 static struct buffer_head *
389 __find_get_block_slow(struct block_device *bdev, sector_t block)
391 struct inode *bd_inode = bdev->bd_inode;
392 struct address_space *bd_mapping = bd_inode->i_mapping;
393 struct buffer_head *ret = NULL;
395 struct buffer_head *bh;
396 struct buffer_head *head;
400 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
401 page = find_get_page(bd_mapping, index);
405 spin_lock(&bd_mapping->private_lock);
406 if (!page_has_buffers(page))
408 head = page_buffers(page);
411 if (bh->b_blocknr == block) {
416 if (!buffer_mapped(bh))
418 bh = bh->b_this_page;
419 } while (bh != head);
421 /* we might be here because some of the buffers on this page are
422 * not mapped. This is due to various races between
423 * file io on the block device and getblk. It gets dealt with
424 * elsewhere, don't buffer_error if we had some unmapped buffers
427 printk("__find_get_block_slow() failed. "
428 "block=%llu, b_blocknr=%llu\n",
429 (unsigned long long)block,
430 (unsigned long long)bh->b_blocknr);
431 printk("b_state=0x%08lx, b_size=%zu\n",
432 bh->b_state, bh->b_size);
433 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
436 spin_unlock(&bd_mapping->private_lock);
437 page_cache_release(page);
442 /* If invalidate_buffers() will trash dirty buffers, it means some kind
443 of fs corruption is going on. Trashing dirty data always imply losing
444 information that was supposed to be just stored on the physical layer
447 Thus invalidate_buffers in general usage is not allowed to trash
448 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
449 be preserved. These buffers are simply skipped.
451 We also skip buffers which are still in use. For example this can
452 happen if a userspace program is reading the block device.
454 NOTE: in the case where the user removed a removable-media disk even though
455 there's still dirty data not synced to disk (due to a bug in the device driver
456 or due to a user error), by not destroying the dirty buffers we could
457 also generate corruption on the next medium inserted, so a parameter is
458 necessary to handle this case in the safest way possible (trying
459 not to corrupt the newly inserted disk with the data belonging to
460 the old, now corrupted, disk). Also for the ramdisk the natural thing
461 to do in order to release the ramdisk memory is to destroy dirty buffers.
463 These are two special cases. Normal usage implies that the device driver
464 issues a sync on the device (without waiting for I/O completion) and then
465 makes an invalidate_buffers call that doesn't trash dirty buffers.
467 For handling cache coherency with the blkdev pagecache the 'update' case
468 has been introduced. It is needed to re-read from disk any pinned
469 buffer. NOTE: re-reading from disk is destructive so we can do it only
470 when we assume nobody is changing the buffercache under our I/O and when
471 we think the disk contains more recent information than the buffercache.
472 The update == 1 pass marks the buffers we need to update, the update == 2
473 pass does the actual I/O. */
474 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
476 invalidate_bh_lrus();
478 * FIXME: what about destroy_dirty_buffers?
479 * We really want to use invalidate_inode_pages2() for
480 * that, but not until that's cleaned up.
482 invalidate_inode_pages(bdev->bd_inode->i_mapping);
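/*
 * Illustrative sketch of the "normal usage" described above, as a driver
 * might do it on revalidation (hypothetical code, not from this file):
 * flush first, and only trash dirty buffers when the old medium is gone.
 */
static void example_revalidate_disk(struct block_device *bdev, int media_changed)
{
	sync_blockdev(bdev);			/* write out what we still can */
	invalidate_bdev(bdev, media_changed);	/* destroy_dirty_buffers only on
						 * a media change */
}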
486 * Kick pdflush then try to free up some ZONE_NORMAL memory.
488 static void free_more_memory(void)
493 wakeup_pdflush(1024);
496 for_each_online_pgdat(pgdat) {
497 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
499 try_to_free_pages(zones, GFP_NOFS);
504 * I/O completion handler for block_read_full_page() - pages
505 * which come unlocked at the end of I/O.
507 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
510 struct buffer_head *first;
511 struct buffer_head *tmp;
513 int page_uptodate = 1;
515 BUG_ON(!buffer_async_read(bh));
519 set_buffer_uptodate(bh);
521 clear_buffer_uptodate(bh);
522 if (printk_ratelimit())
528 * Be _very_ careful from here on. Bad things can happen if
529 * two buffer heads end IO at almost the same time and both
530 * decide that the page is now completely done.
532 first = page_buffers(page);
533 local_irq_save(flags);
534 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
535 clear_buffer_async_read(bh);
539 if (!buffer_uptodate(tmp))
541 if (buffer_async_read(tmp)) {
542 BUG_ON(!buffer_locked(tmp));
545 tmp = tmp->b_this_page;
547 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
548 local_irq_restore(flags);
551 * If none of the buffers had errors and they are all
552 * uptodate then we can set the page uptodate.
554 if (page_uptodate && !PageError(page))
555 SetPageUptodate(page);
560 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
561 local_irq_restore(flags);
566 * Completion handler for block_write_full_page() - pages which are unlocked
567 * during I/O, and which have PageWriteback cleared upon I/O completion.
569 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
571 char b[BDEVNAME_SIZE];
573 struct buffer_head *first;
574 struct buffer_head *tmp;
577 BUG_ON(!buffer_async_write(bh));
581 set_buffer_uptodate(bh);
583 if (printk_ratelimit()) {
585 printk(KERN_WARNING "lost page write due to "
587 bdevname(bh->b_bdev, b));
589 set_bit(AS_EIO, &page->mapping->flags);
590 clear_buffer_uptodate(bh);
594 first = page_buffers(page);
595 local_irq_save(flags);
596 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
598 clear_buffer_async_write(bh);
600 tmp = bh->b_this_page;
602 if (buffer_async_write(tmp)) {
603 BUG_ON(!buffer_locked(tmp));
606 tmp = tmp->b_this_page;
608 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
609 local_irq_restore(flags);
610 end_page_writeback(page);
614 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
615 local_irq_restore(flags);
620 * If a page's buffers are under async readin (end_buffer_async_read
621 * completion) then there is a possibility that another thread of
622 * control could lock one of the buffers after it has completed
623 * but while some of the other buffers have not completed. This
624 * locked buffer would confuse end_buffer_async_read() into not unlocking
625 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
626 * that this buffer is not under async I/O.
628 * The page comes unlocked when it has no locked buffer_async buffers
631 * PageLocked prevents anyone starting new async I/O reads any of
634 * PageWriteback is used to prevent simultaneous writeout of the same
637 * PageLocked prevents anyone from starting writeback of a page which is
638 * under read I/O (PageWriteback is only ever set against a locked page).
640 static void mark_buffer_async_read(struct buffer_head *bh)
642 bh->b_end_io = end_buffer_async_read;
643 set_buffer_async_read(bh);
646 void mark_buffer_async_write(struct buffer_head *bh)
648 bh->b_end_io = end_buffer_async_write;
649 set_buffer_async_write(bh);
651 EXPORT_SYMBOL(mark_buffer_async_write);
655 * fs/buffer.c contains helper functions for buffer-backed address space's
656 * fsync functions. A common requirement for buffer-based filesystems is
657 * that certain data from the backing blockdev needs to be written out for
658 * a successful fsync(). For example, ext2 indirect blocks need to be
659 * written back and waited upon before fsync() returns.
661 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
662 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
663 * management of a list of dependent buffers at ->i_mapping->private_list.
665 * Locking is a little subtle: try_to_free_buffers() will remove buffers
666 * from their controlling inode's queue when they are being freed. But
667 * try_to_free_buffers() will be operating against the *blockdev* mapping
668 * at the time, not against the S_ISREG file which depends on those buffers.
669 * So the locking for private_list is via the private_lock in the address_space
670 * which backs the buffers. Which is different from the address_space
671 * against which the buffers are listed. So for a particular address_space,
672 * mapping->private_lock does *not* protect mapping->private_list! In fact,
673 * mapping->private_list will always be protected by the backing blockdev's
676 * Which introduces a requirement: all buffers on an address_space's
677 * ->private_list must be from the same address_space: the blockdev's.
679 * address_spaces which do not place buffers at ->private_list via these
680 * utility functions are free to use private_lock and private_list for
681 * whatever they want. The only requirement is that list_empty(private_list)
682 * be true at clear_inode() time.
684 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
685 * filesystems should do that. invalidate_inode_buffers() should just go
686 * BUG_ON(!list_empty).
688 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
689 * take an address_space, not an inode. And it should be called
690 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
693 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
694 * list if it is already on a list. Because if the buffer is on a list,
695 * it *must* already be on the right one. If not, the filesystem is being
696 * silly. This will save a ton of locking. But first we have to ensure
697 * that buffers are taken *off* the old inode's list when they are freed
698 * (presumably in truncate). That requires careful auditing of all
699 * filesystems (do it inside bforget()). It could also be done by bringing
704 * The buffer's backing address_space's private_lock must be held
706 static inline void __remove_assoc_queue(struct buffer_head *bh)
708 list_del_init(&bh->b_assoc_buffers);
711 int inode_has_buffers(struct inode *inode)
713 return !list_empty(&inode->i_data.private_list);
717 * osync is designed to support O_SYNC io. It waits synchronously for
718 * all already-submitted IO to complete, but does not queue any new
719 * writes to the disk.
721 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
722 * you dirty the buffers, and then use osync_inode_buffers to wait for
723 * completion. Any other dirty buffers which are not yet queued for
724 * write will not be flushed to disk by the osync.
726 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
728 struct buffer_head *bh;
734 list_for_each_prev(p, list) {
736 if (buffer_locked(bh)) {
740 if (!buffer_uptodate(bh))
752 * sync_mapping_buffers - write out and wait upon a mapping's "associated"
754 * @mapping: the mapping which wants those buffers written
756 * Starts I/O against the buffers at mapping->private_list, and waits upon
759 * Basically, this is a convenience function for fsync().
760 * @mapping is a file or directory which needs those buffers to be written for
761 * a successful fsync().
763 int sync_mapping_buffers(struct address_space *mapping)
765 struct address_space *buffer_mapping = mapping->assoc_mapping;
767 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
770 return fsync_buffers_list(&buffer_mapping->private_lock,
771 &mapping->private_list);
773 EXPORT_SYMBOL(sync_mapping_buffers);
776 * Called when we've recently written block `bblock', and it is known that
777 * `bblock' was for a buffer_boundary() buffer. This means that the block at
778 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
779 * dirty, schedule it for IO. So that indirects merge nicely with their data.
781 void write_boundary_block(struct block_device *bdev,
782 sector_t bblock, unsigned blocksize)
784 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
786 if (buffer_dirty(bh))
787 ll_rw_block(WRITE, 1, &bh);
792 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
794 struct address_space *mapping = inode->i_mapping;
795 struct address_space *buffer_mapping = bh->b_page->mapping;
797 mark_buffer_dirty(bh);
798 if (!mapping->assoc_mapping) {
799 mapping->assoc_mapping = buffer_mapping;
801 BUG_ON(mapping->assoc_mapping != buffer_mapping);
803 if (list_empty(&bh->b_assoc_buffers)) {
804 spin_lock(&buffer_mapping->private_lock);
805 list_move_tail(&bh->b_assoc_buffers,
806 &mapping->private_list);
807 spin_unlock(&buffer_mapping->private_lock);
810 EXPORT_SYMBOL(mark_buffer_dirty_inode);
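/*
 * Hedged sketch (loosely the ext2-style pattern; names are illustrative)
 * of how a filesystem is expected to use the helpers above: queue metadata
 * buffers on the inode's ->private_list as they are dirtied, then flush
 * the list from ->fsync() with sync_mapping_buffers().
 */
static void example_dirty_indirect_block(struct buffer_head *bh,
					 struct inode *inode)
{
	/* file data now depends on this blockdev buffer reaching disk */
	mark_buffer_dirty_inode(bh, inode);
}

static int example_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;

	/* writes and waits upon everything on inode->i_mapping->private_list */
	return sync_mapping_buffers(inode->i_mapping);
}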
813 * Add a page to the dirty page list.
815 * It is a sad fact of life that this function is called from several places
816 * deeply under spinlocking. It may not sleep.
818 * If the page has buffers, the uptodate buffers are set dirty, to preserve
819 * dirty-state coherency between the page and the buffers. If the page does
820 * not have buffers then when they are later attached they will all be set
823 * The buffers are dirtied before the page is dirtied. There's a small race
824 * window in which a writepage caller may see the page cleanness but not the
825 * buffer dirtiness. That's fine. If this code were to set the page dirty
826 * before the buffers, a concurrent writepage caller could clear the page dirty
827 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
828 * page on the dirty page list.
830 * We use private_lock to lock against try_to_free_buffers while using the
831 * page's buffer list. Also use this to protect against clean buffers being
832 * added to the page after it was set dirty.
834 * FIXME: may need to call ->reservepage here as well. That's rather up to the
835 * address_space though.
837 int __set_page_dirty_buffers(struct page *page)
839 struct address_space * const mapping = page->mapping;
841 spin_lock(&mapping->private_lock);
842 if (page_has_buffers(page)) {
843 struct buffer_head *head = page_buffers(page);
844 struct buffer_head *bh = head;
847 set_buffer_dirty(bh);
848 bh = bh->b_this_page;
849 } while (bh != head);
851 spin_unlock(&mapping->private_lock);
853 if (!TestSetPageDirty(page)) {
854 write_lock_irq(&mapping->tree_lock);
855 if (page->mapping) { /* Race with truncate? */
856 if (mapping_cap_account_dirty(mapping))
857 inc_page_state(nr_dirty);
858 radix_tree_tag_set(&mapping->page_tree,
860 PAGECACHE_TAG_DIRTY);
862 write_unlock_irq(&mapping->tree_lock);
863 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
868 EXPORT_SYMBOL(__set_page_dirty_buffers);
871 * Write out and wait upon a list of buffers.
873 * We have conflicting pressures: we want to make sure that all
874 * initially dirty buffers get waited on, but that any subsequently
875 * dirtied buffers don't. After all, we don't want fsync to last
876 * forever if somebody is actively writing to the file.
878 * Do this in two main stages: first we copy dirty buffers to a
879 * temporary inode list, queueing the writes as we go. Then we clean
880 * up, waiting for those writes to complete.
882 * During this second stage, any subsequent updates to the file may end
883 * up refiling the buffer on the original inode's dirty list again, so
884 * there is a chance we will end up with a buffer queued for write but
885 * not yet completed on that list. So, as a final cleanup we go through
886 * the osync code to catch these locked, dirty buffers without requeuing
887 * any newly dirty buffers for write.
889 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
891 struct buffer_head *bh;
892 struct list_head tmp;
895 INIT_LIST_HEAD(&tmp);
898 while (!list_empty(list)) {
899 bh = BH_ENTRY(list->next);
900 list_del_init(&bh->b_assoc_buffers);
901 if (buffer_dirty(bh) || buffer_locked(bh)) {
902 list_add(&bh->b_assoc_buffers, &tmp);
903 if (buffer_dirty(bh)) {
907 * Ensure any pending I/O completes so that
908 * ll_rw_block() actually writes the current
909 * contents - it is a noop if I/O is still in
910 * flight on potentially older contents.
912 ll_rw_block(SWRITE, 1, &bh);
919 while (!list_empty(&tmp)) {
920 bh = BH_ENTRY(tmp.prev);
921 __remove_assoc_queue(bh);
925 if (!buffer_uptodate(bh))
932 err2 = osync_buffers_list(lock, list);
940 * Invalidate any and all dirty buffers on a given inode. We are
941 * probably unmounting the fs, but that doesn't mean we have already
942 * done a sync(). Just drop the buffers from the inode list.
944 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
945 * assumes that all the buffers are against the blockdev. Not true
948 void invalidate_inode_buffers(struct inode *inode)
950 if (inode_has_buffers(inode)) {
951 struct address_space *mapping = &inode->i_data;
952 struct list_head *list = &mapping->private_list;
953 struct address_space *buffer_mapping = mapping->assoc_mapping;
955 spin_lock(&buffer_mapping->private_lock);
956 while (!list_empty(list))
957 __remove_assoc_queue(BH_ENTRY(list->next));
958 spin_unlock(&buffer_mapping->private_lock);
963 * Remove any clean buffers from the inode's buffer list. This is called
964 * when we're trying to free the inode itself. Those buffers can pin it.
966 * Returns true if all buffers were removed.
968 int remove_inode_buffers(struct inode *inode)
972 if (inode_has_buffers(inode)) {
973 struct address_space *mapping = &inode->i_data;
974 struct list_head *list = &mapping->private_list;
975 struct address_space *buffer_mapping = mapping->assoc_mapping;
977 spin_lock(&buffer_mapping->private_lock);
978 while (!list_empty(list)) {
979 struct buffer_head *bh = BH_ENTRY(list->next);
980 if (buffer_dirty(bh)) {
984 __remove_assoc_queue(bh);
986 spin_unlock(&buffer_mapping->private_lock);
992 * Create the appropriate buffers when given a page for data area and
993 * the size of each buffer.. Use the bh->b_this_page linked list to
994 * follow the buffers created. Return NULL if unable to create more
997 * The retry flag is used to differentiate async IO (paging, swapping)
998 * which may not fail from ordinary buffer allocations.
1000 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
1003 struct buffer_head *bh, *head;
1009 while ((offset -= size) >= 0) {
1010 bh = alloc_buffer_head(GFP_NOFS);
1015 bh->b_this_page = head;
1020 atomic_set(&bh->b_count, 0);
1021 bh->b_private = NULL;
1024 /* Link the buffer to its page */
1025 set_bh_page(bh, page, offset);
1027 init_buffer(bh, NULL, NULL);
1031 * In case anything failed, we just free everything we got.
1037 head = head->b_this_page;
1038 free_buffer_head(bh);
1043 * Return failure for non-async IO requests. Async IO requests
1044 * are not allowed to fail, so we have to wait until buffer heads
1045 * become available. But we don't want tasks sleeping with
1046 * partially complete buffers, so all were released above.
1051 /* We're _really_ low on memory. Now we just
1052 * wait for old buffer heads to become free due to
1053 * finishing IO. Since this is an async request and
1054 * the reserve list is empty, we're sure there are
1055 * async buffer heads in use.
1060 EXPORT_SYMBOL_GPL(alloc_page_buffers);
1063 link_dev_buffers(struct page *page, struct buffer_head *head)
1065 struct buffer_head *bh, *tail;
1070 bh = bh->b_this_page;
1072 tail->b_this_page = head;
1073 attach_page_buffers(page, head);
1077 * Initialise the state of a blockdev page's buffers.
1080 init_page_buffers(struct page *page, struct block_device *bdev,
1081 sector_t block, int size)
1083 struct buffer_head *head = page_buffers(page);
1084 struct buffer_head *bh = head;
1085 int uptodate = PageUptodate(page);
1088 if (!buffer_mapped(bh)) {
1089 init_buffer(bh, NULL, NULL);
1091 bh->b_blocknr = block;
1093 set_buffer_uptodate(bh);
1094 set_buffer_mapped(bh);
1097 bh = bh->b_this_page;
1098 } while (bh != head);
1102 * Create the page-cache page that contains the requested block.
1104 * This is used purely for blockdev mappings.
1106 static struct page *
1107 grow_dev_page(struct block_device *bdev, sector_t block,
1108 pgoff_t index, int size)
1110 struct inode *inode = bdev->bd_inode;
1112 struct buffer_head *bh;
1114 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
1118 BUG_ON(!PageLocked(page));
1120 if (page_has_buffers(page)) {
1121 bh = page_buffers(page);
1122 if (bh->b_size == size) {
1123 init_page_buffers(page, bdev, block, size);
1126 if (!try_to_free_buffers(page))
1131 * Allocate some buffers for this page
1133 bh = alloc_page_buffers(page, size, 0);
1138 * Link the page to the buffers and initialise them. Take the
1139 * lock to be atomic wrt __find_get_block(), which does not
1140 * run under the page lock.
1142 spin_lock(&inode->i_mapping->private_lock);
1143 link_dev_buffers(page, bh);
1144 init_page_buffers(page, bdev, block, size);
1145 spin_unlock(&inode->i_mapping->private_lock);
1151 page_cache_release(page);
1156 * Create buffers for the specified block device block's page. If
1157 * that page was dirty, the buffers are set dirty also.
1159 * Except that's a bug. Attaching dirty buffers to a dirty
1160 * blockdev's page can result in filesystem corruption, because
1161 * some of those buffers may be aliases of filesystem data.
1162 * grow_dev_page() will go BUG() if this happens.
1165 grow_buffers(struct block_device *bdev, sector_t block, int size)
1174 } while ((size << sizebits) < PAGE_SIZE);
1176 index = block >> sizebits;
1177 block = index << sizebits;
1179 /* Create a page with the proper size buffers.. */
1180 page = grow_dev_page(bdev, block, index, size);
1184 page_cache_release(page);
1188 static struct buffer_head *
1189 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1191 /* Size must be multiple of hard sectorsize */
1192 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1193 (size < 512 || size > PAGE_SIZE))) {
1194 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1196 printk(KERN_ERR "hardsect size: %d\n",
1197 bdev_hardsect_size(bdev));
1204 struct buffer_head * bh;
1206 bh = __find_get_block(bdev, block, size);
1210 if (!grow_buffers(bdev, block, size))
1216 * The relationship between dirty buffers and dirty pages:
1218 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1219 * the page is tagged dirty in its radix tree.
1221 * At all times, the dirtiness of the buffers represents the dirtiness of
1222 * subsections of the page. If the page has buffers, the page dirty bit is
1223 * merely a hint about the true dirty state.
1225 * When a page is set dirty in its entirety, all its buffers are marked dirty
1226 * (if the page has buffers).
1228 * When a buffer is marked dirty, its page is dirtied, but the page's other
1231 * Also. When blockdev buffers are explicitly read with bread(), they
1232 * individually become uptodate. But their backing page remains not
1233 * uptodate - even if all of its buffers are uptodate. A subsequent
1234 * block_read_full_page() against that page will discover all the uptodate
1235 * buffers, will set the page uptodate and will perform no I/O.
1239 * mark_buffer_dirty - mark a buffer_head as needing writeout
1240 * @bh: the buffer_head to mark dirty
1242 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1243 * backing page dirty, then tag the page as dirty in its address_space's radix
1244 * tree and then attach the address_space's inode to its superblock's dirty
1247 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1248 * mapping->tree_lock and the global inode_lock.
1250 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1252 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
1253 __set_page_dirty_nobuffers(bh->b_page);
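/*
 * Minimal illustrative sketch (not from this file): initialise a freshly
 * allocated metadata block entirely in memory and let writeback get it to
 * disk; sb_getblk()/__getblk() cannot fail, so no NULL check is needed.
 */
static void example_init_new_block(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_getblk(sb, blocknr);

	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);	/* in-memory contents are now valid */
	unlock_buffer(bh);
	mark_buffer_dirty(bh);		/* dirties the bh, its page and inode */
	brelse(bh);
}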
1257 * Decrement a buffer_head's reference count. If all buffers against a page
1258 * have zero reference count, are clean and unlocked, and if the page is clean
1259 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1260 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1261 * a page but it ends up not being freed, and buffers may later be reattached).
1263 void __brelse(struct buffer_head * buf)
1265 if (atomic_read(&buf->b_count)) {
1269 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1274 * bforget() is like brelse(), except it discards any
1275 * potentially dirty data.
1277 void __bforget(struct buffer_head *bh)
1279 clear_buffer_dirty(bh);
1280 if (!list_empty(&bh->b_assoc_buffers)) {
1281 struct address_space *buffer_mapping = bh->b_page->mapping;
1283 spin_lock(&buffer_mapping->private_lock);
1284 list_del_init(&bh->b_assoc_buffers);
1285 spin_unlock(&buffer_mapping->private_lock);
1290 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1293 if (buffer_uptodate(bh)) {
1298 bh->b_end_io = end_buffer_read_sync;
1299 submit_bh(READ, bh);
1301 if (buffer_uptodate(bh))
1309 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1310 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1311 * refcount elevated by one when they're in an LRU. A buffer can only appear
1312 * once in a particular CPU's LRU. A single buffer can be present in multiple
1313 * CPU's LRUs at the same time.
1315 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1316 * sb_find_get_block().
1318 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1319 * a local interrupt disable for that.
1322 #define BH_LRU_SIZE 8
1325 struct buffer_head *bhs[BH_LRU_SIZE];
1328 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1331 #define bh_lru_lock() local_irq_disable()
1332 #define bh_lru_unlock() local_irq_enable()
1334 #define bh_lru_lock() preempt_disable()
1335 #define bh_lru_unlock() preempt_enable()
1338 static int lru_disabled __read_mostly;
1340 static inline void check_irqs_on(void)
1342 #ifdef irqs_disabled
1343 BUG_ON(irqs_disabled());
1348 * The LRU management algorithm is dopey-but-simple. Sorry.
1350 static void bh_lru_install(struct buffer_head *bh)
1352 struct buffer_head *evictee = NULL;
1360 lru = &__get_cpu_var(bh_lrus);
1361 if (lru->bhs[0] != bh) {
1362 struct buffer_head *bhs[BH_LRU_SIZE];
1368 for (in = 0; in < BH_LRU_SIZE; in++) {
1369 struct buffer_head *bh2 = lru->bhs[in];
1374 if (out >= BH_LRU_SIZE) {
1375 BUG_ON(evictee != NULL);
1382 while (out < BH_LRU_SIZE)
1384 memcpy(lru->bhs, bhs, sizeof(bhs));
1393 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1395 static struct buffer_head *
1396 lookup_bh_lru(struct block_device *bdev, sector_t block, int size)
1398 struct buffer_head *ret = NULL;
1407 lru = &__get_cpu_var(bh_lrus);
1408 for (i = 0; i < BH_LRU_SIZE; i++) {
1409 struct buffer_head *bh = lru->bhs[i];
1411 if (bh && bh->b_bdev == bdev &&
1412 bh->b_blocknr == block && bh->b_size == size) {
1415 lru->bhs[i] = lru->bhs[i - 1];
1430 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1431 * it in the LRU and mark it as accessed. If it is not present then return
1434 struct buffer_head *
1435 __find_get_block(struct block_device *bdev, sector_t block, int size)
1437 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1440 bh = __find_get_block_slow(bdev, block);
1448 EXPORT_SYMBOL(__find_get_block);
1451 * __getblk will locate (and, if necessary, create) the buffer_head
1452 * which corresponds to the passed block_device, block and size. The
1453 * returned buffer has its reference count incremented.
1455 * __getblk() cannot fail - it just keeps trying. If you pass it an
1456 * illegal block number, __getblk() will happily return a buffer_head
1457 * which represents the non-existent block. Very weird.
1459 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1460 * attempt is failing. FIXME, perhaps?
1462 struct buffer_head *
1463 __getblk(struct block_device *bdev, sector_t block, int size)
1465 struct buffer_head *bh = __find_get_block(bdev, block, size);
1469 bh = __getblk_slow(bdev, block, size);
1472 EXPORT_SYMBOL(__getblk);
1475 * Do async read-ahead on a buffer..
1477 void __breadahead(struct block_device *bdev, sector_t block, int size)
1479 struct buffer_head *bh = __getblk(bdev, block, size);
1481 ll_rw_block(READA, 1, &bh);
1485 EXPORT_SYMBOL(__breadahead);
1488 * __bread() - reads a specified block and returns the bh
1489 * @bdev: the block_device to read from
1490 * @block: number of block
1491 * @size: size (in bytes) to read
1493 * Reads a specified block, and returns buffer head that contains it.
1494 * It returns NULL if the block was unreadable.
1496 struct buffer_head *
1497 __bread(struct block_device *bdev, sector_t block, int size)
1499 struct buffer_head *bh = __getblk(bdev, block, size);
1501 if (likely(bh) && !buffer_uptodate(bh))
1502 bh = __bread_slow(bh);
1505 EXPORT_SYMBOL(__bread);
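/*
 * Typical read-side usage (illustrative sketch; the function name is made
 * up): sb_bread() wraps __bread(), which returns NULL on I/O error, so
 * callers must check before touching b_data.
 */
static int example_read_metadata(struct super_block *sb, sector_t blocknr)
{
	struct buffer_head *bh = sb_bread(sb, blocknr);

	if (!bh)
		return -EIO;		/* the block was unreadable */
	/* ... parse bh->b_data here ... */
	brelse(bh);
	return 0;
}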
1508 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1509 * This doesn't race because it runs in each cpu either in irq
1510 * or with preempt disabled.
1512 static void invalidate_bh_lru(void *arg)
1514 struct bh_lru *b = &get_cpu_var(bh_lrus);
1517 for (i = 0; i < BH_LRU_SIZE; i++) {
1521 put_cpu_var(bh_lrus);
1524 static void invalidate_bh_lrus(void)
1527 on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
1530 void set_bh_page(struct buffer_head *bh,
1531 struct page *page, unsigned long offset)
1534 BUG_ON(offset >= PAGE_SIZE);
1535 if (PageHighMem(page))
1537 * This catches illegal uses and preserves the offset:
1539 bh->b_data = (char *)(0 + offset);
1541 bh->b_data = page_address(page) + offset;
1543 EXPORT_SYMBOL(set_bh_page);
1546 * Called when truncating a buffer on a page completely.
1548 static void discard_buffer(struct buffer_head * bh)
1551 clear_buffer_dirty(bh);
1553 clear_buffer_mapped(bh);
1554 clear_buffer_req(bh);
1555 clear_buffer_new(bh);
1556 clear_buffer_delay(bh);
1561 * try_to_release_page() - release old fs-specific metadata on a page
1563 * @page: the page which the kernel is trying to free
1564 * @gfp_mask: memory allocation flags (and I/O mode)
1566 * The address_space is to try to release any data against the page
1567 * (presumably at page->private). If the release was successful, return `1'.
1568 * Otherwise return zero.
1570 * The @gfp_mask argument specifies whether I/O may be performed to release
1571 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
1573 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
1575 int try_to_release_page(struct page *page, gfp_t gfp_mask)
1577 struct address_space * const mapping = page->mapping;
1579 BUG_ON(!PageLocked(page));
1580 if (PageWriteback(page))
1583 if (mapping && mapping->a_ops->releasepage)
1584 return mapping->a_ops->releasepage(page, gfp_mask);
1585 return try_to_free_buffers(page);
1587 EXPORT_SYMBOL(try_to_release_page);
1590 * block_invalidatepage - invalidate part or all of a buffer-backed page
1592 * @page: the page which is affected
1593 * @offset: the index of the truncation point
1595 * block_invalidatepage() is called when all or part of the page has become
1596 * invalidated by a truncate operation.
1598 * block_invalidatepage() does not have to release all buffers, but it must
1599 * ensure that no dirty buffer is left outside @offset and that no I/O
1600 * is underway against any of the blocks which are outside the truncation
1601 * point. Because the caller is about to free (and possibly reuse) those
1604 void block_invalidatepage(struct page *page, unsigned long offset)
1606 struct buffer_head *head, *bh, *next;
1607 unsigned int curr_off = 0;
1609 BUG_ON(!PageLocked(page));
1610 if (!page_has_buffers(page))
1613 head = page_buffers(page);
1616 unsigned int next_off = curr_off + bh->b_size;
1617 next = bh->b_this_page;
1620 * is this block fully invalidated?
1622 if (offset <= curr_off)
1624 curr_off = next_off;
1626 } while (bh != head);
1629 * We release buffers only if the entire page is being invalidated.
1630 * The get_block cached value has been unconditionally invalidated,
1631 * so real IO is not possible anymore.
1634 try_to_release_page(page, 0);
1638 EXPORT_SYMBOL(block_invalidatepage);
1640 void do_invalidatepage(struct page *page, unsigned long offset)
1642 void (*invalidatepage)(struct page *, unsigned long);
1643 invalidatepage = page->mapping->a_ops->invalidatepage ? :
1644 block_invalidatepage;
1645 (*invalidatepage)(page, offset);
1649 * We attach and possibly dirty the buffers atomically wrt
1650 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1651 * is already excluded via the page lock.
1653 void create_empty_buffers(struct page *page,
1654 unsigned long blocksize, unsigned long b_state)
1656 struct buffer_head *bh, *head, *tail;
1658 head = alloc_page_buffers(page, blocksize, 1);
1661 bh->b_state |= b_state;
1663 bh = bh->b_this_page;
1665 tail->b_this_page = head;
1667 spin_lock(&page->mapping->private_lock);
1668 if (PageUptodate(page) || PageDirty(page)) {
1671 if (PageDirty(page))
1672 set_buffer_dirty(bh);
1673 if (PageUptodate(page))
1674 set_buffer_uptodate(bh);
1675 bh = bh->b_this_page;
1676 } while (bh != head);
1678 attach_page_buffers(page, head);
1679 spin_unlock(&page->mapping->private_lock);
1681 EXPORT_SYMBOL(create_empty_buffers);
1684 * We are taking a block for data and we don't want any output from any
1685 * buffer-cache aliases starting from return from that function and
1686 * until the moment when something will explicitly mark the buffer
1687 * dirty (hopefully that will not happen until we will free that block ;-)
1688 * We don't even need to mark it not-uptodate - nobody can expect
1689 * anything from a newly allocated buffer anyway. We used to use
1690 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1691 * don't want to mark the alias unmapped, for example - it would confuse
1692 * anyone who might pick it with bread() afterwards...
1694 * Also.. Note that bforget() doesn't lock the buffer. So there can
1695 * be writeout I/O going on against recently-freed buffers. We don't
1696 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1697 * only if we really need to. That happens here.
1699 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1701 struct buffer_head *old_bh;
1705 old_bh = __find_get_block_slow(bdev, block);
1707 clear_buffer_dirty(old_bh);
1708 wait_on_buffer(old_bh);
1709 clear_buffer_req(old_bh);
1713 EXPORT_SYMBOL(unmap_underlying_metadata);
1716 * NOTE! All mapped/uptodate combinations are valid:
1718 * Mapped Uptodate Meaning
1720 * No No "unknown" - must do get_block()
1721 * No Yes "hole" - zero-filled
1722 * Yes No "allocated" - allocated on disk, not read in
1723 * Yes Yes "valid" - allocated and up-to-date in memory.
1725 * "Dirty" is valid only with the last case (mapped+uptodate).
1729 * While block_write_full_page is writing back the dirty buffers under
1730 * the page lock, whoever dirtied the buffers may decide to clean them
1731 * again at any time. We handle that by only looking at the buffer
1732 * state inside lock_buffer().
1734 * If block_write_full_page() is called for regular writeback
1735 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1736 * locked buffer. This only can happen if someone has written the buffer
1737 * directly, with submit_bh(). At the address_space level PageWriteback
1738 * prevents this contention from occurring.
1740 static int __block_write_full_page(struct inode *inode, struct page *page,
1741 get_block_t *get_block, struct writeback_control *wbc)
1745 sector_t last_block;
1746 struct buffer_head *bh, *head;
1747 const unsigned blocksize = 1 << inode->i_blkbits;
1748 int nr_underway = 0;
1750 BUG_ON(!PageLocked(page));
1752 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1754 if (!page_has_buffers(page)) {
1755 create_empty_buffers(page, blocksize,
1756 (1 << BH_Dirty)|(1 << BH_Uptodate));
1760 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1761 * here, and the (potentially unmapped) buffers may become dirty at
1762 * any time. If a buffer becomes dirty here after we've inspected it
1763 * then we just miss that fact, and the page stays dirty.
1765 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1766 * handle that here by just cleaning them.
1769 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1770 head = page_buffers(page);
1774 * Get all the dirty buffers mapped to disk addresses and
1775 * handle any aliases from the underlying blockdev's mapping.
1778 if (block > last_block) {
1780 * mapped buffers outside i_size will occur, because
1781 * this page can be outside i_size when there is a
1782 * truncate in progress.
1785 * The buffer was zeroed by block_write_full_page()
1787 clear_buffer_dirty(bh);
1788 set_buffer_uptodate(bh);
1789 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1790 WARN_ON(bh->b_size != blocksize);
1791 err = get_block(inode, block, bh, 1);
1794 if (buffer_new(bh)) {
1795 /* blockdev mappings never come here */
1796 clear_buffer_new(bh);
1797 unmap_underlying_metadata(bh->b_bdev,
1801 bh = bh->b_this_page;
1803 } while (bh != head);
1806 if (!buffer_mapped(bh))
1809 * If it's a fully non-blocking write attempt and we cannot
1810 * lock the buffer then redirty the page. Note that this can
1811 * potentially cause a busy-wait loop from pdflush and kswapd
1812 * activity, but those code paths have their own higher-level
1815 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1817 } else if (test_set_buffer_locked(bh)) {
1818 redirty_page_for_writepage(wbc, page);
1821 if (test_clear_buffer_dirty(bh)) {
1822 mark_buffer_async_write(bh);
1826 } while ((bh = bh->b_this_page) != head);
1829 * The page and its buffers are protected by PageWriteback(), so we can
1830 * drop the bh refcounts early.
1832 BUG_ON(PageWriteback(page));
1833 set_page_writeback(page);
1836 struct buffer_head *next = bh->b_this_page;
1837 if (buffer_async_write(bh)) {
1838 submit_bh(WRITE, bh);
1842 } while (bh != head);
1847 if (nr_underway == 0) {
1849 * The page was marked dirty, but the buffers were
1850 * clean. Someone wrote them back by hand with
1851 * ll_rw_block/submit_bh. A rare case.
1855 if (!buffer_uptodate(bh)) {
1859 bh = bh->b_this_page;
1860 } while (bh != head);
1862 SetPageUptodate(page);
1863 end_page_writeback(page);
1865 * The page and buffer_heads can be released at any time from
1868 wbc->pages_skipped++; /* We didn't write this page */
1874 * ENOSPC, or some other error. We may already have added some
1875 * blocks to the file, so we need to write these out to avoid
1876 * exposing stale data.
1877 * The page is currently locked and not marked for writeback
1880 /* Recovery: lock and submit the mapped buffers */
1882 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1884 mark_buffer_async_write(bh);
1887 * The buffer may have been set dirty during
1888 * attachment to a dirty page.
1890 clear_buffer_dirty(bh);
1892 } while ((bh = bh->b_this_page) != head);
1894 BUG_ON(PageWriteback(page));
1895 set_page_writeback(page);
1898 struct buffer_head *next = bh->b_this_page;
1899 if (buffer_async_write(bh)) {
1900 clear_buffer_dirty(bh);
1901 submit_bh(WRITE, bh);
1905 } while (bh != head);
1909 static int __block_prepare_write(struct inode *inode, struct page *page,
1910 unsigned from, unsigned to, get_block_t *get_block)
1912 unsigned block_start, block_end;
1915 unsigned blocksize, bbits;
1916 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1918 BUG_ON(!PageLocked(page));
1919 BUG_ON(from > PAGE_CACHE_SIZE);
1920 BUG_ON(to > PAGE_CACHE_SIZE);
1923 blocksize = 1 << inode->i_blkbits;
1924 if (!page_has_buffers(page))
1925 create_empty_buffers(page, blocksize, 0);
1926 head = page_buffers(page);
1928 bbits = inode->i_blkbits;
1929 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1931 for(bh = head, block_start = 0; bh != head || !block_start;
1932 block++, block_start=block_end, bh = bh->b_this_page) {
1933 block_end = block_start + blocksize;
1934 if (block_end <= from || block_start >= to) {
1935 if (PageUptodate(page)) {
1936 if (!buffer_uptodate(bh))
1937 set_buffer_uptodate(bh);
1942 clear_buffer_new(bh);
1943 if (!buffer_mapped(bh)) {
1944 WARN_ON(bh->b_size != blocksize);
1945 err = get_block(inode, block, bh, 1);
1948 if (buffer_new(bh)) {
1949 unmap_underlying_metadata(bh->b_bdev,
1951 if (PageUptodate(page)) {
1952 set_buffer_uptodate(bh);
1955 if (block_end > to || block_start < from) {
1958 kaddr = kmap_atomic(page, KM_USER0);
1962 if (block_start < from)
1963 memset(kaddr+block_start,
1964 0, from-block_start);
1965 flush_dcache_page(page);
1966 kunmap_atomic(kaddr, KM_USER0);
1971 if (PageUptodate(page)) {
1972 if (!buffer_uptodate(bh))
1973 set_buffer_uptodate(bh);
1976 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1977 (block_start < from || block_end > to)) {
1978 ll_rw_block(READ, 1, &bh);
1983 * If we issued read requests - let them complete.
1985 while(wait_bh > wait) {
1986 wait_on_buffer(*--wait_bh);
1987 if (!buffer_uptodate(*wait_bh))
1994 clear_buffer_new(bh);
1995 } while ((bh = bh->b_this_page) != head);
2000 * Zero out any newly allocated blocks to avoid exposing stale
2001 * data. If BH_New is set, we know that the block was newly
2002 * allocated in the above loop.
2007 block_end = block_start+blocksize;
2008 if (block_end <= from)
2010 if (block_start >= to)
2012 if (buffer_new(bh)) {
2015 clear_buffer_new(bh);
2016 kaddr = kmap_atomic(page, KM_USER0);
2017 memset(kaddr+block_start, 0, bh->b_size);
2018 kunmap_atomic(kaddr, KM_USER0);
2019 set_buffer_uptodate(bh);
2020 mark_buffer_dirty(bh);
2023 block_start = block_end;
2024 bh = bh->b_this_page;
2025 } while (bh != head);
2029 static int __block_commit_write(struct inode *inode, struct page *page,
2030 unsigned from, unsigned to)
2032 unsigned block_start, block_end;
2035 struct buffer_head *bh, *head;
2037 blocksize = 1 << inode->i_blkbits;
2039 for(bh = head = page_buffers(page), block_start = 0;
2040 bh != head || !block_start;
2041 block_start=block_end, bh = bh->b_this_page) {
2042 block_end = block_start + blocksize;
2043 if (block_end <= from || block_start >= to) {
2044 if (!buffer_uptodate(bh))
2047 set_buffer_uptodate(bh);
2048 mark_buffer_dirty(bh);
2053 * If this is a partial write which happened to make all buffers
2054 * uptodate then we can optimize away a bogus readpage() for
2055 * the next read(). Here we 'discover' whether the page went
2056 * uptodate as a result of this (potentially partial) write.
2059 SetPageUptodate(page);
2064 * Generic "read page" function for block devices that have the normal
2065 * get_block functionality. This is most of the block device filesystems.
2066 * Reads the page asynchronously --- the unlock_buffer() and
2067 * set/clear_buffer_uptodate() functions propagate buffer state into the
2068 * page struct once IO has completed.
2070 int block_read_full_page(struct page *page, get_block_t *get_block)
2072 struct inode *inode = page->mapping->host;
2073 sector_t iblock, lblock;
2074 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2075 unsigned int blocksize;
2077 int fully_mapped = 1;
2079 BUG_ON(!PageLocked(page));
2080 blocksize = 1 << inode->i_blkbits;
2081 if (!page_has_buffers(page))
2082 create_empty_buffers(page, blocksize, 0);
2083 head = page_buffers(page);
2085 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2086 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2092 if (buffer_uptodate(bh))
2095 if (!buffer_mapped(bh)) {
2099 if (iblock < lblock) {
2100 WARN_ON(bh->b_size != blocksize);
2101 err = get_block(inode, iblock, bh, 0);
2105 if (!buffer_mapped(bh)) {
2106 void *kaddr = kmap_atomic(page, KM_USER0);
2107 memset(kaddr + i * blocksize, 0, blocksize);
2108 flush_dcache_page(page);
2109 kunmap_atomic(kaddr, KM_USER0);
2111 set_buffer_uptodate(bh);
2115 * get_block() might have updated the buffer
2118 if (buffer_uptodate(bh))
2122 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2125 SetPageMappedToDisk(page);
2129 * All buffers are uptodate - we can set the page uptodate
2130 * as well. But not if get_block() returned an error.
2132 if (!PageError(page))
2133 SetPageUptodate(page);
2138 /* Stage two: lock the buffers */
2139 for (i = 0; i < nr; i++) {
2142 mark_buffer_async_read(bh);
2146 * Stage 3: start the IO. Check for uptodateness
2147 * inside the buffer lock in case another process reading
2148 * the underlying blockdev brought it uptodate (the sct fix).
2150 for (i = 0; i < nr; i++) {
2152 if (buffer_uptodate(bh))
2153 end_buffer_async_read(bh, 1);
2155 submit_bh(READ, bh);
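/*
 * Sketch (not from this file) of how a filesystem typically wires this up;
 * example_get_block is the hypothetical mapper sketched earlier.
 */
static int example_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, example_get_block);
}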
2160 /* utility function for filesystems that need to do work on expanding
2161 * truncates. Uses prepare/commit_write to allow the filesystem to
2162 * deal with the hole.
2164 static int __generic_cont_expand(struct inode *inode, loff_t size,
2165 pgoff_t index, unsigned int offset)
2167 struct address_space *mapping = inode->i_mapping;
2169 unsigned long limit;
2173 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2174 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2175 send_sig(SIGXFSZ, current, 0);
2178 if (size > inode->i_sb->s_maxbytes)
2182 page = grab_cache_page(mapping, index);
2185 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
2188 * ->prepare_write() may have instantiated a few blocks
2189 * outside i_size. Trim these off again.
2192 page_cache_release(page);
2193 vmtruncate(inode, inode->i_size);
2197 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
2200 page_cache_release(page);
2207 int generic_cont_expand(struct inode *inode, loff_t size)
2210 unsigned int offset;
2212 offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */
2214 /* ugh. in prepare/commit_write, if from==to==start of block, we
2215 ** skip the prepare. make sure we never send an offset for the start
2218 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
2219 /* caller must handle this extra byte. */
2222 index = size >> PAGE_CACHE_SHIFT;
2224 return __generic_cont_expand(inode, size, index, offset);
2227 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2229 loff_t pos = size - 1;
2230 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2231 unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1;
2233 /* prepare/commit_write can handle even if from==to==start of block. */
2234 return __generic_cont_expand(inode, size, index, offset);
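/*
 * Hedged sketch of the usual caller: a filesystem growing a file from its
 * ->setattr() with generic_cont_expand_simple() (illustrative, error
 * handling trimmed).
 */
static int example_setattr_expand(struct inode *inode, struct iattr *attr)
{
	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
		return generic_cont_expand_simple(inode, attr->ia_size);
	return 0;
}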
2238 * For moronic filesystems that do not allow holes in file.
2239 * We may have to extend the file.
2242 int cont_prepare_write(struct page *page, unsigned offset,
2243 unsigned to, get_block_t *get_block, loff_t *bytes)
2245 struct address_space *mapping = page->mapping;
2246 struct inode *inode = mapping->host;
2247 struct page *new_page;
2251 unsigned blocksize = 1 << inode->i_blkbits;
2254 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
2256 new_page = grab_cache_page(mapping, pgpos);
2259 /* we might sleep */
2260 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
2261 unlock_page(new_page);
2262 page_cache_release(new_page);
2265 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2266 if (zerofrom & (blocksize-1)) {
2267 *bytes |= (blocksize-1);
2270 status = __block_prepare_write(inode, new_page, zerofrom,
2271 PAGE_CACHE_SIZE, get_block);
2274 kaddr = kmap_atomic(new_page, KM_USER0);
2275 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
2276 flush_dcache_page(new_page);
2277 kunmap_atomic(kaddr, KM_USER0);
2278 generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE);
2279 unlock_page(new_page);
2280 page_cache_release(new_page);
2283 if (page->index < pgpos) {
2284 /* completely inside the area */
2287 /* page covers the boundary, find the boundary offset */
2288 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2290 /* if we will expand the thing last block will be filled */
2291 if (to > zerofrom && (zerofrom & (blocksize-1))) {
2292 *bytes |= (blocksize-1);
2296 /* starting below the boundary? Nothing to zero out */
2297 if (offset <= zerofrom)
2300 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
2303 if (zerofrom < offset) {
2304 kaddr = kmap_atomic(page, KM_USER0);
2305 memset(kaddr+zerofrom, 0, offset-zerofrom);
2306 flush_dcache_page(page);
2307 kunmap_atomic(kaddr, KM_USER0);
2308 __block_commit_write(inode, page, zerofrom, offset);
2312 ClearPageUptodate(page);
2316 ClearPageUptodate(new_page);
2317 unlock_page(new_page);
2318 page_cache_release(new_page);
2323 int block_prepare_write(struct page *page, unsigned from, unsigned to,
2324 get_block_t *get_block)
2326 struct inode *inode = page->mapping->host;
2327 int err = __block_prepare_write(inode, page, from, to, get_block);
2329 ClearPageUptodate(page);
2333 int block_commit_write(struct page *page, unsigned from, unsigned to)
2335 struct inode *inode = page->mapping->host;
2336 __block_commit_write(inode,page,from,to);
2340 int generic_commit_write(struct file *file, struct page *page,
2341 unsigned from, unsigned to)
2343 struct inode *inode = page->mapping->host;
2344 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2345 __block_commit_write(inode,page,from,to);
2347 * No need to use i_size_read() here, the i_size
2348 * cannot change under us because we hold i_mutex.
2350 if (pos > inode->i_size) {
2351 i_size_write(inode, pos);
2352 mark_inode_dirty(inode);
2359 * nobh_prepare_write()'s prereads are special: the buffer_heads are freed
2360 * immediately, while under the page lock. So it needs a special end_io
2361 * handler which does not touch the bh after unlocking it.
2363 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
2364 * a race there is benign: unlock_buffer() only uses the bh's address for
2365 * hashing after unlocking the buffer, so it doesn't actually touch the bh
2368 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2371 set_buffer_uptodate(bh);
2373 /* This happens, due to failed READA attempts. */
2374 clear_buffer_uptodate(bh);
2380 * On entry, the page is fully not uptodate.
2381 * On exit the page is fully uptodate in the areas outside (from,to)
2383 int nobh_prepare_write(struct page *page, unsigned from, unsigned to,
2384 get_block_t *get_block)
2386 struct inode *inode = page->mapping->host;
2387 const unsigned blkbits = inode->i_blkbits;
2388 const unsigned blocksize = 1 << blkbits;
2389 struct buffer_head map_bh;
2390 struct buffer_head *read_bh[MAX_BUF_PER_PAGE];
2391 unsigned block_in_page;
2392 unsigned block_start;
2393 sector_t block_in_file;
2398 int is_mapped_to_disk = 1;
2401 if (PageMappedToDisk(page))
2404 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2405 map_bh.b_page = page;
2408 * We loop across all blocks in the page, whether or not they are
2409 * part of the affected region. This is so we can discover if the
2410 * page is fully mapped-to-disk.
2412 for (block_start = 0, block_in_page = 0;
2413 block_start < PAGE_CACHE_SIZE;
2414 block_in_page++, block_start += blocksize) {
2415 unsigned block_end = block_start + blocksize;
2420 if (block_start >= to)
2422 map_bh.b_size = blocksize;
2423 ret = get_block(inode, block_in_file + block_in_page,
2427 if (!buffer_mapped(&map_bh))
2428 is_mapped_to_disk = 0;
2429 if (buffer_new(&map_bh))
2430 unmap_underlying_metadata(map_bh.b_bdev,
2432 if (PageUptodate(page))
2434 if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) {
2435 kaddr = kmap_atomic(page, KM_USER0);
2436 if (block_start < from) {
2437 memset(kaddr+block_start, 0, from-block_start);
2440 if (block_end > to) {
2441 memset(kaddr + to, 0, block_end - to);
2444 flush_dcache_page(page);
2445 kunmap_atomic(kaddr, KM_USER0);
2448 if (buffer_uptodate(&map_bh))
2449 continue; /* reiserfs does this */
2450 if (block_start < from || block_end > to) {
2451 struct buffer_head *bh = alloc_buffer_head(GFP_NOFS);
2457 bh->b_state = map_bh.b_state;
2458 atomic_set(&bh->b_count, 0);
2459 bh->b_this_page = NULL;
2461 bh->b_blocknr = map_bh.b_blocknr;
2462 bh->b_size = blocksize;
2463 bh->b_data = (char *)(long)block_start;
2464 bh->b_bdev = map_bh.b_bdev;
2465 bh->b_private = NULL;
2466 read_bh[nr_reads++] = bh;
2471 struct buffer_head *bh;
2474 * The page is locked, so these buffers are protected from
2475 * any VM or truncate activity. Hence we don't need to care
2476 * for the buffer_head refcounts.
2478 for (i = 0; i < nr_reads; i++) {
2481 bh->b_end_io = end_buffer_read_nobh;
2482 submit_bh(READ, bh);
2484 for (i = 0; i < nr_reads; i++) {
2487 if (!buffer_uptodate(bh))
2489 free_buffer_head(bh);
2496 if (is_mapped_to_disk)
2497 SetPageMappedToDisk(page);
2498 SetPageUptodate(page);
2501 * Setting the page dirty here isn't necessary for the prepare_write
2502 * function - commit_write will do that. But if/when this function is
2503 * used within the pagefault handler to ensure that all mmapped pages
2504 * have backing space in the filesystem, we will need to dirty the page
2505 * if its contents were altered.
2508 set_page_dirty(page);
2513 for (i = 0; i < nr_reads; i++) {
2515 free_buffer_head(read_bh[i]);
2519 * Error recovery is pretty slack. Clear the page and mark it dirty
2520 * so we'll later zero out any blocks which _were_ allocated.
2522 kaddr = kmap_atomic(page, KM_USER0);
2523 memset(kaddr, 0, PAGE_CACHE_SIZE);
2524 kunmap_atomic(kaddr, KM_USER0);
2525 SetPageUptodate(page);
2526 set_page_dirty(page);
2529 EXPORT_SYMBOL(nobh_prepare_write);
2531 int nobh_commit_write(struct file *file, struct page *page,
2532 unsigned from, unsigned to)
2534 struct inode *inode = page->mapping->host;
2535 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2537 set_page_dirty(page);
2538 if (pos > inode->i_size) {
2539 i_size_write(inode, pos);
2540 mark_inode_dirty(inode);
2544 EXPORT_SYMBOL(nobh_commit_write);
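/*
 * Illustrative only: a filesystem that wants to keep buffer_heads off its
 * data pages can pair the two helpers above in its address_space_operations.
 * The foofs_* names are hypothetical; foofs_get_block stands for the
 * filesystem's own get_block_t:
 *
 *	static int foofs_nobh_prepare_write(struct file *file, struct page *page,
 *					    unsigned from, unsigned to)
 *	{
 *		return nobh_prepare_write(page, from, to, foofs_get_block);
 *	}
 *
 *	static struct address_space_operations foofs_nobh_aops = {
 *		...
 *		.prepare_write	= foofs_nobh_prepare_write,
 *		.commit_write	= nobh_commit_write,
 *	};
 */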
2547 * nobh_writepage() - based on block_write_full_page() except
2548 * that it tries to operate without attaching bufferheads to
2551 int nobh_writepage(struct page *page, get_block_t *get_block,
2552 struct writeback_control *wbc)
2554 struct inode * const inode = page->mapping->host;
2555 loff_t i_size = i_size_read(inode);
2556 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2561 /* Is the page fully inside i_size? */
2562 if (page->index < end_index)
2565 /* Is the page fully outside i_size? (truncate in progress) */
2566 offset = i_size & (PAGE_CACHE_SIZE-1);
2567 if (page->index >= end_index+1 || !offset) {
2569 * The page may have dirty, unmapped buffers. For example,
2570 * they may have been added in ext3_writepage(). Make them
2571 * freeable here, so the page does not leak.
2574 /* Not really sure about this - do we need this? */
2575 if (page->mapping->a_ops->invalidatepage)
2576 page->mapping->a_ops->invalidatepage(page, offset);
2579 return 0; /* don't care */
2583 * The page straddles i_size. It must be zeroed out on each and every
2584 * writepage invocation because it may be mmapped. "A file is mapped
2585 * in multiples of the page size. For a file that is not a multiple of
2586 * the page size, the remaining memory is zeroed when mapped, and
2587 * writes to that region are not written out to the file."
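 *
 * Worked example (added, illustrative only): with 4096-byte pages and
 * i_size == 10000, end_index == 2 and offset == 10000 & 4095 == 1808, so
 * for the page at index 2 the bytes 1808..4095 are zeroed below before the
 * page is written out.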
2589 kaddr = kmap_atomic(page, KM_USER0);
2590 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2591 flush_dcache_page(page);
2592 kunmap_atomic(kaddr, KM_USER0);
2594 ret = mpage_writepage(page, get_block, wbc);
2596 ret = __block_write_full_page(inode, page, get_block, wbc);
2599 EXPORT_SYMBOL(nobh_writepage);
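/*
 * Illustrative only: the matching ->writepage() wrapper in a "nobh"
 * filesystem (foofs_get_block is a hypothetical get_block_t):
 *
 *	static int foofs_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		return nobh_writepage(page, foofs_get_block, wbc);
 *	}
 */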
2602 * This function assumes that ->prepare_write() uses nobh_prepare_write().
2604 int nobh_truncate_page(struct address_space *mapping, loff_t from)
2606 struct inode *inode = mapping->host;
2607 unsigned blocksize = 1 << inode->i_blkbits;
2608 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2609 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2612 struct address_space_operations *a_ops = mapping->a_ops;
2616 if ((offset & (blocksize - 1)) == 0)
2620 page = grab_cache_page(mapping, index);
2624 to = (offset + blocksize) & ~(blocksize - 1);
2625 ret = a_ops->prepare_write(NULL, page, offset, to);
2627 kaddr = kmap_atomic(page, KM_USER0);
2628 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2629 flush_dcache_page(page);
2630 kunmap_atomic(kaddr, KM_USER0);
2631 set_page_dirty(page);
2634 page_cache_release(page);
2638 EXPORT_SYMBOL(nobh_truncate_page);
2640 int block_truncate_page(struct address_space *mapping,
2641 loff_t from, get_block_t *get_block)
2643 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2644 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2647 unsigned length, pos;
2648 struct inode *inode = mapping->host;
2650 struct buffer_head *bh;
2654 blocksize = 1 << inode->i_blkbits;
2655 length = offset & (blocksize - 1);
2657 /* Block boundary? Nothing to do */
2661 length = blocksize - length;
2662 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2664 page = grab_cache_page(mapping, index);
2669 if (!page_has_buffers(page))
2670 create_empty_buffers(page, blocksize, 0);
2672 /* Find the buffer that contains "offset" */
2673 bh = page_buffers(page);
2675 while (offset >= pos) {
2676 bh = bh->b_this_page;
2682 if (!buffer_mapped(bh)) {
2683 WARN_ON(bh->b_size != blocksize);
2684 err = get_block(inode, iblock, bh, 0);
2687 /* unmapped? It's a hole - nothing to do */
2688 if (!buffer_mapped(bh))
2692 /* Ok, it's mapped. Make sure it's up-to-date */
2693 if (PageUptodate(page))
2694 set_buffer_uptodate(bh);
2696 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2698 ll_rw_block(READ, 1, &bh);
2700 /* Uhhuh. Read error. Complain and punt. */
2701 if (!buffer_uptodate(bh))
2705 kaddr = kmap_atomic(page, KM_USER0);
2706 memset(kaddr + offset, 0, length);
2707 flush_dcache_page(page);
2708 kunmap_atomic(kaddr, KM_USER0);
2710 mark_buffer_dirty(bh);
2715 page_cache_release(page);
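/*
 * Illustrative only: a filesystem's truncate path usually zeroes the partial
 * block at the new end-of-file with one of the two helpers above, along
 * these lines (the NOBH mount-option test and foofs_get_block are
 * hypothetical):
 *
 *	if (test_opt(inode->i_sb, NOBH))
 *		nobh_truncate_page(inode->i_mapping, inode->i_size);
 *	else
 *		block_truncate_page(inode->i_mapping, inode->i_size,
 *				    foofs_get_block);
 */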
2721 * The generic ->writepage function for buffer-backed address_spaces
2723 int block_write_full_page(struct page *page, get_block_t *get_block,
2724 struct writeback_control *wbc)
2726 struct inode * const inode = page->mapping->host;
2727 loff_t i_size = i_size_read(inode);
2728 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2732 /* Is the page fully inside i_size? */
2733 if (page->index < end_index)
2734 return __block_write_full_page(inode, page, get_block, wbc);
2736 /* Is the page fully outside i_size? (truncate in progress) */
2737 offset = i_size & (PAGE_CACHE_SIZE-1);
2738 if (page->index >= end_index+1 || !offset) {
2740 * The page may have dirty, unmapped buffers. For example,
2741 * they may have been added in ext3_writepage(). Make them
2742 * freeable here, so the page does not leak.
2744 do_invalidatepage(page, 0);
2746 return 0; /* don't care */
2750 * The page straddles i_size. It must be zeroed out on each and every
2751 * writepage invocation because it may be mmapped. "A file is mapped
2752 * in multiples of the page size. For a file that is not a multiple of
2753 * the page size, the remaining memory is zeroed when mapped, and
2754 * writes to that region are not written out to the file."
2756 kaddr = kmap_atomic(page, KM_USER0);
2757 memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
2758 flush_dcache_page(page);
2759 kunmap_atomic(kaddr, KM_USER0);
2760 return __block_write_full_page(inode, page, get_block, wbc);
2763 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2764 get_block_t *get_block)
2766 struct buffer_head tmp;
2767 struct inode *inode = mapping->host;
2770 tmp.b_size = 1 << inode->i_blkbits;
2771 get_block(inode, block, &tmp, 0);
2772 return tmp.b_blocknr;
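/*
 * Illustrative only: a simple buffer-backed filesystem typically glues the
 * generic helpers in this file into its address_space_operations like this
 * (the foofs_* names are hypothetical; the name in parentheses is the
 * generic function each foofs_* wrapper passes its get_block_t to):
 *
 *	static sector_t foofs_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return generic_block_bmap(mapping, block, foofs_get_block);
 *	}
 *
 *	static struct address_space_operations foofs_aops = {
 *		.readpage	= foofs_readpage,	(block_read_full_page)
 *		.writepage	= foofs_writepage,	(block_write_full_page)
 *		.sync_page	= block_sync_page,
 *		.prepare_write	= foofs_prepare_write,	(block_prepare_write)
 *		.commit_write	= generic_commit_write,
 *		.bmap		= foofs_bmap,
 *	};
 */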
2775 static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
2777 struct buffer_head *bh = bio->bi_private;
2782 if (err == -EOPNOTSUPP) {
2783 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2784 set_bit(BH_Eopnotsupp, &bh->b_state);
2787 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2792 int submit_bh(int rw, struct buffer_head * bh)
2797 BUG_ON(!buffer_locked(bh));
2798 BUG_ON(!buffer_mapped(bh));
2799 BUG_ON(!bh->b_end_io);
2801 if (buffer_ordered(bh) && (rw == WRITE))
2805 * Only clear out a write error when rewriting, should this
2806 * include WRITE_SYNC as well?
2808 if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
2809 clear_buffer_write_io_error(bh);
2812 * from here on down, it's all bio -- do the initial mapping,
2813 * submit_bio -> generic_make_request may further map this bio around
2815 bio = bio_alloc(GFP_NOIO, 1);
2817 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2818 bio->bi_bdev = bh->b_bdev;
2819 bio->bi_io_vec[0].bv_page = bh->b_page;
2820 bio->bi_io_vec[0].bv_len = bh->b_size;
2821 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2825 bio->bi_size = bh->b_size;
2827 bio->bi_end_io = end_bio_bh_io_sync;
2828 bio->bi_private = bh;
2831 submit_bio(rw, bio);
2833 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2841 * ll_rw_block: low-level access to block devices (DEPRECATED)
2842 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
2843 * @nr: number of &struct buffer_heads in the array
2844 * @bhs: array of pointers to &struct buffer_head
2846 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2847 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2848 * option, %SWRITE, is like %WRITE except that it makes sure the *current*
2849 * data in the buffers is sent to disk. %READA is described in the documentation
2850 * for generic_make_request() which ll_rw_block() calls.
2852 * This function drops any buffer that it cannot get a lock on (with the
2853 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2854 * clean when doing a write request, and any buffer that appears to be
2855 * up-to-date when doing a read request. Further, it marks as clean buffers that
2856 * are processed for writing (the buffer cache won't assume that they are
2857 * actually clean until the buffer gets unlocked).
2859 * ll_rw_block sets b_end_io to a simple completion handler that marks
2860 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2863 * All of the buffers must be for the same device, and must also be a
2864 * multiple of the current approved size for the device.
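 *
 * Illustrative usage (not part of the original comment): the common pattern
 * for reading one metadata block synchronously, where bh is a mapped
 * buffer_head obtained elsewhere (for example from __getblk()):
 *
 *	ll_rw_block(READ, 1, &bh);
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		goto io_error;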
2866 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2870 for (i = 0; i < nr; i++) {
2871 struct buffer_head *bh = bhs[i];
2875 else if (test_set_buffer_locked(bh))
2878 if (rw == WRITE || rw == SWRITE) {
2879 if (test_clear_buffer_dirty(bh)) {
2880 bh->b_end_io = end_buffer_write_sync;
2882 submit_bh(WRITE, bh);
2886 if (!buffer_uptodate(bh)) {
2887 bh->b_end_io = end_buffer_read_sync;
2898 * For a data-integrity writeout, we need to wait upon any in-progress I/O
2899 * and then start new I/O and then wait upon it. The caller must have a ref on
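 * the buffer_head.
 *
 * Illustrative usage (not part of the original comment): a filesystem
 * typically forces an individual metadata buffer, such as its superblock,
 * to disk like this:
 *
 *	mark_buffer_dirty(bh);
 *	err = sync_dirty_buffer(bh);
 *	if (err)
 *		printk(KERN_ERR "superblock write failed\n");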
2902 int sync_dirty_buffer(struct buffer_head *bh)
2906 WARN_ON(atomic_read(&bh->b_count) < 1);
2908 if (test_clear_buffer_dirty(bh)) {
2910 bh->b_end_io = end_buffer_write_sync;
2911 ret = submit_bh(WRITE, bh);
2913 if (buffer_eopnotsupp(bh)) {
2914 clear_buffer_eopnotsupp(bh);
2917 if (!ret && !buffer_uptodate(bh))
2926 * try_to_free_buffers() checks if all the buffers on this particular page
2927 * are unused, and releases them if so.
2929 * Exclusion against try_to_free_buffers may be obtained by either
2930 * locking the page or by holding its mapping's private_lock.
2932 * If the page is dirty but all the buffers are clean then we need to
2933 * be sure to mark the page clean as well. This is because the page
2934 * may be against a block device, and a later reattachment of buffers
2935 * to a dirty page will set *all* buffers dirty, which would corrupt
2936 * filesystem data on the same device.
2938 * The same applies to regular filesystem pages: if all the buffers are
2939 * clean then we set the page clean and proceed. To do that, we require
2940 * total exclusion from __set_page_dirty_buffers(). That is obtained with
2943 * try_to_free_buffers() is non-blocking.
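 *
 * Illustrative only: most filesystems reach try_to_free_buffers() through the
 * default path in try_to_release_page(); one that needs its own checks first
 * would supply a ->releasepage() along these (hypothetical) lines:
 *
 *	static int foofs_releasepage(struct page *page, gfp_t gfp_mask)
 *	{
 *		if (foofs_page_is_pinned(page))
 *			return 0;
 *		return try_to_free_buffers(page);
 *	}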
2945 static inline int buffer_busy(struct buffer_head *bh)
2947 return atomic_read(&bh->b_count) |
2948 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
2952 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
2954 struct buffer_head *head = page_buffers(page);
2955 struct buffer_head *bh;
2959 if (buffer_write_io_error(bh) && page->mapping)
2960 set_bit(AS_EIO, &page->mapping->flags);
2961 if (buffer_busy(bh))
2963 bh = bh->b_this_page;
2964 } while (bh != head);
2967 struct buffer_head *next = bh->b_this_page;
2969 if (!list_empty(&bh->b_assoc_buffers))
2970 __remove_assoc_queue(bh);
2972 } while (bh != head);
2973 *buffers_to_free = head;
2974 __clear_page_buffers(page);
2980 int try_to_free_buffers(struct page *page)
2982 struct address_space * const mapping = page->mapping;
2983 struct buffer_head *buffers_to_free = NULL;
2986 BUG_ON(!PageLocked(page));
2987 if (PageWriteback(page))
2990 if (mapping == NULL) { /* can this still happen? */
2991 ret = drop_buffers(page, &buffers_to_free);
2995 spin_lock(&mapping->private_lock);
2996 ret = drop_buffers(page, &buffers_to_free);
2999 * If the filesystem writes its buffers by hand (eg ext3)
3000 * then we can have clean buffers against a dirty page. We
3001 * clean the page here; otherwise later reattachment of buffers
3002 * could encounter a non-uptodate page, which is unresolvable.
3003 * This only applies in the rare case where try_to_free_buffers
3004 * succeeds but the page is not freed.
3006 clear_page_dirty(page);
3008 spin_unlock(&mapping->private_lock);
3010 if (buffers_to_free) {
3011 struct buffer_head *bh = buffers_to_free;
3014 struct buffer_head *next = bh->b_this_page;
3015 free_buffer_head(bh);
3017 } while (bh != buffers_to_free);
3021 EXPORT_SYMBOL(try_to_free_buffers);
3023 void block_sync_page(struct page *page)
3025 struct address_space *mapping;
3028 mapping = page_mapping(page);
3030 blk_run_backing_dev(mapping->backing_dev_info, page);
3034 * There are no bdflush tunables left. But distributions are
3035 * still running obsolete flush daemons, so we terminate them here.
3037 * Use of bdflush() is deprecated and will be removed in a future kernel.
3038 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3040 asmlinkage long sys_bdflush(int func, long data)
3042 static int msg_count;
3044 if (!capable(CAP_SYS_ADMIN))
3047 if (msg_count < 5) {
3050 "warning: process `%s' used the obsolete bdflush"
3051 " system call\n", current->comm);
3052 printk(KERN_INFO "Fix your initscripts?\n");
3061 * Buffer-head allocation
3063 static kmem_cache_t *bh_cachep;
3066 * Once the number of bh's in the machine exceeds this level, we start
3067 * stripping them in writeback.
3069 static int max_buffer_heads;
3071 int buffer_heads_over_limit;
3073 struct bh_accounting {
3074 int nr; /* Number of live bh's */
3075 int ratelimit; /* Limit cacheline bouncing */
3078 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3080 static void recalc_bh_state(void)
3085 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3087 __get_cpu_var(bh_accounting).ratelimit = 0;
3088 for_each_online_cpu(i)
3089 tot += per_cpu(bh_accounting, i).nr;
3090 buffer_heads_over_limit = (tot > max_buffer_heads);
3093 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3095 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
3097 get_cpu_var(bh_accounting).nr++;
3099 put_cpu_var(bh_accounting);
3103 EXPORT_SYMBOL(alloc_buffer_head);
3105 void free_buffer_head(struct buffer_head *bh)
3107 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3108 kmem_cache_free(bh_cachep, bh);
3109 get_cpu_var(bh_accounting).nr--;
3111 put_cpu_var(bh_accounting);
3113 EXPORT_SYMBOL(free_buffer_head);
3116 init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags)
3118 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
3119 SLAB_CTOR_CONSTRUCTOR) {
3120 struct buffer_head * bh = (struct buffer_head *)data;
3122 memset(bh, 0, sizeof(*bh));
3123 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3127 #ifdef CONFIG_HOTPLUG_CPU
3128 static void buffer_exit_cpu(int cpu)
3131 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3136 for (i = 0; i < BH_LRU_SIZE; i++) {
3140 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3141 per_cpu(bh_accounting, cpu).nr = 0;
3142 put_cpu_var(bh_accounting);
3145 static int buffer_cpu_notify(struct notifier_block *self,
3146 unsigned long action, void *hcpu)
3148 if (action == CPU_DEAD)
3149 buffer_exit_cpu((unsigned long)hcpu);
3152 #endif /* CONFIG_HOTPLUG_CPU */
3154 void __init buffer_init(void)
3158 bh_cachep = kmem_cache_create("buffer_head",
3159 sizeof(struct buffer_head), 0,
3160 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3166 * Limit the bh occupancy to 10% of ZONE_NORMAL
3168 nrpages = (nr_free_buffer_pages() * 10) / 100;
3169 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
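/*
 * Illustrative numbers only: with 4096-byte pages and a buffer_head of
 * roughly a hundred bytes (the exact size is arch- and config-dependent),
 * a machine with 1,000,000 free low-memory pages gets nrpages = 100,000 and
 * max_buffer_heads on the order of four million.
 */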
3170 hotcpu_notifier(buffer_cpu_notify, 0);
3173 static int __init disable_buffer_lru(char *s)
3178 __setup("disable_buffer_lru", disable_buffer_lru);
3180 EXPORT_SYMBOL(__bforget);
3181 EXPORT_SYMBOL(__brelse);
3182 EXPORT_SYMBOL(__wait_on_buffer);
3183 EXPORT_SYMBOL(block_commit_write);
3184 EXPORT_SYMBOL(block_prepare_write);
3185 EXPORT_SYMBOL(block_read_full_page);
3186 EXPORT_SYMBOL(block_sync_page);
3187 EXPORT_SYMBOL(block_truncate_page);
3188 EXPORT_SYMBOL(block_write_full_page);
3189 EXPORT_SYMBOL(cont_prepare_write);
3190 EXPORT_SYMBOL(end_buffer_async_write);
3191 EXPORT_SYMBOL(end_buffer_read_sync);
3192 EXPORT_SYMBOL(end_buffer_write_sync);
3193 EXPORT_SYMBOL(file_fsync);
3194 EXPORT_SYMBOL(fsync_bdev);
3195 EXPORT_SYMBOL(generic_block_bmap);
3196 EXPORT_SYMBOL(generic_commit_write);
3197 EXPORT_SYMBOL(generic_cont_expand);
3198 EXPORT_SYMBOL(generic_cont_expand_simple);
3199 EXPORT_SYMBOL(init_buffer);
3200 EXPORT_SYMBOL(invalidate_bdev);
3201 EXPORT_SYMBOL(ll_rw_block);
3202 EXPORT_SYMBOL(mark_buffer_dirty);
3203 EXPORT_SYMBOL(submit_bh);
3204 EXPORT_SYMBOL(sync_dirty_buffer);
3205 EXPORT_SYMBOL(unlock_buffer);