fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bio.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/bitops.h>
  30 #include <trace/events/jbd2.h>
  31
  32 /*
  33  * Default IO end handler for temporary BJ_IO buffer_heads.
  34  */
  35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  36 {
  37         BUFFER_TRACE(bh, "");
  38         if (uptodate)
  39                 set_buffer_uptodate(bh);
  40         else
  41                 clear_buffer_uptodate(bh);
  42         unlock_buffer(bh);
  43 }
  44
  45 /*
  46  * When an ext4 file is truncated, it is possible that some pages are not
  47  * successfully freed, because they are attached to a committing transaction.
  48  * After the transaction commits, these pages are left on the LRU, with no
  49  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  50  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  51  * the numbers in /proc/meminfo look odd.
  52  *
  53  * So here, we have a buffer which has just come off the forget list.  Look to
  54  * see if we can strip all buffers from the backing page.
  55  *
  56  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  57  * caller provided us with a ref against the buffer, and we drop that here.
  58  */
  59 static void release_buffer_page(struct buffer_head *bh)
  60 {
  61         struct page *page;
  62
  63         if (buffer_dirty(bh))
  64                 goto nope;
  65         if (atomic_read(&bh->b_count) != 1)
  66                 goto nope;
  67         page = bh->b_page;
  68         if (!page)
  69                 goto nope;
  70         if (page->mapping)
  71                 goto nope;
  72
  73         /* OK, it's a truncated page */
  74         if (!trylock_page(page))
  75                 goto nope;
  76
  77         page_cache_get(page);
  78         __brelse(bh);
  79         try_to_free_buffers(page);
  80         unlock_page(page);
  81         page_cache_release(page);
  82         return;
  83
  84 nope:
  85         __brelse(bh);
  86 }
  87
  88 /*
  89  * Done it all: now submit the commit record.  We should have
  90  * cleaned up our previous buffers by now, so if we are in abort
  91  * mode we can now just skip the rest of the journal write
  92  * entirely.
  93  *
  94  * Returns 1 if the journal needs to be aborted or 0 on success
  95  */
  96 static int journal_submit_commit_record(journal_t *journal,
  97                                         transaction_t *commit_transaction,
  98                                         struct buffer_head **cbh,
  99                                         __u32 crc32_sum)
 100 {
 101         struct journal_head *descriptor;
 102         struct commit_header *tmp;
 103         struct buffer_head *bh;
 104         int ret;
 105         struct timespec now = current_kernel_time();
 106
 107         *cbh = NULL;
 108
 109         if (is_journal_aborted(journal))
 110                 return 0;
 111
 112         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 113         if (!descriptor)
 114                 return 1;
 115
 116         bh = jh2bh(descriptor);
 117
 118         tmp = (struct commit_header *)bh->b_data;
 119         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 120         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 121         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 122         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 123         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 124
 125         if (JBD2_HAS_COMPAT_FEATURE(journal,
 126                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 127                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 128                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 129                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 130         }
 131
 132         JBUFFER_TRACE(descriptor, "submit commit block");
 133         lock_buffer(bh);
 134         clear_buffer_dirty(bh);
 135         set_buffer_uptodate(bh);
 136         bh->b_end_io = journal_end_buffer_io_sync;
 137
 138         if (journal->j_flags & JBD2_BARRIER &&
 139             !JBD2_HAS_INCOMPAT_FEATURE(journal,
 140                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 141                 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 142         else
 143                 ret = submit_bh(WRITE_SYNC, bh);
 144
 145         *cbh = bh;
 146         return ret;
 147 }
 148
 149 /*
 150  * This function along with journal_submit_commit_record
  151  * allows the commit record to be written asynchronously.
 152  */
 153 static int journal_wait_on_commit_record(journal_t *journal,
 154                                          struct buffer_head *bh)
 155 {
 156         int ret = 0;
 157
 158         clear_buffer_dirty(bh);
 159         wait_on_buffer(bh);
 160
 161         if (unlikely(!buffer_uptodate(bh)))
 162                 ret = -EIO;
 163         put_bh(bh);            /* One for getblk() */
 164         jbd2_journal_put_journal_head(bh2jh(bh));
 165
 166         return ret;
 167 }
 168
 169 /*
 170  * write the filemap data using writepage() address_space_operations.
 171  * We don't do block allocation here even for delalloc. We don't
  172  * use writepages() because with delayed allocation we may be doing
 173  * block allocation in writepages().
 174  */
 175 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 176 {
 177         int ret;
 178         struct writeback_control wbc = {
 179                 .sync_mode =  WB_SYNC_ALL,
 180                 .nr_to_write = mapping->nrpages * 2,
 181                 .range_start = 0,
 182                 .range_end = i_size_read(mapping->host),
 183         };
 184
 185         ret = generic_writepages(mapping, &wbc);
 186         return ret;
 187 }
 188
 189 /*
 190  * Submit all the data buffers of inode associated with the transaction to
 191  * disk.
 192  *
 193  * We are in a committing transaction. Therefore no new inode can be added to
 194  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 195  * operate on from being released while we write out pages.
 196  */
 197 static int journal_submit_data_buffers(journal_t *journal,
 198                 transaction_t *commit_transaction)
 199 {
 200         struct jbd2_inode *jinode;
 201         int err, ret = 0;
 202         struct address_space *mapping;
 203
 204         spin_lock(&journal->j_list_lock);
 205         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 206                 mapping = jinode->i_vfs_inode->i_mapping;
 207                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 208                 spin_unlock(&journal->j_list_lock);
 209                 /*
 210                  * submit the inode data buffers. We use writepage
 211                  * instead of writepages. Because writepages can do
 212                  * block allocation  with delalloc. We need to write
 213                  * only allocated blocks here.
 214                  */
 215                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 216                 err = journal_submit_inode_data_buffers(mapping);
 217                 if (!ret)
 218                         ret = err;
 219                 spin_lock(&journal->j_list_lock);
 220                 J_ASSERT(jinode->i_transaction == commit_transaction);
 221                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 222                 smp_mb__after_clear_bit();
 223                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 224         }
 225         spin_unlock(&journal->j_list_lock);
 226         return ret;
 227 }
 228
 229 /*
 230  * Wait for data submitted for writeout, refile inodes to proper
 231  * transaction if needed.
 232  *
 233  */
 234 static int journal_finish_inode_data_buffers(journal_t *journal,
 235                 transaction_t *commit_transaction)
 236 {
 237         struct jbd2_inode *jinode, *next_i;
 238         int err, ret = 0;
 239
 240         /* For locking, see the comment in journal_submit_data_buffers() */
 241         spin_lock(&journal->j_list_lock);
 242         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 243                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 244                 spin_unlock(&journal->j_list_lock);
 245                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 246                 if (err) {
 247                         /*
 248                          * Because AS_EIO is cleared by
 249                          * filemap_fdatawait_range(), set it again so
 250                          * that user process can get -EIO from fsync().
 251                          */
 252                         set_bit(AS_EIO,
 253                                 &jinode->i_vfs_inode->i_mapping->flags);
 254
 255                         if (!ret)
 256                                 ret = err;
 257                 }
 258                 spin_lock(&journal->j_list_lock);
 259                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 260                 smp_mb__after_clear_bit();
 261                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 262         }
 263
 264         /* Now refile inode to proper lists */
 265         list_for_each_entry_safe(jinode, next_i,
 266                                  &commit_transaction->t_inode_list, i_list) {
 267                 list_del(&jinode->i_list);
 268                 if (jinode->i_next_transaction) {
 269                         jinode->i_transaction = jinode->i_next_transaction;
 270                         jinode->i_next_transaction = NULL;
 271                         list_add(&jinode->i_list,
 272                                 &jinode->i_transaction->t_inode_list);
 273                 } else {
 274                         jinode->i_transaction = NULL;
 275                 }
 276         }
 277         spin_unlock(&journal->j_list_lock);
 278
 279         return ret;
 280 }
 281
 282 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 283 {
 284         struct page *page = bh->b_page;
 285         char *addr;
 286         __u32 checksum;
 287
 288         addr = kmap_atomic(page);
 289         checksum = crc32_be(crc32_sum,
 290                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 291         kunmap_atomic(addr);
 292
 293         return checksum;
 294 }
 295
 296 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 297                                    unsigned long long block)
 298 {
 299         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 300         if (tag_bytes > JBD2_TAG_SIZE32)
 301                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 302 }
 303
 304 /*
 305  * jbd2_journal_commit_transaction
 306  *
 307  * The primary function for committing a transaction to the log.  This
 308  * function is called by the journal thread to begin a complete commit.
 309  */
 310 void jbd2_journal_commit_transaction(journal_t *journal)
 311 {
 312         struct transaction_stats_s stats;
 313         transaction_t *commit_transaction;
 314         struct journal_head *jh, *new_jh, *descriptor;
 315         struct buffer_head **wbuf = journal->j_wbuf;
 316         int bufs;
 317         int flags;
 318         int err;
 319         unsigned long long blocknr;
 320         ktime_t start_time;
 321         u64 commit_time;
 322         char *tagp = NULL;
 323         journal_header_t *header;
 324         journal_block_tag_t *tag = NULL;
 325         int space_left = 0;
 326         int first_tag = 0;
 327         int tag_flag;
 328         int i, to_free = 0;
 329         int tag_bytes = journal_tag_bytes(journal);
 330         struct buffer_head *cbh = NULL; /* For transactional checksums */
 331         __u32 crc32_sum = ~0;
 332         struct blk_plug plug;
 333         /* Tail of the journal */
 334         unsigned long first_block;
 335         tid_t first_tid;
 336         int update_tail;
 337
 338         /*
 339          * First job: lock down the current transaction and wait for
 340          * all outstanding updates to complete.
 341          */
 342
 343         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 344         if (journal->j_flags & JBD2_FLUSHED) {
 345                 jbd_debug(3, "super block updated\n");
 346                 mutex_lock(&journal->j_checkpoint_mutex);
 347                 /*
 348                  * We hold j_checkpoint_mutex so tail cannot change under us.
 349                  * We don't need any special data guarantees for writing sb
 350                  * since journal is empty and it is ok for write to be
 351                  * flushed only with transaction commit.
 352                  */
 353                 jbd2_journal_update_sb_log_tail(journal,
 354                                                 journal->j_tail_sequence,
 355                                                 journal->j_tail,
 356                                                 WRITE_SYNC);
 357                 mutex_unlock(&journal->j_checkpoint_mutex);
 358         } else {
 359                 jbd_debug(3, "superblock not updated\n");
 360         }
 361
 362         J_ASSERT(journal->j_running_transaction != NULL);
 363         J_ASSERT(journal->j_committing_transaction == NULL);
 364
 365         commit_transaction = journal->j_running_transaction;
 366         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 367
 368         trace_jbd2_start_commit(journal, commit_transaction);
 369         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 370                         commit_transaction->t_tid);
 371
 372         write_lock(&journal->j_state_lock);
 373         commit_transaction->t_state = T_LOCKED;
 374
 375         trace_jbd2_commit_locking(journal, commit_transaction);
 376         stats.run.rs_wait = commit_transaction->t_max_wait;
 377         stats.run.rs_locked = jiffies;
 378         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 379                                               stats.run.rs_locked);
 380
 381         spin_lock(&commit_transaction->t_handle_lock);
 382         while (atomic_read(&commit_transaction->t_updates)) {
 383                 DEFINE_WAIT(wait);
 384
 385                 prepare_to_wait(&journal->j_wait_updates, &wait,
 386                                         TASK_UNINTERRUPTIBLE);
 387                 if (atomic_read(&commit_transaction->t_updates)) {
 388                         spin_unlock(&commit_transaction->t_handle_lock);
 389                         write_unlock(&journal->j_state_lock);
 390                         schedule();
 391                         write_lock(&journal->j_state_lock);
 392                         spin_lock(&commit_transaction->t_handle_lock);
 393                 }
 394                 finish_wait(&journal->j_wait_updates, &wait);
 395         }
 396         spin_unlock(&commit_transaction->t_handle_lock);
 397
 398         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 399                         journal->j_max_transaction_buffers);
 400
 401         /*
 402          * First thing we are allowed to do is to discard any remaining
 403          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 404          * that there are no such buffers: if a large filesystem
 405          * operation like a truncate needs to split itself over multiple
 406          * transactions, then it may try to do a jbd2_journal_restart() while
 407          * there are still BJ_Reserved buffers outstanding.  These must
 408          * be released cleanly from the current transaction.
 409          *
 410          * In this case, the filesystem must still reserve write access
 411          * again before modifying the buffer in the new transaction, but
 412          * we do not require it to remember exactly which old buffers it
 413          * has reserved.  This is consistent with the existing behaviour
 414          * that multiple jbd2_journal_get_write_access() calls to the same
 415          * buffer are perfectly permissible.
 416          */
 417         while (commit_transaction->t_reserved_list) {
 418                 jh = commit_transaction->t_reserved_list;
 419                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 420                 /*
 421                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 422                  * leave undo-committed data.
 423                  */
 424                 if (jh->b_committed_data) {
 425                         struct buffer_head *bh = jh2bh(jh);
 426
 427                         jbd_lock_bh_state(bh);
 428                         jbd2_free(jh->b_committed_data, bh->b_size);
 429                         jh->b_committed_data = NULL;
 430                         jbd_unlock_bh_state(bh);
 431                 }
 432                 jbd2_journal_refile_buffer(journal, jh);
 433         }
 434
 435         /*
 436          * Now try to drop any written-back buffers from the journal's
 437          * checkpoint lists.  We do this *before* commit because it potentially
 438          * frees some memory
 439          */
 440         spin_lock(&journal->j_list_lock);
 441         __jbd2_journal_clean_checkpoint_list(journal);
 442         spin_unlock(&journal->j_list_lock);
 443
 444         jbd_debug(3, "JBD2: commit phase 1\n");
 445
 446         /*
 447          * Clear revoked flag to reflect there is no revoked buffers
 448          * in the next transaction which is going to be started.
 449          */
 450         jbd2_clear_buffer_revoked_flags(journal);
 451
 452         /*
 453          * Switch to a new revoke table.
 454          */
 455         jbd2_journal_switch_revoke_table(journal);
 456
 457         trace_jbd2_commit_flushing(journal, commit_transaction);
 458         stats.run.rs_flushing = jiffies;
 459         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 460                                              stats.run.rs_flushing);
 461
 462         commit_transaction->t_state = T_FLUSH;
 463         journal->j_committing_transaction = commit_transaction;
 464         journal->j_running_transaction = NULL;
 465         start_time = ktime_get();
 466         commit_transaction->t_log_start = journal->j_head;
 467         wake_up(&journal->j_wait_transaction_locked);
 468         write_unlock(&journal->j_state_lock);
 469
 470         jbd_debug(3, "JBD2: commit phase 2\n");
 471
 472         /*
 473          * Now start flushing things to disk, in the order they appear
 474          * on the transaction lists.  Data blocks go first.
 475          */
 476         err = journal_submit_data_buffers(journal, commit_transaction);
 477         if (err)
 478                 jbd2_journal_abort(journal, err);
 479
 480         blk_start_plug(&plug);
 481         jbd2_journal_write_revoke_records(journal, commit_transaction,
 482                                           WRITE_SYNC);
 483         blk_finish_plug(&plug);
 484
 485         jbd_debug(3, "JBD2: commit phase 2\n");
 486
 487         /*
 488          * Way to go: we have now written out all of the data for a
 489          * transaction!  Now comes the tricky part: we need to write out
 490          * metadata.  Loop over the transaction's entire buffer list:
 491          */
 492         write_lock(&journal->j_state_lock);
 493         commit_transaction->t_state = T_COMMIT;
 494         write_unlock(&journal->j_state_lock);
 495
 496         trace_jbd2_commit_logging(journal, commit_transaction);
 497         stats.run.rs_logging = jiffies;
 498         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 499                                                stats.run.rs_logging);
 500         stats.run.rs_blocks =
 501                 atomic_read(&commit_transaction->t_outstanding_credits);
 502         stats.run.rs_blocks_logged = 0;
 503
 504         J_ASSERT(commit_transaction->t_nr_buffers <=
 505                  atomic_read(&commit_transaction->t_outstanding_credits));
 506
 507         err = 0;
 508         descriptor = NULL;
 509         bufs = 0;
 510         blk_start_plug(&plug);
 511         while (commit_transaction->t_buffers) {
 512
 513                 /* Find the next buffer to be journaled... */
 514
 515                 jh = commit_transaction->t_buffers;
 516
 517                 /* If we're in abort mode, we just un-journal the buffer and
 518                    release it. */
 519
 520                 if (is_journal_aborted(journal)) {
 521                         clear_buffer_jbddirty(jh2bh(jh));
 522                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 523                         jbd2_buffer_abort_trigger(jh,
 524                                                   jh->b_frozen_data ?
 525                                                   jh->b_frozen_triggers :
 526                                                   jh->b_triggers);
 527                         jbd2_journal_refile_buffer(journal, jh);
 528                         /* If that was the last one, we need to clean up
 529                          * any descriptor buffers which may have been
 530                          * already allocated, even if we are now
 531                          * aborting. */
 532                         if (!commit_transaction->t_buffers)
 533                                 goto start_journal_io;
 534                         continue;
 535                 }
 536
 537                 /* Make sure we have a descriptor block in which to
 538                    record the metadata buffer. */
 539
 540                 if (!descriptor) {
 541                         struct buffer_head *bh;
 542
 543                         J_ASSERT (bufs == 0);
 544
 545                         jbd_debug(4, "JBD2: get descriptor\n");
 546
 547                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 548                         if (!descriptor) {
 549                                 jbd2_journal_abort(journal, -EIO);
 550                                 continue;
 551                         }
 552
 553                         bh = jh2bh(descriptor);
 554                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 555                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 556                         header = (journal_header_t *)&bh->b_data[0];
 557                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 558                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 559                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 560
 561                         tagp = &bh->b_data[sizeof(journal_header_t)];
 562                         space_left = bh->b_size - sizeof(journal_header_t);
 563                         first_tag = 1;
 564                         set_buffer_jwrite(bh);
 565                         set_buffer_dirty(bh);
 566                         wbuf[bufs++] = bh;
 567
 568                         /* Record it so that we can wait for IO
 569                            completion later */
 570                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 571                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 572                                         BJ_LogCtl);
 573                 }
 574
 575                 /* Where is the buffer to be written? */
 576
 577                 err = jbd2_journal_next_log_block(journal, &blocknr);
 578                 /* If the block mapping failed, just abandon the buffer
 579                    and repeat this loop: we'll fall into the
 580                    refile-on-abort condition above. */
 581                 if (err) {
 582                         jbd2_journal_abort(journal, err);
 583                         continue;
 584                 }
 585
 586                 /*
 587                  * start_this_handle() uses t_outstanding_credits to determine
 588                  * the free space in the log, but this counter is changed
 589                  * by jbd2_journal_next_log_block() also.
 590                  */
 591                 atomic_dec(&commit_transaction->t_outstanding_credits);
 592
 593                 /* Bump b_count to prevent truncate from stumbling over
 594                    the shadowed buffer!  @@@ This can go if we ever get
 595                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 596                 atomic_inc(&jh2bh(jh)->b_count);
 597
 598                 /* Make a temporary IO buffer with which to write it out
 599                    (this will requeue both the metadata buffer and the
 600                    temporary IO buffer). new_bh goes on BJ_IO*/
 601
 602                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 603                 /*
 604                  * akpm: jbd2_journal_write_metadata_buffer() sets
 605                  * new_bh->b_transaction to commit_transaction.
 606                  * We need to clean this up before we release new_bh
 607                  * (which is of type BJ_IO)
 608                  */
 609                 JBUFFER_TRACE(jh, "ph3: write metadata");
 610                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 611                                                       jh, &new_jh, blocknr);
 612                 if (flags < 0) {
 613                         jbd2_journal_abort(journal, flags);
 614                         continue;
 615                 }
 616                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 617                 wbuf[bufs++] = jh2bh(new_jh);
 618
 619                 /* Record the new block's tag in the current descriptor
 620                    buffer */
 621
 622                 tag_flag = 0;
 623                 if (flags & 1)
 624                         tag_flag |= JBD2_FLAG_ESCAPE;
 625                 if (!first_tag)
 626                         tag_flag |= JBD2_FLAG_SAME_UUID;
 627
 628                 tag = (journal_block_tag_t *) tagp;
 629                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 630                 tag->t_flags = cpu_to_be32(tag_flag);
 631                 tagp += tag_bytes;
 632                 space_left -= tag_bytes;
 633
 634                 if (first_tag) {
 635                         memcpy (tagp, journal->j_uuid, 16);
 636                         tagp += 16;
 637                         space_left -= 16;
 638                         first_tag = 0;
 639                 }
 640
 641                 /* If there's no more to do, or if the descriptor is full,
 642                    let the IO rip! */
 643
 644                 if (bufs == journal->j_wbufsize ||
 645                     commit_transaction->t_buffers == NULL ||
 646                     space_left < tag_bytes + 16) {
 647
 648                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 649
 650                         /* Write an end-of-descriptor marker before
 651                            submitting the IOs.  "tag" still points to
 652                            the last tag we set up. */
 653
 654                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
 655
 656 start_journal_io:
 657                         for (i = 0; i < bufs; i++) {
 658                                 struct buffer_head *bh = wbuf[i];
 659                                 /*
 660                                  * Compute checksum.
 661                                  */
 662                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 663                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 664                                         crc32_sum =
 665                                             jbd2_checksum_data(crc32_sum, bh);
 666                                 }
 667
 668                                 lock_buffer(bh);
 669                                 clear_buffer_dirty(bh);
 670                                 set_buffer_uptodate(bh);
 671                                 bh->b_end_io = journal_end_buffer_io_sync;
 672                                 submit_bh(WRITE_SYNC, bh);
 673                         }
 674                         cond_resched();
 675                         stats.run.rs_blocks_logged += bufs;
 676
 677                         /* Force a new descriptor to be generated next
 678                            time round the loop. */
 679                         descriptor = NULL;
 680                         bufs = 0;
 681                 }
 682         }
 683
 684         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 685         if (err) {
 686                 printk(KERN_WARNING
 687                         "JBD2: Detected IO errors while flushing file data "
 688                        "on %s\n", journal->j_devname);
 689                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 690                         jbd2_journal_abort(journal, err);
 691                 err = 0;
 692         }
 693
 694         /*
 695          * Get current oldest transaction in the log before we issue flush
 696          * to the filesystem device. After the flush we can be sure that
 697          * blocks of all older transactions are checkpointed to persistent
 698          * storage and we will be safe to update journal start in the
 699          * superblock with the numbers we get here.
 700          */
 701         update_tail =
 702                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
 703
 704         write_lock(&journal->j_state_lock);
 705         if (update_tail) {
 706                 long freed = first_block - journal->j_tail;
 707
 708                 if (first_block < journal->j_tail)
 709                         freed += journal->j_last - journal->j_first;
 710                 /* Update tail only if we free significant amount of space */
 711                 if (freed < journal->j_maxlen / 4)
 712                         update_tail = 0;
 713         }
 714         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 715         commit_transaction->t_state = T_COMMIT_DFLUSH;
 716         write_unlock(&journal->j_state_lock);
 717
 718         /*
 719          * If the journal is not located on the file system device,
 720          * then we must flush the file system device before we issue
 721          * the commit record
 722          */
 723         if (commit_transaction->t_need_data_flush &&
 724             (journal->j_fs_dev != journal->j_dev) &&
 725             (journal->j_flags & JBD2_BARRIER))
 726                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 727
 728         /* Done it all: now write the commit record asynchronously. */
 729         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 730                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 731                 err = journal_submit_commit_record(journal, commit_transaction,
 732                                                  &cbh, crc32_sum);
 733                 if (err)
 734                         __jbd2_journal_abort_hard(journal);
 735         }
 736
 737         blk_finish_plug(&plug);
 738
 739         /* Lo and behold: we have just managed to send a transaction to
 740            the log.  Before we can commit it, wait for the IO so far to
 741            complete.  Control buffers being written are on the
 742            transaction's t_log_list queue, and metadata buffers are on
 743            the t_iobuf_list queue.
 744
 745            Wait for the buffers in reverse order.  That way we are
 746            less likely to be woken up until all IOs have completed, and
 747            so we incur less scheduling load.
 748         */
 749
 750         jbd_debug(3, "JBD2: commit phase 3\n");
 751
 752         /*
 753          * akpm: these are BJ_IO, and j_list_lock is not needed.
 754          * See __journal_try_to_free_buffer.
 755          */
 756 wait_for_iobuf:
 757         while (commit_transaction->t_iobuf_list != NULL) {
 758                 struct buffer_head *bh;
 759
 760                 jh = commit_transaction->t_iobuf_list->b_tprev;
 761                 bh = jh2bh(jh);
 762                 if (buffer_locked(bh)) {
 763                         wait_on_buffer(bh);
 764                         goto wait_for_iobuf;
 765                 }
 766                 if (cond_resched())
 767                         goto wait_for_iobuf;
 768
 769                 if (unlikely(!buffer_uptodate(bh)))
 770                         err = -EIO;
 771
 772                 clear_buffer_jwrite(bh);
 773
 774                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 775                 jbd2_journal_unfile_buffer(journal, jh);
 776
 777                 /*
 778                  * ->t_iobuf_list should contain only dummy buffer_heads
 779                  * which were created by jbd2_journal_write_metadata_buffer().
 780                  */
 781                 BUFFER_TRACE(bh, "dumping temporary bh");
 782                 jbd2_journal_put_journal_head(jh);
 783                 __brelse(bh);
 784                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 785                 free_buffer_head(bh);
 786
 787                 /* We also have to unlock and free the corresponding
 788                    shadowed buffer */
 789                 jh = commit_transaction->t_shadow_list->b_tprev;
 790                 bh = jh2bh(jh);
 791                 clear_bit(BH_JWrite, &bh->b_state);
 792                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 793
 794                 /* The metadata is now released for reuse, but we need
 795                    to remember it against this transaction so that when
 796                    we finally commit, we can do any checkpointing
 797                    required. */
 798                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 799                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 800                 /*
 801                  * Wake up any transactions which were waiting for this IO to
 802                  * complete. The barrier must be here so that changes by
 803                  * jbd2_journal_file_buffer() take effect before wake_up_bit()
 804                  * does the waitqueue check.
 805                  */
 806                 smp_mb();
 807                 wake_up_bit(&bh->b_state, BH_Unshadow);
 808                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 809                 __brelse(bh);
 810         }
 811
 812         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 813
 814         jbd_debug(3, "JBD2: commit phase 4\n");
 815
 816         /* Here we wait for the revoke record and descriptor record buffers */
 817  wait_for_ctlbuf:
 818         while (commit_transaction->t_log_list != NULL) {
 819                 struct buffer_head *bh;
 820
 821                 jh = commit_transaction->t_log_list->b_tprev;
 822                 bh = jh2bh(jh);
 823                 if (buffer_locked(bh)) {
 824                         wait_on_buffer(bh);
 825                         goto wait_for_ctlbuf;
 826                 }
 827                 if (cond_resched())
 828                         goto wait_for_ctlbuf;
 829
 830                 if (unlikely(!buffer_uptodate(bh)))
 831                         err = -EIO;
 832
 833                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 834                 clear_buffer_jwrite(bh);
 835                 jbd2_journal_unfile_buffer(journal, jh);
 836                 jbd2_journal_put_journal_head(jh);
 837                 __brelse(bh);           /* One for getblk */
 838                 /* AKPM: bforget here */
 839         }
 840
 841         if (err)
 842                 jbd2_journal_abort(journal, err);
 843
 844         jbd_debug(3, "JBD2: commit phase 5\n");
 845         write_lock(&journal->j_state_lock);
 846         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 847         commit_transaction->t_state = T_COMMIT_JFLUSH;
 848         write_unlock(&journal->j_state_lock);
 849
 850         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 851                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 852                 err = journal_submit_commit_record(journal, commit_transaction,
 853                                                 &cbh, crc32_sum);
 854                 if (err)
 855                         __jbd2_journal_abort_hard(journal);
 856         }
 857         if (cbh)
 858                 err = journal_wait_on_commit_record(journal, cbh);
 859         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 860                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 861             journal->j_flags & JBD2_BARRIER) {
 862                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 863         }
 864
 865         if (err)
 866                 jbd2_journal_abort(journal, err);
 867
 868         /*
 869          * Now disk caches for filesystem device are flushed so we are safe to
 870          * erase checkpointed transactions from the log by updating journal
 871          * superblock.
 872          */
 873         if (update_tail)
 874                 jbd2_update_log_tail(journal, first_tid, first_block);
 875
 876         /* End of a transaction!  Finally, we can do checkpoint
 877            processing: any buffers committed as a result of this
 878            transaction can be removed from any checkpoint list it was on
 879            before. */
 880
 881         jbd_debug(3, "JBD2: commit phase 6\n");
 882
 883         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 884         J_ASSERT(commit_transaction->t_buffers == NULL);
 885         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 886         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 887         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 888         J_ASSERT(commit_transaction->t_log_list == NULL);
 889
 890 restart_loop:
 891         /*
 892          * As there are other places (journal_unmap_buffer()) adding buffers
 893          * to this list we have to be careful and hold the j_list_lock.
 894          */
 895         spin_lock(&journal->j_list_lock);
 896         while (commit_transaction->t_forget) {
 897                 transaction_t *cp_transaction;
 898                 struct buffer_head *bh;
 899                 int try_to_free = 0;
 900
 901                 jh = commit_transaction->t_forget;
 902                 spin_unlock(&journal->j_list_lock);
 903                 bh = jh2bh(jh);
 904                 /*
 905                  * Get a reference so that bh cannot be freed before we are
 906                  * done with it.
 907                  */
 908                 get_bh(bh);
 909                 jbd_lock_bh_state(bh);
 910                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 911
 912                 /*
 913                  * If there is undo-protected committed data against
 914                  * this buffer, then we can remove it now.  If it is a
 915                  * buffer needing such protection, the old frozen_data
 916                  * field now points to a committed version of the
 917                  * buffer, so rotate that field to the new committed
 918                  * data.
 919                  *
 920                  * Otherwise, we can just throw away the frozen data now.
 921                  *
 922                  * We also know that the frozen data has already fired
 923                  * its triggers if they exist, so we can clear that too.
 924                  */
 925                 if (jh->b_committed_data) {
 926                         jbd2_free(jh->b_committed_data, bh->b_size);
 927                         jh->b_committed_data = NULL;
 928                         if (jh->b_frozen_data) {
 929                                 jh->b_committed_data = jh->b_frozen_data;
 930                                 jh->b_frozen_data = NULL;
 931                                 jh->b_frozen_triggers = NULL;
 932                         }
 933                 } else if (jh->b_frozen_data) {
 934                         jbd2_free(jh->b_frozen_data, bh->b_size);
 935                         jh->b_frozen_data = NULL;
 936                         jh->b_frozen_triggers = NULL;
 937                 }
 938
 939                 spin_lock(&journal->j_list_lock);
 940                 cp_transaction = jh->b_cp_transaction;
 941                 if (cp_transaction) {
 942                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 943                         cp_transaction->t_chp_stats.cs_dropped++;
 944                         __jbd2_journal_remove_checkpoint(jh);
 945                 }
 946
 947                 /* Only re-checkpoint the buffer_head if it is marked
 948                  * dirty.  If the buffer was added to the BJ_Forget list
 949                  * by jbd2_journal_forget, it may no longer be dirty and
 950                  * there's no point in keeping a checkpoint record for
 951                  * it. */
 952
 953                 /* A buffer which has been freed while still being
 954                  * journaled by a previous transaction may end up still
 955                  * being dirty here, but we want to avoid writing back
 956                  * that buffer in the future after the "add to orphan"
 957                  * operation has been committed.  That's not only a performance
 958                  * gain, it also stops aliasing problems if the buffer is
 959                  * left behind for writeback and gets reallocated for another
 960                  * use in a different page. */
 961                 if (buffer_freed(bh) && !jh->b_next_transaction) {
 962                         clear_buffer_freed(bh);
 963                         clear_buffer_jbddirty(bh);
 964                 }
 965
 966                 if (buffer_jbddirty(bh)) {
 967                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 968                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 969                         if (is_journal_aborted(journal))
 970                                 clear_buffer_jbddirty(bh);
 971                 } else {
 972                         J_ASSERT_BH(bh, !buffer_dirty(bh));
 973                         /*
 974                          * The buffer on BJ_Forget list and not jbddirty means
 975                          * it has been freed by this transaction and hence it
 976                          * could not have been reallocated until this
 977                          * transaction has committed. *BUT* it could be
 978                          * reallocated once we have written all the data to
 979                          * disk and before we process the buffer on BJ_Forget
 980                          * list.
 981                          */
 982                         if (!jh->b_next_transaction)
 983                                 try_to_free = 1;
 984                 }
 985                 JBUFFER_TRACE(jh, "refile or unfile buffer");
 986                 __jbd2_journal_refile_buffer(jh);
 987                 jbd_unlock_bh_state(bh);
 988                 if (try_to_free)
 989                         release_buffer_page(bh);        /* Drops bh reference */
 990                 else
 991                         __brelse(bh);
 992                 cond_resched_lock(&journal->j_list_lock);
 993         }
 994         spin_unlock(&journal->j_list_lock);
 995         /*
 996          * This is a bit sleazy.  We use j_list_lock to protect transition
 997          * of a transaction into T_FINISHED state and calling
 998          * __jbd2_journal_drop_transaction(). Otherwise we could race with
 999          * other checkpointing code processing the transaction...
1000          */
1001         write_lock(&journal->j_state_lock);
1002         spin_lock(&journal->j_list_lock);
1003         /*
1004          * Now recheck if some buffers did not get attached to the transaction
1005          * while the lock was dropped...
1006          */
1007         if (commit_transaction->t_forget) {
1008                 spin_unlock(&journal->j_list_lock);
1009                 write_unlock(&journal->j_state_lock);
1010                 goto restart_loop;
1011         }
1012
1013         /* Done with this transaction! */
1014
1015         jbd_debug(3, "JBD2: commit phase 7\n");
1016
1017         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1018
1019         commit_transaction->t_start = jiffies;
1020         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1021                                               commit_transaction->t_start);
1022
1023         /*
1024          * File the transaction statistics
1025          */
1026         stats.ts_tid = commit_transaction->t_tid;
1027         stats.run.rs_handle_count =
1028                 atomic_read(&commit_transaction->t_handle_count);
1029         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1030                              commit_transaction->t_tid, &stats.run);
1031
1032         /*
1033          * Calculate overall stats
1034          */
1035         spin_lock(&journal->j_history_lock);
1036         journal->j_stats.ts_tid++;
1037         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1038         journal->j_stats.run.rs_running += stats.run.rs_running;
1039         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1040         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1041         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1042         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1043         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1044         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1045         spin_unlock(&journal->j_history_lock);
1046
1047         commit_transaction->t_state = T_FINISHED;
1048         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1049         journal->j_commit_sequence = commit_transaction->t_tid;
1050         journal->j_committing_transaction = NULL;
1051         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1052
1053         /*
1054          * weight the commit time higher than the average time so we don't
1055          * react too strongly to vast changes in the commit time
1056          */
1057         if (likely(journal->j_average_commit_time))
1058                 journal->j_average_commit_time = (commit_time +
1059                                 journal->j_average_commit_time*3) / 4;
1060         else
1061                 journal->j_average_commit_time = commit_time;
1062         write_unlock(&journal->j_state_lock);
1063
1064         if (commit_transaction->t_checkpoint_list == NULL &&
1065             commit_transaction->t_checkpoint_io_list == NULL) {
1066                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1067                 to_free = 1;
1068         } else {
1069                 if (journal->j_checkpoint_transactions == NULL) {
1070                         journal->j_checkpoint_transactions = commit_transaction;
1071                         commit_transaction->t_cpnext = commit_transaction;
1072                         commit_transaction->t_cpprev = commit_transaction;
1073                 } else {
1074                         commit_transaction->t_cpnext =
1075                                 journal->j_checkpoint_transactions;
1076                         commit_transaction->t_cpprev =
1077                                 commit_transaction->t_cpnext->t_cpprev;
1078                         commit_transaction->t_cpnext->t_cpprev =
1079                                 commit_transaction;
1080                         commit_transaction->t_cpprev->t_cpnext =
1081                                 commit_transaction;
1082                 }
1083         }
1084         spin_unlock(&journal->j_list_lock);
1085
1086         if (journal->j_commit_callback)
1087                 journal->j_commit_callback(journal, commit_transaction);
1088
1089         trace_jbd2_end_commit(journal, commit_transaction);
1090         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1091                   journal->j_commit_sequence, journal->j_tail_sequence);
1092         if (to_free)
1093                 jbd2_journal_free_transaction(commit_transaction);
1094
1095         wake_up(&journal->j_wait_done_commit);
1096 }