]> asedeno.scripts.mit.edu Git - linux.git/blob - fs/jbd2/commit.c
Merge branch 'core-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[linux.git] / fs / jbd2 / commit.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * linux/fs/jbd2/commit.c
4  *
5  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6  *
7  * Copyright 1998 Red Hat corp --- All Rights Reserved
8  *
9  * Journal commit routines for the generic filesystem journaling code;
10  * part of the ext2fs journaling system.
11  */
12
13 #include <linux/time.h>
14 #include <linux/fs.h>
15 #include <linux/jbd2.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/mm.h>
19 #include <linux/pagemap.h>
20 #include <linux/jiffies.h>
21 #include <linux/crc32.h>
22 #include <linux/writeback.h>
23 #include <linux/backing-dev.h>
24 #include <linux/bio.h>
25 #include <linux/blkdev.h>
26 #include <linux/bitops.h>
27 #include <trace/events/jbd2.h>
28
29 /*
30  * IO end handler for temporary buffer_heads handling writes to the journal.
31  */
32 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
33 {
34         struct buffer_head *orig_bh = bh->b_private;
35
36         BUFFER_TRACE(bh, "");
37         if (uptodate)
38                 set_buffer_uptodate(bh);
39         else
40                 clear_buffer_uptodate(bh);
41         if (orig_bh) {
42                 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
43                 smp_mb__after_atomic();
44                 wake_up_bit(&orig_bh->b_state, BH_Shadow);
45         }
46         unlock_buffer(bh);
47 }
48
49 /*
50  * When an ext4 file is truncated, it is possible that some pages are not
51  * successfully freed, because they are attached to a committing transaction.
52  * After the transaction commits, these pages are left on the LRU, with no
53  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
54  * by the VM, but their apparent absence upsets the VM accounting, and it makes
55  * the numbers in /proc/meminfo look odd.
56  *
57  * So here, we have a buffer which has just come off the forget list.  Look to
58  * see if we can strip all buffers from the backing page.
59  *
60  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
61  * caller provided us with a ref against the buffer, and we drop that here.
62  */
63 static void release_buffer_page(struct buffer_head *bh)
64 {
65         struct page *page;
66
67         if (buffer_dirty(bh))
68                 goto nope;
69         if (atomic_read(&bh->b_count) != 1)
70                 goto nope;
71         page = bh->b_page;
72         if (!page)
73                 goto nope;
74         if (page->mapping)
75                 goto nope;
76
77         /* OK, it's a truncated page */
78         if (!trylock_page(page))
79                 goto nope;
80
81         get_page(page);
82         __brelse(bh);
83         try_to_free_buffers(page);
84         unlock_page(page);
85         put_page(page);
86         return;
87
88 nope:
89         __brelse(bh);
90 }
91
92 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
93 {
94         struct commit_header *h;
95         __u32 csum;
96
97         if (!jbd2_journal_has_csum_v2or3(j))
98                 return;
99
100         h = (struct commit_header *)(bh->b_data);
101         h->h_chksum_type = 0;
102         h->h_chksum_size = 0;
103         h->h_chksum[0] = 0;
104         csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
105         h->h_chksum[0] = cpu_to_be32(csum);
106 }
107
108 /*
109  * Done it all: now submit the commit record.  We should have
110  * cleaned up our previous buffers by now, so if we are in abort
111  * mode we can now just skip the rest of the journal write
112  * entirely.
113  *
114  * Returns 1 if the journal needs to be aborted or 0 on success
115  */
116 static int journal_submit_commit_record(journal_t *journal,
117                                         transaction_t *commit_transaction,
118                                         struct buffer_head **cbh,
119                                         __u32 crc32_sum)
120 {
121         struct commit_header *tmp;
122         struct buffer_head *bh;
123         int ret;
124         struct timespec64 now;
125
126         *cbh = NULL;
127
128         if (is_journal_aborted(journal))
129                 return 0;
130
131         bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
132                                                 JBD2_COMMIT_BLOCK);
133         if (!bh)
134                 return 1;
135
136         tmp = (struct commit_header *)bh->b_data;
137         ktime_get_coarse_real_ts64(&now);
138         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
139         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
140
141         if (jbd2_has_feature_checksum(journal)) {
142                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
143                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
144                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
145         }
146         jbd2_commit_block_csum_set(journal, bh);
147
148         BUFFER_TRACE(bh, "submit commit block");
149         lock_buffer(bh);
150         clear_buffer_dirty(bh);
151         set_buffer_uptodate(bh);
152         bh->b_end_io = journal_end_buffer_io_sync;
153
154         if (journal->j_flags & JBD2_BARRIER &&
155             !jbd2_has_feature_async_commit(journal))
156                 ret = submit_bh(REQ_OP_WRITE,
157                         REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
158         else
159                 ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
160
161         *cbh = bh;
162         return ret;
163 }
164
165 /*
166  * This function along with journal_submit_commit_record
167  * allows to write the commit record asynchronously.
168  */
169 static int journal_wait_on_commit_record(journal_t *journal,
170                                          struct buffer_head *bh)
171 {
172         int ret = 0;
173
174         clear_buffer_dirty(bh);
175         wait_on_buffer(bh);
176
177         if (unlikely(!buffer_uptodate(bh)))
178                 ret = -EIO;
179         put_bh(bh);            /* One for getblk() */
180
181         return ret;
182 }
183
184 /*
185  * write the filemap data using writepage() address_space_operations.
186  * We don't do block allocation here even for delalloc. We don't
187  * use writepages() because with dealyed allocation we may be doing
188  * block allocation in writepages().
189  */
190 static int journal_submit_inode_data_buffers(struct address_space *mapping)
191 {
192         int ret;
193         struct writeback_control wbc = {
194                 .sync_mode =  WB_SYNC_ALL,
195                 .nr_to_write = mapping->nrpages * 2,
196                 .range_start = 0,
197                 .range_end = i_size_read(mapping->host),
198         };
199
200         ret = generic_writepages(mapping, &wbc);
201         return ret;
202 }
203
204 /*
205  * Submit all the data buffers of inode associated with the transaction to
206  * disk.
207  *
208  * We are in a committing transaction. Therefore no new inode can be added to
209  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
210  * operate on from being released while we write out pages.
211  */
212 static int journal_submit_data_buffers(journal_t *journal,
213                 transaction_t *commit_transaction)
214 {
215         struct jbd2_inode *jinode;
216         int err, ret = 0;
217         struct address_space *mapping;
218
219         spin_lock(&journal->j_list_lock);
220         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
221                 if (!(jinode->i_flags & JI_WRITE_DATA))
222                         continue;
223                 mapping = jinode->i_vfs_inode->i_mapping;
224                 jinode->i_flags |= JI_COMMIT_RUNNING;
225                 spin_unlock(&journal->j_list_lock);
226                 /*
227                  * submit the inode data buffers. We use writepage
228                  * instead of writepages. Because writepages can do
229                  * block allocation  with delalloc. We need to write
230                  * only allocated blocks here.
231                  */
232                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
233                 err = journal_submit_inode_data_buffers(mapping);
234                 if (!ret)
235                         ret = err;
236                 spin_lock(&journal->j_list_lock);
237                 J_ASSERT(jinode->i_transaction == commit_transaction);
238                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
239                 smp_mb();
240                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
241         }
242         spin_unlock(&journal->j_list_lock);
243         return ret;
244 }
245
246 /*
247  * Wait for data submitted for writeout, refile inodes to proper
248  * transaction if needed.
249  *
250  */
251 static int journal_finish_inode_data_buffers(journal_t *journal,
252                 transaction_t *commit_transaction)
253 {
254         struct jbd2_inode *jinode, *next_i;
255         int err, ret = 0;
256
257         /* For locking, see the comment in journal_submit_data_buffers() */
258         spin_lock(&journal->j_list_lock);
259         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
260                 if (!(jinode->i_flags & JI_WAIT_DATA))
261                         continue;
262                 jinode->i_flags |= JI_COMMIT_RUNNING;
263                 spin_unlock(&journal->j_list_lock);
264                 err = filemap_fdatawait_keep_errors(
265                                 jinode->i_vfs_inode->i_mapping);
266                 if (!ret)
267                         ret = err;
268                 spin_lock(&journal->j_list_lock);
269                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
270                 smp_mb();
271                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
272         }
273
274         /* Now refile inode to proper lists */
275         list_for_each_entry_safe(jinode, next_i,
276                                  &commit_transaction->t_inode_list, i_list) {
277                 list_del(&jinode->i_list);
278                 if (jinode->i_next_transaction) {
279                         jinode->i_transaction = jinode->i_next_transaction;
280                         jinode->i_next_transaction = NULL;
281                         list_add(&jinode->i_list,
282                                 &jinode->i_transaction->t_inode_list);
283                 } else {
284                         jinode->i_transaction = NULL;
285                 }
286         }
287         spin_unlock(&journal->j_list_lock);
288
289         return ret;
290 }
291
292 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
293 {
294         struct page *page = bh->b_page;
295         char *addr;
296         __u32 checksum;
297
298         addr = kmap_atomic(page);
299         checksum = crc32_be(crc32_sum,
300                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
301         kunmap_atomic(addr);
302
303         return checksum;
304 }
305
306 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
307                                    unsigned long long block)
308 {
309         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
310         if (jbd2_has_feature_64bit(j))
311                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
312 }
313
314 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
315                                     struct buffer_head *bh, __u32 sequence)
316 {
317         journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
318         struct page *page = bh->b_page;
319         __u8 *addr;
320         __u32 csum32;
321         __be32 seq;
322
323         if (!jbd2_journal_has_csum_v2or3(j))
324                 return;
325
326         seq = cpu_to_be32(sequence);
327         addr = kmap_atomic(page);
328         csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
329         csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
330                              bh->b_size);
331         kunmap_atomic(addr);
332
333         if (jbd2_has_feature_csum3(j))
334                 tag3->t_checksum = cpu_to_be32(csum32);
335         else
336                 tag->t_checksum = cpu_to_be16(csum32);
337 }
338 /*
339  * jbd2_journal_commit_transaction
340  *
341  * The primary function for committing a transaction to the log.  This
342  * function is called by the journal thread to begin a complete commit.
343  */
344 void jbd2_journal_commit_transaction(journal_t *journal)
345 {
346         struct transaction_stats_s stats;
347         transaction_t *commit_transaction;
348         struct journal_head *jh;
349         struct buffer_head *descriptor;
350         struct buffer_head **wbuf = journal->j_wbuf;
351         int bufs;
352         int flags;
353         int err;
354         unsigned long long blocknr;
355         ktime_t start_time;
356         u64 commit_time;
357         char *tagp = NULL;
358         journal_block_tag_t *tag = NULL;
359         int space_left = 0;
360         int first_tag = 0;
361         int tag_flag;
362         int i;
363         int tag_bytes = journal_tag_bytes(journal);
364         struct buffer_head *cbh = NULL; /* For transactional checksums */
365         __u32 crc32_sum = ~0;
366         struct blk_plug plug;
367         /* Tail of the journal */
368         unsigned long first_block;
369         tid_t first_tid;
370         int update_tail;
371         int csum_size = 0;
372         LIST_HEAD(io_bufs);
373         LIST_HEAD(log_bufs);
374
375         if (jbd2_journal_has_csum_v2or3(journal))
376                 csum_size = sizeof(struct jbd2_journal_block_tail);
377
378         /*
379          * First job: lock down the current transaction and wait for
380          * all outstanding updates to complete.
381          */
382
383         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
384         if (journal->j_flags & JBD2_FLUSHED) {
385                 jbd_debug(3, "super block updated\n");
386                 mutex_lock_io(&journal->j_checkpoint_mutex);
387                 /*
388                  * We hold j_checkpoint_mutex so tail cannot change under us.
389                  * We don't need any special data guarantees for writing sb
390                  * since journal is empty and it is ok for write to be
391                  * flushed only with transaction commit.
392                  */
393                 jbd2_journal_update_sb_log_tail(journal,
394                                                 journal->j_tail_sequence,
395                                                 journal->j_tail,
396                                                 REQ_SYNC);
397                 mutex_unlock(&journal->j_checkpoint_mutex);
398         } else {
399                 jbd_debug(3, "superblock not updated\n");
400         }
401
402         J_ASSERT(journal->j_running_transaction != NULL);
403         J_ASSERT(journal->j_committing_transaction == NULL);
404
405         commit_transaction = journal->j_running_transaction;
406
407         trace_jbd2_start_commit(journal, commit_transaction);
408         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
409                         commit_transaction->t_tid);
410
411         write_lock(&journal->j_state_lock);
412         J_ASSERT(commit_transaction->t_state == T_RUNNING);
413         commit_transaction->t_state = T_LOCKED;
414
415         trace_jbd2_commit_locking(journal, commit_transaction);
416         stats.run.rs_wait = commit_transaction->t_max_wait;
417         stats.run.rs_request_delay = 0;
418         stats.run.rs_locked = jiffies;
419         if (commit_transaction->t_requested)
420                 stats.run.rs_request_delay =
421                         jbd2_time_diff(commit_transaction->t_requested,
422                                        stats.run.rs_locked);
423         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
424                                               stats.run.rs_locked);
425
426         spin_lock(&commit_transaction->t_handle_lock);
427         while (atomic_read(&commit_transaction->t_updates)) {
428                 DEFINE_WAIT(wait);
429
430                 prepare_to_wait(&journal->j_wait_updates, &wait,
431                                         TASK_UNINTERRUPTIBLE);
432                 if (atomic_read(&commit_transaction->t_updates)) {
433                         spin_unlock(&commit_transaction->t_handle_lock);
434                         write_unlock(&journal->j_state_lock);
435                         schedule();
436                         write_lock(&journal->j_state_lock);
437                         spin_lock(&commit_transaction->t_handle_lock);
438                 }
439                 finish_wait(&journal->j_wait_updates, &wait);
440         }
441         spin_unlock(&commit_transaction->t_handle_lock);
442         commit_transaction->t_state = T_SWITCH;
443         write_unlock(&journal->j_state_lock);
444
445         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
446                         journal->j_max_transaction_buffers);
447
448         /*
449          * First thing we are allowed to do is to discard any remaining
450          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
451          * that there are no such buffers: if a large filesystem
452          * operation like a truncate needs to split itself over multiple
453          * transactions, then it may try to do a jbd2_journal_restart() while
454          * there are still BJ_Reserved buffers outstanding.  These must
455          * be released cleanly from the current transaction.
456          *
457          * In this case, the filesystem must still reserve write access
458          * again before modifying the buffer in the new transaction, but
459          * we do not require it to remember exactly which old buffers it
460          * has reserved.  This is consistent with the existing behaviour
461          * that multiple jbd2_journal_get_write_access() calls to the same
462          * buffer are perfectly permissible.
463          */
464         while (commit_transaction->t_reserved_list) {
465                 jh = commit_transaction->t_reserved_list;
466                 JBUFFER_TRACE(jh, "reserved, unused: refile");
467                 /*
468                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
469                  * leave undo-committed data.
470                  */
471                 if (jh->b_committed_data) {
472                         struct buffer_head *bh = jh2bh(jh);
473
474                         jbd_lock_bh_state(bh);
475                         jbd2_free(jh->b_committed_data, bh->b_size);
476                         jh->b_committed_data = NULL;
477                         jbd_unlock_bh_state(bh);
478                 }
479                 jbd2_journal_refile_buffer(journal, jh);
480         }
481
482         /*
483          * Now try to drop any written-back buffers from the journal's
484          * checkpoint lists.  We do this *before* commit because it potentially
485          * frees some memory
486          */
487         spin_lock(&journal->j_list_lock);
488         __jbd2_journal_clean_checkpoint_list(journal, false);
489         spin_unlock(&journal->j_list_lock);
490
491         jbd_debug(3, "JBD2: commit phase 1\n");
492
493         /*
494          * Clear revoked flag to reflect there is no revoked buffers
495          * in the next transaction which is going to be started.
496          */
497         jbd2_clear_buffer_revoked_flags(journal);
498
499         /*
500          * Switch to a new revoke table.
501          */
502         jbd2_journal_switch_revoke_table(journal);
503
504         /*
505          * Reserved credits cannot be claimed anymore, free them
506          */
507         atomic_sub(atomic_read(&journal->j_reserved_credits),
508                    &commit_transaction->t_outstanding_credits);
509
510         write_lock(&journal->j_state_lock);
511         trace_jbd2_commit_flushing(journal, commit_transaction);
512         stats.run.rs_flushing = jiffies;
513         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
514                                              stats.run.rs_flushing);
515
516         commit_transaction->t_state = T_FLUSH;
517         journal->j_committing_transaction = commit_transaction;
518         journal->j_running_transaction = NULL;
519         start_time = ktime_get();
520         commit_transaction->t_log_start = journal->j_head;
521         wake_up(&journal->j_wait_transaction_locked);
522         write_unlock(&journal->j_state_lock);
523
524         jbd_debug(3, "JBD2: commit phase 2a\n");
525
526         /*
527          * Now start flushing things to disk, in the order they appear
528          * on the transaction lists.  Data blocks go first.
529          */
530         err = journal_submit_data_buffers(journal, commit_transaction);
531         if (err)
532                 jbd2_journal_abort(journal, err);
533
534         blk_start_plug(&plug);
535         jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
536
537         jbd_debug(3, "JBD2: commit phase 2b\n");
538
539         /*
540          * Way to go: we have now written out all of the data for a
541          * transaction!  Now comes the tricky part: we need to write out
542          * metadata.  Loop over the transaction's entire buffer list:
543          */
544         write_lock(&journal->j_state_lock);
545         commit_transaction->t_state = T_COMMIT;
546         write_unlock(&journal->j_state_lock);
547
548         trace_jbd2_commit_logging(journal, commit_transaction);
549         stats.run.rs_logging = jiffies;
550         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
551                                                stats.run.rs_logging);
552         stats.run.rs_blocks =
553                 atomic_read(&commit_transaction->t_outstanding_credits);
554         stats.run.rs_blocks_logged = 0;
555
556         J_ASSERT(commit_transaction->t_nr_buffers <=
557                  atomic_read(&commit_transaction->t_outstanding_credits));
558
559         err = 0;
560         bufs = 0;
561         descriptor = NULL;
562         while (commit_transaction->t_buffers) {
563
564                 /* Find the next buffer to be journaled... */
565
566                 jh = commit_transaction->t_buffers;
567
568                 /* If we're in abort mode, we just un-journal the buffer and
569                    release it. */
570
571                 if (is_journal_aborted(journal)) {
572                         clear_buffer_jbddirty(jh2bh(jh));
573                         JBUFFER_TRACE(jh, "journal is aborting: refile");
574                         jbd2_buffer_abort_trigger(jh,
575                                                   jh->b_frozen_data ?
576                                                   jh->b_frozen_triggers :
577                                                   jh->b_triggers);
578                         jbd2_journal_refile_buffer(journal, jh);
579                         /* If that was the last one, we need to clean up
580                          * any descriptor buffers which may have been
581                          * already allocated, even if we are now
582                          * aborting. */
583                         if (!commit_transaction->t_buffers)
584                                 goto start_journal_io;
585                         continue;
586                 }
587
588                 /* Make sure we have a descriptor block in which to
589                    record the metadata buffer. */
590
591                 if (!descriptor) {
592                         J_ASSERT (bufs == 0);
593
594                         jbd_debug(4, "JBD2: get descriptor\n");
595
596                         descriptor = jbd2_journal_get_descriptor_buffer(
597                                                         commit_transaction,
598                                                         JBD2_DESCRIPTOR_BLOCK);
599                         if (!descriptor) {
600                                 jbd2_journal_abort(journal, -EIO);
601                                 continue;
602                         }
603
604                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
605                                 (unsigned long long)descriptor->b_blocknr,
606                                 descriptor->b_data);
607                         tagp = &descriptor->b_data[sizeof(journal_header_t)];
608                         space_left = descriptor->b_size -
609                                                 sizeof(journal_header_t);
610                         first_tag = 1;
611                         set_buffer_jwrite(descriptor);
612                         set_buffer_dirty(descriptor);
613                         wbuf[bufs++] = descriptor;
614
615                         /* Record it so that we can wait for IO
616                            completion later */
617                         BUFFER_TRACE(descriptor, "ph3: file as descriptor");
618                         jbd2_file_log_bh(&log_bufs, descriptor);
619                 }
620
621                 /* Where is the buffer to be written? */
622
623                 err = jbd2_journal_next_log_block(journal, &blocknr);
624                 /* If the block mapping failed, just abandon the buffer
625                    and repeat this loop: we'll fall into the
626                    refile-on-abort condition above. */
627                 if (err) {
628                         jbd2_journal_abort(journal, err);
629                         continue;
630                 }
631
632                 /*
633                  * start_this_handle() uses t_outstanding_credits to determine
634                  * the free space in the log, but this counter is changed
635                  * by jbd2_journal_next_log_block() also.
636                  */
637                 atomic_dec(&commit_transaction->t_outstanding_credits);
638
639                 /* Bump b_count to prevent truncate from stumbling over
640                    the shadowed buffer!  @@@ This can go if we ever get
641                    rid of the shadow pairing of buffers. */
642                 atomic_inc(&jh2bh(jh)->b_count);
643
644                 /*
645                  * Make a temporary IO buffer with which to write it out
646                  * (this will requeue the metadata buffer to BJ_Shadow).
647                  */
648                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
649                 JBUFFER_TRACE(jh, "ph3: write metadata");
650                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
651                                                 jh, &wbuf[bufs], blocknr);
652                 if (flags < 0) {
653                         jbd2_journal_abort(journal, flags);
654                         continue;
655                 }
656                 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
657
658                 /* Record the new block's tag in the current descriptor
659                    buffer */
660
661                 tag_flag = 0;
662                 if (flags & 1)
663                         tag_flag |= JBD2_FLAG_ESCAPE;
664                 if (!first_tag)
665                         tag_flag |= JBD2_FLAG_SAME_UUID;
666
667                 tag = (journal_block_tag_t *) tagp;
668                 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
669                 tag->t_flags = cpu_to_be16(tag_flag);
670                 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
671                                         commit_transaction->t_tid);
672                 tagp += tag_bytes;
673                 space_left -= tag_bytes;
674                 bufs++;
675
676                 if (first_tag) {
677                         memcpy (tagp, journal->j_uuid, 16);
678                         tagp += 16;
679                         space_left -= 16;
680                         first_tag = 0;
681                 }
682
683                 /* If there's no more to do, or if the descriptor is full,
684                    let the IO rip! */
685
686                 if (bufs == journal->j_wbufsize ||
687                     commit_transaction->t_buffers == NULL ||
688                     space_left < tag_bytes + 16 + csum_size) {
689
690                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
691
692                         /* Write an end-of-descriptor marker before
693                            submitting the IOs.  "tag" still points to
694                            the last tag we set up. */
695
696                         tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
697
698                         jbd2_descriptor_block_csum_set(journal, descriptor);
699 start_journal_io:
700                         for (i = 0; i < bufs; i++) {
701                                 struct buffer_head *bh = wbuf[i];
702                                 /*
703                                  * Compute checksum.
704                                  */
705                                 if (jbd2_has_feature_checksum(journal)) {
706                                         crc32_sum =
707                                             jbd2_checksum_data(crc32_sum, bh);
708                                 }
709
710                                 lock_buffer(bh);
711                                 clear_buffer_dirty(bh);
712                                 set_buffer_uptodate(bh);
713                                 bh->b_end_io = journal_end_buffer_io_sync;
714                                 submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
715                         }
716                         cond_resched();
717                         stats.run.rs_blocks_logged += bufs;
718
719                         /* Force a new descriptor to be generated next
720                            time round the loop. */
721                         descriptor = NULL;
722                         bufs = 0;
723                 }
724         }
725
726         err = journal_finish_inode_data_buffers(journal, commit_transaction);
727         if (err) {
728                 printk(KERN_WARNING
729                         "JBD2: Detected IO errors while flushing file data "
730                        "on %s\n", journal->j_devname);
731                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
732                         jbd2_journal_abort(journal, err);
733                 err = 0;
734         }
735
736         /*
737          * Get current oldest transaction in the log before we issue flush
738          * to the filesystem device. After the flush we can be sure that
739          * blocks of all older transactions are checkpointed to persistent
740          * storage and we will be safe to update journal start in the
741          * superblock with the numbers we get here.
742          */
743         update_tail =
744                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
745
746         write_lock(&journal->j_state_lock);
747         if (update_tail) {
748                 long freed = first_block - journal->j_tail;
749
750                 if (first_block < journal->j_tail)
751                         freed += journal->j_last - journal->j_first;
752                 /* Update tail only if we free significant amount of space */
753                 if (freed < journal->j_maxlen / 4)
754                         update_tail = 0;
755         }
756         J_ASSERT(commit_transaction->t_state == T_COMMIT);
757         commit_transaction->t_state = T_COMMIT_DFLUSH;
758         write_unlock(&journal->j_state_lock);
759
760         /* 
761          * If the journal is not located on the file system device,
762          * then we must flush the file system device before we issue
763          * the commit record
764          */
765         if (commit_transaction->t_need_data_flush &&
766             (journal->j_fs_dev != journal->j_dev) &&
767             (journal->j_flags & JBD2_BARRIER))
768                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
769
770         /* Done it all: now write the commit record asynchronously. */
771         if (jbd2_has_feature_async_commit(journal)) {
772                 err = journal_submit_commit_record(journal, commit_transaction,
773                                                  &cbh, crc32_sum);
774                 if (err)
775                         __jbd2_journal_abort_hard(journal);
776         }
777
778         blk_finish_plug(&plug);
779
780         /* Lo and behold: we have just managed to send a transaction to
781            the log.  Before we can commit it, wait for the IO so far to
782            complete.  Control buffers being written are on the
783            transaction's t_log_list queue, and metadata buffers are on
784            the io_bufs list.
785
786            Wait for the buffers in reverse order.  That way we are
787            less likely to be woken up until all IOs have completed, and
788            so we incur less scheduling load.
789         */
790
791         jbd_debug(3, "JBD2: commit phase 3\n");
792
793         while (!list_empty(&io_bufs)) {
794                 struct buffer_head *bh = list_entry(io_bufs.prev,
795                                                     struct buffer_head,
796                                                     b_assoc_buffers);
797
798                 wait_on_buffer(bh);
799                 cond_resched();
800
801                 if (unlikely(!buffer_uptodate(bh)))
802                         err = -EIO;
803                 jbd2_unfile_log_bh(bh);
804
805                 /*
806                  * The list contains temporary buffer heads created by
807                  * jbd2_journal_write_metadata_buffer().
808                  */
809                 BUFFER_TRACE(bh, "dumping temporary bh");
810                 __brelse(bh);
811                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
812                 free_buffer_head(bh);
813
814                 /* We also have to refile the corresponding shadowed buffer */
815                 jh = commit_transaction->t_shadow_list->b_tprev;
816                 bh = jh2bh(jh);
817                 clear_buffer_jwrite(bh);
818                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
819                 J_ASSERT_BH(bh, !buffer_shadow(bh));
820
821                 /* The metadata is now released for reuse, but we need
822                    to remember it against this transaction so that when
823                    we finally commit, we can do any checkpointing
824                    required. */
825                 JBUFFER_TRACE(jh, "file as BJ_Forget");
826                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
827                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
828                 __brelse(bh);
829         }
830
831         J_ASSERT (commit_transaction->t_shadow_list == NULL);
832
833         jbd_debug(3, "JBD2: commit phase 4\n");
834
835         /* Here we wait for the revoke record and descriptor record buffers */
836         while (!list_empty(&log_bufs)) {
837                 struct buffer_head *bh;
838
839                 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
840                 wait_on_buffer(bh);
841                 cond_resched();
842
843                 if (unlikely(!buffer_uptodate(bh)))
844                         err = -EIO;
845
846                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
847                 clear_buffer_jwrite(bh);
848                 jbd2_unfile_log_bh(bh);
849                 __brelse(bh);           /* One for getblk */
850                 /* AKPM: bforget here */
851         }
852
853         if (err)
854                 jbd2_journal_abort(journal, err);
855
856         jbd_debug(3, "JBD2: commit phase 5\n");
857         write_lock(&journal->j_state_lock);
858         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
859         commit_transaction->t_state = T_COMMIT_JFLUSH;
860         write_unlock(&journal->j_state_lock);
861
862         if (!jbd2_has_feature_async_commit(journal)) {
863                 err = journal_submit_commit_record(journal, commit_transaction,
864                                                 &cbh, crc32_sum);
865                 if (err)
866                         __jbd2_journal_abort_hard(journal);
867         }
868         if (cbh)
869                 err = journal_wait_on_commit_record(journal, cbh);
870         if (jbd2_has_feature_async_commit(journal) &&
871             journal->j_flags & JBD2_BARRIER) {
872                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
873         }
874
875         if (err)
876                 jbd2_journal_abort(journal, err);
877
878         /*
879          * Now disk caches for filesystem device are flushed so we are safe to
880          * erase checkpointed transactions from the log by updating journal
881          * superblock.
882          */
883         if (update_tail)
884                 jbd2_update_log_tail(journal, first_tid, first_block);
885
886         /* End of a transaction!  Finally, we can do checkpoint
887            processing: any buffers committed as a result of this
888            transaction can be removed from any checkpoint list it was on
889            before. */
890
891         jbd_debug(3, "JBD2: commit phase 6\n");
892
893         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
894         J_ASSERT(commit_transaction->t_buffers == NULL);
895         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
896         J_ASSERT(commit_transaction->t_shadow_list == NULL);
897
898 restart_loop:
899         /*
900          * As there are other places (journal_unmap_buffer()) adding buffers
901          * to this list we have to be careful and hold the j_list_lock.
902          */
903         spin_lock(&journal->j_list_lock);
904         while (commit_transaction->t_forget) {
905                 transaction_t *cp_transaction;
906                 struct buffer_head *bh;
907                 int try_to_free = 0;
908
909                 jh = commit_transaction->t_forget;
910                 spin_unlock(&journal->j_list_lock);
911                 bh = jh2bh(jh);
912                 /*
913                  * Get a reference so that bh cannot be freed before we are
914                  * done with it.
915                  */
916                 get_bh(bh);
917                 jbd_lock_bh_state(bh);
918                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
919
920                 /*
921                  * If there is undo-protected committed data against
922                  * this buffer, then we can remove it now.  If it is a
923                  * buffer needing such protection, the old frozen_data
924                  * field now points to a committed version of the
925                  * buffer, so rotate that field to the new committed
926                  * data.
927                  *
928                  * Otherwise, we can just throw away the frozen data now.
929                  *
930                  * We also know that the frozen data has already fired
931                  * its triggers if they exist, so we can clear that too.
932                  */
933                 if (jh->b_committed_data) {
934                         jbd2_free(jh->b_committed_data, bh->b_size);
935                         jh->b_committed_data = NULL;
936                         if (jh->b_frozen_data) {
937                                 jh->b_committed_data = jh->b_frozen_data;
938                                 jh->b_frozen_data = NULL;
939                                 jh->b_frozen_triggers = NULL;
940                         }
941                 } else if (jh->b_frozen_data) {
942                         jbd2_free(jh->b_frozen_data, bh->b_size);
943                         jh->b_frozen_data = NULL;
944                         jh->b_frozen_triggers = NULL;
945                 }
946
947                 spin_lock(&journal->j_list_lock);
948                 cp_transaction = jh->b_cp_transaction;
949                 if (cp_transaction) {
950                         JBUFFER_TRACE(jh, "remove from old cp transaction");
951                         cp_transaction->t_chp_stats.cs_dropped++;
952                         __jbd2_journal_remove_checkpoint(jh);
953                 }
954
955                 /* Only re-checkpoint the buffer_head if it is marked
956                  * dirty.  If the buffer was added to the BJ_Forget list
957                  * by jbd2_journal_forget, it may no longer be dirty and
958                  * there's no point in keeping a checkpoint record for
959                  * it. */
960
961                 /*
962                 * A buffer which has been freed while still being journaled by
963                 * a previous transaction.
964                 */
965                 if (buffer_freed(bh)) {
966                         /*
967                          * If the running transaction is the one containing
968                          * "add to orphan" operation (b_next_transaction !=
969                          * NULL), we have to wait for that transaction to
970                          * commit before we can really get rid of the buffer.
971                          * So just clear b_modified to not confuse transaction
972                          * credit accounting and refile the buffer to
973                          * BJ_Forget of the running transaction. If the just
974                          * committed transaction contains "add to orphan"
975                          * operation, we can completely invalidate the buffer
976                          * now. We are rather thorough in that since the
977                          * buffer may be still accessible when blocksize <
978                          * pagesize and it is attached to the last partial
979                          * page.
980                          */
981                         jh->b_modified = 0;
982                         if (!jh->b_next_transaction) {
983                                 clear_buffer_freed(bh);
984                                 clear_buffer_jbddirty(bh);
985                                 clear_buffer_mapped(bh);
986                                 clear_buffer_new(bh);
987                                 clear_buffer_req(bh);
988                                 bh->b_bdev = NULL;
989                         }
990                 }
991
992                 if (buffer_jbddirty(bh)) {
993                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
994                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
995                         if (is_journal_aborted(journal))
996                                 clear_buffer_jbddirty(bh);
997                 } else {
998                         J_ASSERT_BH(bh, !buffer_dirty(bh));
999                         /*
1000                          * The buffer on BJ_Forget list and not jbddirty means
1001                          * it has been freed by this transaction and hence it
1002                          * could not have been reallocated until this
1003                          * transaction has committed. *BUT* it could be
1004                          * reallocated once we have written all the data to
1005                          * disk and before we process the buffer on BJ_Forget
1006                          * list.
1007                          */
1008                         if (!jh->b_next_transaction)
1009                                 try_to_free = 1;
1010                 }
1011                 JBUFFER_TRACE(jh, "refile or unfile buffer");
1012                 __jbd2_journal_refile_buffer(jh);
1013                 jbd_unlock_bh_state(bh);
1014                 if (try_to_free)
1015                         release_buffer_page(bh);        /* Drops bh reference */
1016                 else
1017                         __brelse(bh);
1018                 cond_resched_lock(&journal->j_list_lock);
1019         }
1020         spin_unlock(&journal->j_list_lock);
1021         /*
1022          * This is a bit sleazy.  We use j_list_lock to protect transition
1023          * of a transaction into T_FINISHED state and calling
1024          * __jbd2_journal_drop_transaction(). Otherwise we could race with
1025          * other checkpointing code processing the transaction...
1026          */
1027         write_lock(&journal->j_state_lock);
1028         spin_lock(&journal->j_list_lock);
1029         /*
1030          * Now recheck if some buffers did not get attached to the transaction
1031          * while the lock was dropped...
1032          */
1033         if (commit_transaction->t_forget) {
1034                 spin_unlock(&journal->j_list_lock);
1035                 write_unlock(&journal->j_state_lock);
1036                 goto restart_loop;
1037         }
1038
1039         /* Add the transaction to the checkpoint list
1040          * __jbd2_journal_remove_checkpoint() can not destroy transaction
1041          * under us because it is not marked as T_FINISHED yet */
1042         if (journal->j_checkpoint_transactions == NULL) {
1043                 journal->j_checkpoint_transactions = commit_transaction;
1044                 commit_transaction->t_cpnext = commit_transaction;
1045                 commit_transaction->t_cpprev = commit_transaction;
1046         } else {
1047                 commit_transaction->t_cpnext =
1048                         journal->j_checkpoint_transactions;
1049                 commit_transaction->t_cpprev =
1050                         commit_transaction->t_cpnext->t_cpprev;
1051                 commit_transaction->t_cpnext->t_cpprev =
1052                         commit_transaction;
1053                 commit_transaction->t_cpprev->t_cpnext =
1054                                 commit_transaction;
1055         }
1056         spin_unlock(&journal->j_list_lock);
1057
1058         /* Done with this transaction! */
1059
1060         jbd_debug(3, "JBD2: commit phase 7\n");
1061
1062         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1063
1064         commit_transaction->t_start = jiffies;
1065         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1066                                               commit_transaction->t_start);
1067
1068         /*
1069          * File the transaction statistics
1070          */
1071         stats.ts_tid = commit_transaction->t_tid;
1072         stats.run.rs_handle_count =
1073                 atomic_read(&commit_transaction->t_handle_count);
1074         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1075                              commit_transaction->t_tid, &stats.run);
1076         stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1077
1078         commit_transaction->t_state = T_COMMIT_CALLBACK;
1079         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1080         journal->j_commit_sequence = commit_transaction->t_tid;
1081         journal->j_committing_transaction = NULL;
1082         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1083
1084         /*
1085          * weight the running average higher than the latest commit time so we
1086          * don't react too strongly to vast changes in the commit time
1087          */
1088         if (likely(journal->j_average_commit_time))
1089                 journal->j_average_commit_time = (commit_time +
1090                                 journal->j_average_commit_time*3) / 4;
1091         else
1092                 journal->j_average_commit_time = commit_time;
1093
1094         write_unlock(&journal->j_state_lock);
1095
1096         if (journal->j_commit_callback)
1097                 journal->j_commit_callback(journal, commit_transaction);
1098
1099         trace_jbd2_end_commit(journal, commit_transaction);
1100         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1101                   journal->j_commit_sequence, journal->j_tail_sequence);
1102
1103         write_lock(&journal->j_state_lock);
1104         spin_lock(&journal->j_list_lock);
1105         commit_transaction->t_state = T_FINISHED;
1106         /* Check if the transaction can be dropped now that we are finished */
1107         if (commit_transaction->t_checkpoint_list == NULL &&
1108             commit_transaction->t_checkpoint_io_list == NULL) {
1109                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1110                 jbd2_journal_free_transaction(commit_transaction);
1111         }
1112         spin_unlock(&journal->j_list_lock);
1113         write_unlock(&journal->j_state_lock);
1114         wake_up(&journal->j_wait_done_commit);
1115
1116         /*
1117          * Calculate overall stats
1118          */
1119         spin_lock(&journal->j_history_lock);
1120         journal->j_stats.ts_tid++;
1121         journal->j_stats.ts_requested += stats.ts_requested;
1122         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1123         journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1124         journal->j_stats.run.rs_running += stats.run.rs_running;
1125         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1126         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1127         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1128         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1129         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1130         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1131         spin_unlock(&journal->j_history_lock);
1132 }