/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (buffer_freed(bh)) {
		clear_buffer_freed(bh);
		release_buffer_page(bh);
	} else
		put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

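	/*
	 * With barriers enabled, write the commit block with an explicit
	 * cache flush and FUA so it cannot be reordered ahead of the
	 * journal blocks that precede it; otherwise a plain synchronous
	 * write is sufficient.
	 */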
	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				   int write_op)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use-up our safety reference in submit_bh() */
		submit_bh(write_op, wbuf[i]);
	}
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
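		/*
		 * Three cases remain: the buffer was dirty and we hold its
		 * lock, so queue it for submission and park it on BJ_Locked;
		 * it is locked by somebody else's IO, so just move it to
		 * BJ_Locked and wait for it later; or its writeout has
		 * already finished, so unfile and release it.
		 */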
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
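/*
 * The jbd_debug() phase markers below give a rough map of the commit:
 * quiesce the running transaction and tidy up reserved and checkpointed
 * buffers (phase 1), flush the ordered-mode data buffers (phase 2),
 * write the metadata and descriptor blocks into the log (phase 3), wait
 * for that IO and for the control buffers (phases 4 and 5), write the
 * commit record (phase 6), then move buffers onto the checkpoint lists
 * and retire the transaction (phases 7 and 8).
 */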
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned int blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	struct blk_plug plug;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
					   journal->j_tail, WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd_commit_locking(journal, commit_transaction);
	spin_lock(&commit_transaction->t_handle_lock);
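	/*
	 * Wait for any handles still attached to this transaction to be
	 * released.  The prepare_to_wait()/finish_wait() pattern, with the
	 * recheck of t_updates under the locks, avoids missing the wakeup
	 * issued when the last handle is dropped.
	 */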
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	journal_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	trace_jbd_commit_flushing(journal, commit_transaction);
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

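	/*
	 * If a waiter has already asked for this tid to commit
	 * (j_commit_waited covers it), submit the IO as WRITE_SYNC so the
	 * waiter is not stuck behind ordinary background writeback.
	 */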
	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
		write_op = WRITE_SYNC;

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	blk_start_plug(&plug);
	err = journal_submit_data_buffers(journal, commit_transaction,
					  write_op);
	blk_finish_plug(&plug);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
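	/*
	 * Drain t_locked_list from the tail: wait for each buffer's IO,
	 * record any error against the page mapping (AS_EIO) so a later
	 * fsync() can see it, then unfile the buffer if it is still ours
	 * and drop our references.
	 */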
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			spin_lock(&journal->j_list_lock);
		}
		if (unlikely(!buffer_uptodate(bh))) {
			if (!trylock_page(bh->b_page)) {
				spin_unlock(&journal->j_list_lock);
				lock_page(bh->b_page);
				spin_lock(&journal->j_list_lock);
			}
			if (bh->b_page->mapping)
				set_bit(AS_EIO, &bh->b_page->mapping->flags);

			unlock_page(bh->b_page);
			SetPageError(bh->b_page);
			err = -EIO;
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
		    jh->b_transaction == commit_transaction &&
		    jh->b_jlist == BJ_Locked)
			__journal_unfile_buffer(jh);
		jbd_unlock_bh_state(bh);
		release_data_buffer(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: Detected IO errors while flushing file data "
			"on %s\n", bdevname(journal->j_fs_dev, b));
		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
			journal_abort(journal, err);
		err = 0;
	}

	blk_start_plug(&plug);

	journal_write_revoke_records(journal, commit_transaction, write_op);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd_commit_logging(journal, commit_transaction);
	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	descriptor = NULL;
	bufs = 0;
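	/*
	 * Log block layout produced by this loop: each descriptor block
	 * starts with a journal_header_t, followed by one
	 * journal_block_tag_t per metadata block; the first tag is followed
	 * by the 16-byte journal UUID, and the final tag in a descriptor
	 * carries JFS_FLAG_LAST_TAG.
	 */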
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		get_bh(jh2bh(jh));

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_buffer_jwrite(jh2bh(jh));
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_buffer_jwrite(jh2bh(new_jh));
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

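		/*
		 * Bit 0 of the flags returned by
		 * journal_write_metadata_buffer() means the block's first
		 * word collided with the journal magic number and had to be
		 * escaped; record that in the tag, and mark every tag after
		 * the first as sharing the journal UUID.
		 */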
		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(write_op, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this
		 * IO to complete. The barrier must be here so that changes
		 * by journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All metadata is written, now write commit record and do cleanup */
	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_RECORD;
	spin_unlock(&journal->j_state_lock);

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a
		 * performance gain, it also stops aliasing problems if the
		 * buffer is left behind for writeback and gets reallocated
		 * for another use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
		__journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time*3 +
				journal->j_average_commit_time) / 4;
	else
		journal->j_average_commit_time = commit_time;

	spin_unlock(&journal->j_state_lock);

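	/*
	 * If nothing is left to checkpoint, the transaction can be dropped
	 * right away; otherwise link it into the circular list of
	 * transactions that still need checkpointing.
	 */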
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	trace_jbd_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}