journal.c - fs/jbd2/journal.c - Linux diff v4.10.11

 
   1/*
   2 * linux/fs/jbd2/journal.c
   3 *
   4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5 *
   6 * Copyright 1998 Red Hat corp --- All Rights Reserved
   7 *
   8 * This file is part of the Linux kernel and is made available under
   9 * the terms of the GNU General Public License, version 2, or at your
  10 * option, any later version, incorporated herein by reference.
  11 *
  12 * Generic filesystem journal-writing code; part of the ext2fs
  13 * journaling system.
  14 *
  15 * This file manages journals: areas of disk reserved for logging
  16 * transactional updates.  This includes the kernel journaling thread
  17 * which is responsible for scheduling updates to the log.
  18 *
  19 * We do not actually manage the physical storage of the journal in this
  20 * file: that is left to a per-journal policy function, which allows us
  21 * to store the journal within a filesystem-specified area for ext2
  22 * journaling (ext2 can use a reserved inode for storing the log).
  23 */
  24
  25#include <linux/module.h>
  26#include <linux/time.h>
  27#include <linux/fs.h>
  28#include <linux/jbd2.h>
  29#include <linux/errno.h>
  30#include <linux/slab.h>
  31#include <linux/init.h>
  32#include <linux/mm.h>
  33#include <linux/freezer.h>
  34#include <linux/pagemap.h>
  35#include <linux/kthread.h>
  36#include <linux/poison.h>
  37#include <linux/proc_fs.h>
  38#include <linux/seq_file.h>
  39#include <linux/math64.h>
  40#include <linux/hash.h>
  41#include <linux/log2.h>
  42#include <linux/vmalloc.h>
  43#include <linux/backing-dev.h>
  44#include <linux/bitops.h>
  45#include <linux/ratelimit.h>
 
  46
  47#define CREATE_TRACE_POINTS
  48#include <trace/events/jbd2.h>
  49
  50#include <linux/uaccess.h>
  51#include <asm/page.h>
  52
   53#ifdef CONFIG_JBD2_DEBUG
   54ushort jbd2_journal_enable_debug __read_mostly;	/* __jbd2_debug() drops messages whose level exceeds this value */
   55EXPORT_SYMBOL(jbd2_journal_enable_debug);
   56
   57module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);	/* exposes the variable above as module parameter "jbd2_debug" */
   58MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
   59#endif
  60
  61EXPORT_SYMBOL(jbd2_journal_extend);
  62EXPORT_SYMBOL(jbd2_journal_stop);
  63EXPORT_SYMBOL(jbd2_journal_lock_updates);
  64EXPORT_SYMBOL(jbd2_journal_unlock_updates);
  65EXPORT_SYMBOL(jbd2_journal_get_write_access);
  66EXPORT_SYMBOL(jbd2_journal_get_create_access);
  67EXPORT_SYMBOL(jbd2_journal_get_undo_access);
  68EXPORT_SYMBOL(jbd2_journal_set_triggers);
  69EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
  70EXPORT_SYMBOL(jbd2_journal_forget);
  71#if 0
  72EXPORT_SYMBOL(journal_sync_buffer);
  73#endif
  74EXPORT_SYMBOL(jbd2_journal_flush);
  75EXPORT_SYMBOL(jbd2_journal_revoke);
  76
  77EXPORT_SYMBOL(jbd2_journal_init_dev);
  78EXPORT_SYMBOL(jbd2_journal_init_inode);
  79EXPORT_SYMBOL(jbd2_journal_check_used_features);
  80EXPORT_SYMBOL(jbd2_journal_check_available_features);
  81EXPORT_SYMBOL(jbd2_journal_set_features);
  82EXPORT_SYMBOL(jbd2_journal_load);
  83EXPORT_SYMBOL(jbd2_journal_destroy);
  84EXPORT_SYMBOL(jbd2_journal_abort);
  85EXPORT_SYMBOL(jbd2_journal_errno);
  86EXPORT_SYMBOL(jbd2_journal_ack_err);
  87EXPORT_SYMBOL(jbd2_journal_clear_err);
  88EXPORT_SYMBOL(jbd2_log_wait_commit);
  89EXPORT_SYMBOL(jbd2_log_start_commit);
  90EXPORT_SYMBOL(jbd2_journal_start_commit);
  91EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
  92EXPORT_SYMBOL(jbd2_journal_wipe);
  93EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
  94EXPORT_SYMBOL(jbd2_journal_invalidatepage);
  95EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
  96EXPORT_SYMBOL(jbd2_journal_force_commit);
  97EXPORT_SYMBOL(jbd2_journal_inode_add_write);
  98EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
  99EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
 100EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
 101EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 102EXPORT_SYMBOL(jbd2_inode_cache);
 103
 104static void __journal_abort_soft (journal_t *journal, int errno);
 105static int jbd2_journal_create_slab(size_t slab_size);
 106
 107#ifdef CONFIG_JBD2_DEBUG
 108void __jbd2_debug(int level, const char *file, const char *func,
 109		  unsigned int line, const char *fmt, ...)
 110{
 111	struct va_format vaf;
 112	va_list args;
 113
 114	if (level > jbd2_journal_enable_debug)
 115		return;
 116	va_start(args, fmt);
 117	vaf.fmt = fmt;
 118	vaf.va = &args;
 119	printk(KERN_DEBUG "%s: (%s, %u): %pV\n", file, func, line, &vaf);
 120	va_end(args);
 121}
 122EXPORT_SYMBOL(__jbd2_debug);
 123#endif
 124
 125/* Checksumming functions */
 126static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 127{
 128	if (!jbd2_journal_has_csum_v2or3_feature(j))
 129		return 1;
 130
 131	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
 132}
 133
 134static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
 135{
 136	__u32 csum;
 137	__be32 old_csum;
 138
 139	old_csum = sb->s_checksum;
 140	sb->s_checksum = 0;
 141	csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
 142	sb->s_checksum = old_csum;
 143
 144	return cpu_to_be32(csum);
 145}
 146
 147static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
 148{
 149	if (!jbd2_journal_has_csum_v2or3(j))
 150		return 1;
 151
 152	return sb->s_checksum == jbd2_superblock_csum(j, sb);
 153}
 154
 155static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
 156{
 157	if (!jbd2_journal_has_csum_v2or3(j))
 158		return;
 159
 160	sb->s_checksum = jbd2_superblock_csum(j, sb);
 161}
 162
 163/*
 164 * Helper function used to manage commit timeouts
 165 */
 166
 167static void commit_timeout(unsigned long __data)
 168{
 169	struct task_struct * p = (struct task_struct *) __data;
 170
 171	wake_up_process(p);
 172}
 173
 174/*
 175 * kjournald2: The main thread function used to manage a logging device
 176 * journal.
 177 *
 178 * This kernel thread is responsible for two things:
 179 *
 180 * 1) COMMIT:  Every so often we need to commit the current state of the
 181 *    filesystem to disk.  The journal thread is responsible for writing
 182 *    all of the metadata buffers to disk.
 183 *
 184 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 185 *    of the data in that part of the log has been rewritten elsewhere on
 186 *    the disk.  Flushing these old buffers to reclaim space in the log is
 187 *    known as checkpointing, and this thread is responsible for that job.
 188 */
 189
 190static int kjournald2(void *arg)
 191{
 192	journal_t *journal = arg;
 193	transaction_t *transaction;
 194
 195	/*
 196	 * Set up an interval timer which can be used to trigger a commit wakeup
 197	 * after the commit interval expires
 198	 */
 199	setup_timer(&journal->j_commit_timer, commit_timeout,
 200			(unsigned long)current);
 201
 202	set_freezable();
 203
 204	/* Record that the journal thread is running */
 205	journal->j_task = current;
 206	wake_up(&journal->j_wait_done_commit);
 207
 208	/*
 
 
 
 
 
 
 
 
 209	 * And now, wait forever for commit wakeup events.
 210	 */
 211	write_lock(&journal->j_state_lock);
 212
 213loop:
 214	if (journal->j_flags & JBD2_UNMOUNT)
 215		goto end_loop;
 216
 217	jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
 218		journal->j_commit_sequence, journal->j_commit_request);
 219
 220	if (journal->j_commit_sequence != journal->j_commit_request) {
 221		jbd_debug(1, "OK, requests differ\n");
 222		write_unlock(&journal->j_state_lock);
 223		del_timer_sync(&journal->j_commit_timer);
 224		jbd2_journal_commit_transaction(journal);
 225		write_lock(&journal->j_state_lock);
 226		goto loop;
 227	}
 228
 229	wake_up(&journal->j_wait_done_commit);
 230	if (freezing(current)) {
 231		/*
 232		 * The simpler the better. Flushing journal isn't a
 233		 * good idea, because that depends on threads that may
 234		 * be already stopped.
 235		 */
 236		jbd_debug(1, "Now suspending kjournald2\n");
 237		write_unlock(&journal->j_state_lock);
 238		try_to_freeze();
 239		write_lock(&journal->j_state_lock);
 240	} else {
 241		/*
 242		 * We assume on resume that commits are already there,
 243		 * so we don't sleep
 244		 */
 245		DEFINE_WAIT(wait);
 246		int should_sleep = 1;
 247
 248		prepare_to_wait(&journal->j_wait_commit, &wait,
 249				TASK_INTERRUPTIBLE);
 250		if (journal->j_commit_sequence != journal->j_commit_request)
 251			should_sleep = 0;
 252		transaction = journal->j_running_transaction;
 253		if (transaction && time_after_eq(jiffies,
 254						transaction->t_expires))
 255			should_sleep = 0;
 256		if (journal->j_flags & JBD2_UNMOUNT)
 257			should_sleep = 0;
 258		if (should_sleep) {
 259			write_unlock(&journal->j_state_lock);
 260			schedule();
 261			write_lock(&journal->j_state_lock);
 262		}
 263		finish_wait(&journal->j_wait_commit, &wait);
 264	}
 265
 266	jbd_debug(1, "kjournald2 wakes\n");
 267
 268	/*
 269	 * Were we woken up by a commit wakeup event?
 270	 */
 271	transaction = journal->j_running_transaction;
 272	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
 273		journal->j_commit_request = transaction->t_tid;
 274		jbd_debug(1, "woke because of timeout\n");
 275	}
 276	goto loop;
 277
 278end_loop:
 279	write_unlock(&journal->j_state_lock);
 280	del_timer_sync(&journal->j_commit_timer);
 281	journal->j_task = NULL;
 282	wake_up(&journal->j_wait_done_commit);
 283	jbd_debug(1, "Journal thread exiting.\n");
 
 284	return 0;
 285}
 286
 287static int jbd2_journal_start_thread(journal_t *journal)
 288{
 289	struct task_struct *t;
 290
 291	t = kthread_run(kjournald2, journal, "jbd2/%s",
 292			journal->j_devname);
 293	if (IS_ERR(t))
 294		return PTR_ERR(t);
 295
 296	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
 297	return 0;
 298}
 299
 300static void journal_kill_thread(journal_t *journal)
 301{
 302	write_lock(&journal->j_state_lock);
 303	journal->j_flags |= JBD2_UNMOUNT;
 304
 305	while (journal->j_task) {
 306		write_unlock(&journal->j_state_lock);
 307		wake_up(&journal->j_wait_commit);
 308		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
 309		write_lock(&journal->j_state_lock);
 310	}
 311	write_unlock(&journal->j_state_lock);
 312}
 313
 314/*
 315 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 316 *
 317 * Writes a metadata buffer to a given disk block.  The actual IO is not
 318 * performed but a new buffer_head is constructed which labels the data
 319 * to be written with the correct destination disk block.
 320 *
 321 * Any magic-number escaping which needs to be done will cause a
 322 * copy-out here.  If the buffer happens to start with the
 323 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 324 * magic number is only written to the log for descripter blocks.  In
 325 * this case, we copy the data and replace the first word with 0, and we
 326 * return a result code which indicates that this buffer needs to be
 327 * marked as an escaped buffer in the corresponding log descriptor
 328 * block.  The missing word can then be restored when the block is read
 329 * during recovery.
 330 *
 331 * If the source buffer has already been modified by a new transaction
 332 * since we took the last commit snapshot, we use the frozen copy of
 333 * that data for IO. If we end up using the existing buffer_head's data
 334 * for the write, then we have to make sure nobody modifies it while the
 335 * IO is in progress. do_get_write_access() handles this.
 336 *
 337 * The function returns a pointer to the buffer_head to be used for IO.
 338 * 
 339 *
 340 * Return value:
 341 *  <0: Error
 342 * >=0: Finished OK
 343 *
 344 * On success:
 345 * Bit 0 set == escape performed on the data
 346 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 347 */
 348
 349int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 350				  struct journal_head  *jh_in,
 351				  struct buffer_head **bh_out,
 352				  sector_t blocknr)
 353{
 354	int need_copy_out = 0;
 355	int done_copy_out = 0;
 356	int do_escape = 0;
 357	char *mapped_data;
 358	struct buffer_head *new_bh;
 359	struct page *new_page;
 360	unsigned int new_offset;
 361	struct buffer_head *bh_in = jh2bh(jh_in);
 362	journal_t *journal = transaction->t_journal;
 363
 364	/*
 365	 * The buffer really shouldn't be locked: only the current committing
 366	 * transaction is allowed to write it, so nobody else is allowed
 367	 * to do any IO.
 368	 *
 369	 * akpm: except if we're journalling data, and write() output is
 370	 * also part of a shared mapping, and another thread has
 371	 * decided to launch a writepage() against this buffer.
 372	 */
 373	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 374
 375	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 376
 377	/* keep subsequent assertions sane */
 378	atomic_set(&new_bh->b_count, 1);
 379
 380	jbd_lock_bh_state(bh_in);
 381repeat:
 382	/*
 383	 * If a new transaction has already done a buffer copy-out, then
 384	 * we use that version of the data for the commit.
 385	 */
 386	if (jh_in->b_frozen_data) {
 387		done_copy_out = 1;
 388		new_page = virt_to_page(jh_in->b_frozen_data);
 389		new_offset = offset_in_page(jh_in->b_frozen_data);
 390	} else {
 391		new_page = jh2bh(jh_in)->b_page;
 392		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
 393	}
 394
 395	mapped_data = kmap_atomic(new_page);
 396	/*
 397	 * Fire data frozen trigger if data already wasn't frozen.  Do this
 398	 * before checking for escaping, as the trigger may modify the magic
 399	 * offset.  If a copy-out happens afterwards, it will have the correct
 400	 * data in the buffer.
 401	 */
 402	if (!done_copy_out)
 403		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
 404					   jh_in->b_triggers);
 405
 406	/*
 407	 * Check for escaping
 408	 */
 409	if (*((__be32 *)(mapped_data + new_offset)) ==
 410				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
 411		need_copy_out = 1;
 412		do_escape = 1;
 413	}
 414	kunmap_atomic(mapped_data);
 415
 416	/*
 417	 * Do we need to do a data copy?
 418	 */
 419	if (need_copy_out && !done_copy_out) {
 420		char *tmp;
 421
 422		jbd_unlock_bh_state(bh_in);
 423		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
 424		if (!tmp) {
 425			brelse(new_bh);
 426			return -ENOMEM;
 427		}
 428		jbd_lock_bh_state(bh_in);
 429		if (jh_in->b_frozen_data) {
 430			jbd2_free(tmp, bh_in->b_size);
 431			goto repeat;
 432		}
 433
 434		jh_in->b_frozen_data = tmp;
 435		mapped_data = kmap_atomic(new_page);
 436		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
 437		kunmap_atomic(mapped_data);
 438
 439		new_page = virt_to_page(tmp);
 440		new_offset = offset_in_page(tmp);
 441		done_copy_out = 1;
 442
 443		/*
 444		 * This isn't strictly necessary, as we're using frozen
 445		 * data for the escaping, but it keeps consistency with
 446		 * b_frozen_data usage.
 447		 */
 448		jh_in->b_frozen_triggers = jh_in->b_triggers;
 449	}
 450
 451	/*
 452	 * Did we need to do an escaping?  Now we've done all the
 453	 * copying, we can finally do so.
 454	 */
 455	if (do_escape) {
 456		mapped_data = kmap_atomic(new_page);
 457		*((unsigned int *)(mapped_data + new_offset)) = 0;
 458		kunmap_atomic(mapped_data);
 459	}
 460
 461	set_bh_page(new_bh, new_page, new_offset);
 462	new_bh->b_size = bh_in->b_size;
 463	new_bh->b_bdev = journal->j_dev;
 464	new_bh->b_blocknr = blocknr;
 465	new_bh->b_private = bh_in;
 466	set_buffer_mapped(new_bh);
 467	set_buffer_dirty(new_bh);
 468
 469	*bh_out = new_bh;
 470
 471	/*
 472	 * The to-be-written buffer needs to get moved to the io queue,
 473	 * and the original buffer whose contents we are shadowing or
 474	 * copying is moved to the transaction's shadow queue.
 475	 */
 476	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
 477	spin_lock(&journal->j_list_lock);
 478	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
 479	spin_unlock(&journal->j_list_lock);
 480	set_buffer_shadow(bh_in);
 481	jbd_unlock_bh_state(bh_in);
 482
 483	return do_escape | (done_copy_out << 1);
 484}
 485
 486/*
 487 * Allocation code for the journal file.  Manage the space left in the
 488 * journal, so that we can begin checkpointing when appropriate.
 489 */
 490
 491/*
 492 * Called with j_state_lock locked for writing.
 493 * Returns true if a transaction commit was started.
 494 */
 495int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 496{
 497	/* Return if the txn has already requested to be committed */
 498	if (journal->j_commit_request == target)
 499		return 0;
 500
 501	/*
 502	 * The only transaction we can possibly wait upon is the
 503	 * currently running transaction (if it exists).  Otherwise,
 504	 * the target tid must be an old one.
 505	 */
 506	if (journal->j_running_transaction &&
 507	    journal->j_running_transaction->t_tid == target) {
 508		/*
 509		 * We want a new commit: OK, mark the request and wakeup the
 510		 * commit thread.  We do _not_ do the commit ourselves.
 511		 */
 512
 513		journal->j_commit_request = target;
 514		jbd_debug(1, "JBD2: requesting commit %d/%d\n",
 515			  journal->j_commit_request,
 516			  journal->j_commit_sequence);
 517		journal->j_running_transaction->t_requested = jiffies;
 518		wake_up(&journal->j_wait_commit);
 519		return 1;
 520	} else if (!tid_geq(journal->j_commit_request, target))
 521		/* This should never happen, but if it does, preserve
 522		   the evidence before kjournald goes into a loop and
 523		   increments j_commit_sequence beyond all recognition. */
 524		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
 525			  journal->j_commit_request,
 526			  journal->j_commit_sequence,
 527			  target, journal->j_running_transaction ? 
 528			  journal->j_running_transaction->t_tid : 0);
 529	return 0;
 530}
 531
 532int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 533{
 534	int ret;
 535
 536	write_lock(&journal->j_state_lock);
 537	ret = __jbd2_log_start_commit(journal, tid);
 538	write_unlock(&journal->j_state_lock);
 539	return ret;
 540}
 541
 542/*
 543 * Force and wait any uncommitted transactions.  We can only force the running
 544 * transaction if we don't have an active handle, otherwise, we will deadlock.
 545 * Returns: <0 in case of error,
 546 *           0 if nothing to commit,
 547 *           1 if transaction was successfully committed.
 548 */
 549static int __jbd2_journal_force_commit(journal_t *journal)
 550{
 551	transaction_t *transaction = NULL;
 552	tid_t tid;
 553	int need_to_start = 0, ret = 0;
 554
 555	read_lock(&journal->j_state_lock);
 556	if (journal->j_running_transaction && !current->journal_info) {
 557		transaction = journal->j_running_transaction;
 558		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
 559			need_to_start = 1;
 560	} else if (journal->j_committing_transaction)
 561		transaction = journal->j_committing_transaction;
 562
 563	if (!transaction) {
 564		/* Nothing to commit */
 565		read_unlock(&journal->j_state_lock);
 566		return 0;
 567	}
 568	tid = transaction->t_tid;
 569	read_unlock(&journal->j_state_lock);
 570	if (need_to_start)
 571		jbd2_log_start_commit(journal, tid);
 572	ret = jbd2_log_wait_commit(journal, tid);
 573	if (!ret)
 574		ret = 1;
 575
 576	return ret;
 577}
 578
 579/**
 580 * Force and wait upon a commit if the calling process is not within
 581 * transaction.  This is used for forcing out undo-protected data which contains
 582 * bitmaps, when the fs is running out of space.
 583 *
 584 * @journal: journal to force
 585 * Returns true if progress was made.
 586 */
 587int jbd2_journal_force_commit_nested(journal_t *journal)
 588{
 589	int ret;
 590
 591	ret = __jbd2_journal_force_commit(journal);
 592	return ret > 0;
 593}
 594
 595/**
 596 * int journal_force_commit() - force any uncommitted transactions
 597 * @journal: journal to force
 598 *
 599 * Caller want unconditional commit. We can only force the running transaction
 600 * if we don't have an active handle, otherwise, we will deadlock.
 601 */
 602int jbd2_journal_force_commit(journal_t *journal)
 603{
 604	int ret;
 605
 606	J_ASSERT(!current->journal_info);
 607	ret = __jbd2_journal_force_commit(journal);
 608	if (ret > 0)
 609		ret = 0;
 610	return ret;
 611}
 612
 613/*
 614 * Start a commit of the current running transaction (if any).  Returns true
 615 * if a transaction is going to be committed (or is currently already
 616 * committing), and fills its tid in at *ptid
 617 */
 618int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 619{
 620	int ret = 0;
 621
 622	write_lock(&journal->j_state_lock);
 623	if (journal->j_running_transaction) {
 624		tid_t tid = journal->j_running_transaction->t_tid;
 625
 626		__jbd2_log_start_commit(journal, tid);
 627		/* There's a running transaction and we've just made sure
 628		 * it's commit has been scheduled. */
 629		if (ptid)
 630			*ptid = tid;
 631		ret = 1;
 632	} else if (journal->j_committing_transaction) {
 633		/*
 634		 * If commit has been started, then we have to wait for
 635		 * completion of that transaction.
 636		 */
 637		if (ptid)
 638			*ptid = journal->j_committing_transaction->t_tid;
 639		ret = 1;
 640	}
 641	write_unlock(&journal->j_state_lock);
 642	return ret;
 643}
 644
 645/*
 646 * Return 1 if a given transaction has not yet sent barrier request
 647 * connected with a transaction commit. If 0 is returned, transaction
 648 * may or may not have sent the barrier. Used to avoid sending barrier
 649 * twice in common cases.
 650 */
 651int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
 652{
 653	int ret = 0;
 654	transaction_t *commit_trans;
 655
 656	if (!(journal->j_flags & JBD2_BARRIER))
 657		return 0;
 658	read_lock(&journal->j_state_lock);
 659	/* Transaction already committed? */
 660	if (tid_geq(journal->j_commit_sequence, tid))
 661		goto out;
 662	commit_trans = journal->j_committing_transaction;
 663	if (!commit_trans || commit_trans->t_tid != tid) {
 664		ret = 1;
 665		goto out;
 666	}
 667	/*
 668	 * Transaction is being committed and we already proceeded to
 669	 * submitting a flush to fs partition?
 670	 */
 671	if (journal->j_fs_dev != journal->j_dev) {
 672		if (!commit_trans->t_need_data_flush ||
 673		    commit_trans->t_state >= T_COMMIT_DFLUSH)
 674			goto out;
 675	} else {
 676		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
 677			goto out;
 678	}
 679	ret = 1;
 680out:
 681	read_unlock(&journal->j_state_lock);
 682	return ret;
 683}
 684EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
 685
 686/*
 687 * Wait for a specified commit to complete.
 688 * The caller may not hold the journal lock.
 689 */
 690int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 691{
 692	int err = 0;
 693
 694	jbd2_might_wait_for_commit(journal);
 695	read_lock(&journal->j_state_lock);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 696#ifdef CONFIG_JBD2_DEBUG
 697	if (!tid_geq(journal->j_commit_request, tid)) {
 698		printk(KERN_ERR
 699		       "%s: error: j_commit_request=%d, tid=%d\n",
 700		       __func__, journal->j_commit_request, tid);
 701	}
 702#endif
 703	while (tid_gt(tid, journal->j_commit_sequence)) {
 704		jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
 705				  tid, journal->j_commit_sequence);
 706		read_unlock(&journal->j_state_lock);
 707		wake_up(&journal->j_wait_commit);
 708		wait_event(journal->j_wait_done_commit,
 709				!tid_gt(tid, journal->j_commit_sequence));
 710		read_lock(&journal->j_state_lock);
 711	}
 712	read_unlock(&journal->j_state_lock);
 713
 714	if (unlikely(is_journal_aborted(journal)))
 715		err = -EIO;
 716	return err;
 717}
 718
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 719/*
 720 * When this function returns the transaction corresponding to tid
 721 * will be completed.  If the transaction has currently running, start
 722 * committing that transaction before waiting for it to complete.  If
 723 * the transaction id is stale, it is by definition already completed,
 724 * so just return SUCCESS.
 725 */
 726int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 727{
 728	int	need_to_wait = 1;
 729
 730	read_lock(&journal->j_state_lock);
 731	if (journal->j_running_transaction &&
 732	    journal->j_running_transaction->t_tid == tid) {
 733		if (journal->j_commit_request != tid) {
 734			/* transaction not yet started, so request it */
 735			read_unlock(&journal->j_state_lock);
 736			jbd2_log_start_commit(journal, tid);
 737			goto wait_commit;
 738		}
 739	} else if (!(journal->j_committing_transaction &&
 740		     journal->j_committing_transaction->t_tid == tid))
 741		need_to_wait = 0;
 742	read_unlock(&journal->j_state_lock);
 743	if (!need_to_wait)
 744		return 0;
 745wait_commit:
 746	return jbd2_log_wait_commit(journal, tid);
 747}
 748EXPORT_SYMBOL(jbd2_complete_transaction);
 749
 750/*
 751 * Log buffer allocation routines:
 752 */
 753
 754int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 755{
 756	unsigned long blocknr;
 757
 758	write_lock(&journal->j_state_lock);
 759	J_ASSERT(journal->j_free > 1);
 760
 761	blocknr = journal->j_head;
 762	journal->j_head++;
 763	journal->j_free--;
 764	if (journal->j_head == journal->j_last)
 765		journal->j_head = journal->j_first;
 766	write_unlock(&journal->j_state_lock);
 767	return jbd2_journal_bmap(journal, blocknr, retp);
 768}
 769
 770/*
 771 * Conversion of logical to physical block numbers for the journal
 772 *
 773 * On external journals the journal blocks are identity-mapped, so
 774 * this is a no-op.  If needed, we can use j_blk_offset - everything is
 775 * ready.
 776 */
 777int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 778		 unsigned long long *retp)
 779{
 780	int err = 0;
 781	unsigned long long ret;
 
 782
 783	if (journal->j_inode) {
 784		ret = bmap(journal->j_inode, blocknr);
 785		if (ret)
 786			*retp = ret;
 787		else {
 788			printk(KERN_ALERT "%s: journal block not found "
 789					"at offset %lu on %s\n",
 790			       __func__, blocknr, journal->j_devname);
 791			err = -EIO;
 792			__journal_abort_soft(journal, err);
 
 
 793		}
 
 794	} else {
 795		*retp = blocknr; /* +journal->j_blk_offset */
 796	}
 797	return err;
 798}
 799
 800/*
 801 * We play buffer_head aliasing tricks to write data/metadata blocks to
 802 * the journal without copying their contents, but for journal
 803 * descriptor blocks we do need to generate bona fide buffers.
 804 *
 805 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
 806 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
 807 * But we don't bother doing that, so there will be coherency problems with
 808 * mmaps of blockdevs which hold live JBD-controlled filesystems.
 809 */
 810struct buffer_head *
 811jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
 812{
 813	journal_t *journal = transaction->t_journal;
 814	struct buffer_head *bh;
 815	unsigned long long blocknr;
 816	journal_header_t *header;
 817	int err;
 818
 819	err = jbd2_journal_next_log_block(journal, &blocknr);
 820
 821	if (err)
 822		return NULL;
 823
 824	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
 825	if (!bh)
 826		return NULL;
 
 827	lock_buffer(bh);
 828	memset(bh->b_data, 0, journal->j_blocksize);
 829	header = (journal_header_t *)bh->b_data;
 830	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 831	header->h_blocktype = cpu_to_be32(type);
 832	header->h_sequence = cpu_to_be32(transaction->t_tid);
 833	set_buffer_uptodate(bh);
 834	unlock_buffer(bh);
 835	BUFFER_TRACE(bh, "return this buffer");
 836	return bh;
 837}
 838
 839void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
 840{
 841	struct jbd2_journal_block_tail *tail;
 842	__u32 csum;
 843
 844	if (!jbd2_journal_has_csum_v2or3(j))
 845		return;
 846
 847	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
 848			sizeof(struct jbd2_journal_block_tail));
 849	tail->t_checksum = 0;
 850	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 851	tail->t_checksum = cpu_to_be32(csum);
 852}
 853
 854/*
 855 * Return tid of the oldest transaction in the journal and block in the journal
 856 * where the transaction starts.
 857 *
 858 * If the journal is now empty, return which will be the next transaction ID
 859 * we will write and where will that transaction start.
 860 *
 861 * The return value is 0 if journal tail cannot be pushed any further, 1 if
 862 * it can.
 863 */
 864int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 865			      unsigned long *block)
 866{
 867	transaction_t *transaction;
 868	int ret;
 869
 870	read_lock(&journal->j_state_lock);
 871	spin_lock(&journal->j_list_lock);
 872	transaction = journal->j_checkpoint_transactions;
 873	if (transaction) {
 874		*tid = transaction->t_tid;
 875		*block = transaction->t_log_start;
 876	} else if ((transaction = journal->j_committing_transaction) != NULL) {
 877		*tid = transaction->t_tid;
 878		*block = transaction->t_log_start;
 879	} else if ((transaction = journal->j_running_transaction) != NULL) {
 880		*tid = transaction->t_tid;
 881		*block = journal->j_head;
 882	} else {
 883		*tid = journal->j_transaction_sequence;
 884		*block = journal->j_head;
 885	}
 886	ret = tid_gt(*tid, journal->j_tail_sequence);
 887	spin_unlock(&journal->j_list_lock);
 888	read_unlock(&journal->j_state_lock);
 889
 890	return ret;
 891}
 892
 893/*
 894 * Update information in journal structure and in on disk journal superblock
 895 * about log tail. This function does not check whether information passed in
 896 * really pushes log tail further. It's responsibility of the caller to make
 897 * sure provided log tail information is valid (e.g. by holding
 898 * j_checkpoint_mutex all the time between computing log tail and calling this
 899 * function as is the case with jbd2_cleanup_journal_tail()).
 900 *
 901 * Requires j_checkpoint_mutex
 902 */
 903int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 904{
 905	unsigned long freed;
 906	int ret;
 907
 908	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 909
 910	/*
 911	 * We cannot afford for write to remain in drive's caches since as
 912	 * soon as we update j_tail, next transaction can start reusing journal
 913	 * space and if we lose sb update during power failure we'd replay
 914	 * old transaction with possibly newly overwritten data.
 915	 */
 916	ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA);
 
 917	if (ret)
 918		goto out;
 919
 920	write_lock(&journal->j_state_lock);
 921	freed = block - journal->j_tail;
 922	if (block < journal->j_tail)
 923		freed += journal->j_last - journal->j_first;
 924
 925	trace_jbd2_update_log_tail(journal, tid, block, freed);
 926	jbd_debug(1,
 927		  "Cleaning journal tail from %d to %d (offset %lu), "
 928		  "freeing %lu\n",
 929		  journal->j_tail_sequence, tid, block, freed);
 930
 931	journal->j_free += freed;
 932	journal->j_tail_sequence = tid;
 933	journal->j_tail = block;
 934	write_unlock(&journal->j_state_lock);
 935
 936out:
 937	return ret;
 938}
 939
 940/*
 941 * This is a variaon of __jbd2_update_log_tail which checks for validity of
 942 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
 943 * with other threads updating log tail.
 944 */
 945void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 946{
 947	mutex_lock(&journal->j_checkpoint_mutex);
 948	if (tid_gt(tid, journal->j_tail_sequence))
 949		__jbd2_update_log_tail(journal, tid, block);
 950	mutex_unlock(&journal->j_checkpoint_mutex);
 951}
 952
  953struct jbd2_stats_proc_session {	/* read through seq->private by jbd2_seq_info_show() */
  954	journal_t *journal;	/* journal whose j_max_transaction_buffers and j_average_commit_time are printed */
  955	struct transaction_stats_s *stats;	/* statistics (ts_tid, ts_requested, run.*) that the show routine averages */
  956	int start;	/* NOTE(review): not referenced in this part of the file - confirm use */
  957	int max;	/* NOTE(review): not referenced in this part of the file - confirm use */
  958};
 959
 960static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
 961{
 962	return *pos ? NULL : SEQ_START_TOKEN;
 963}
 964
 965static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
 966{
 
 967	return NULL;
 968}
 969
 970static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 971{
 972	struct jbd2_stats_proc_session *s = seq->private;
 973
 974	if (v != SEQ_START_TOKEN)
 975		return 0;
 976	seq_printf(seq, "%lu transactions (%lu requested), "
 977		   "each up to %u blocks\n",
 978		   s->stats->ts_tid, s->stats->ts_requested,
 979		   s->journal->j_max_transaction_buffers);
 980	if (s->stats->ts_tid == 0)
 981		return 0;
 982	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
 983	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
 984	seq_printf(seq, "  %ums request delay\n",
 985	    (s->stats->ts_requested == 0) ? 0 :
 986	    jiffies_to_msecs(s->stats->run.rs_request_delay /
 987			     s->stats->ts_requested));
 988	seq_printf(seq, "  %ums running transaction\n",
 989	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
 990	seq_printf(seq, "  %ums transaction was being locked\n",
 991	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
 992	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
 993	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
 994	seq_printf(seq, "  %ums logging transaction\n",
 995	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
 996	seq_printf(seq, "  %lluus average transaction commit time\n",
 997		   div_u64(s->journal->j_average_commit_time, 1000));
 998	seq_printf(seq, "  %lu handles per transaction\n",
 999	    s->stats->run.rs_handle_count / s->stats->ts_tid);
1000	seq_printf(seq, "  %lu blocks per transaction\n",
1001	    s->stats->run.rs_blocks / s->stats->ts_tid);
1002	seq_printf(seq, "  %lu logged blocks per transaction\n",
1003	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
1004	return 0;
1005}
1006
1007static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
1008{	/* seq_file ->stop: intentionally empty, jbd2_seq_info_start() acquires nothing that would need releasing */
1009}
1010
1011static const struct seq_operations jbd2_seq_info_ops = {
1012	.start  = jbd2_seq_info_start,
1013	.next   = jbd2_seq_info_next,
1014	.stop   = jbd2_seq_info_stop,
1015	.show   = jbd2_seq_info_show,
1016};
1017
1018static int jbd2_seq_info_open(struct inode *inode, struct file *file)
1019{
1020	journal_t *journal = PDE_DATA(inode);
1021	struct jbd2_stats_proc_session *s;
1022	int rc, size;
1023
1024	s = kmalloc(sizeof(*s), GFP_KERNEL);
1025	if (s == NULL)
1026		return -ENOMEM;
1027	size = sizeof(struct transaction_stats_s);
1028	s->stats = kmalloc(size, GFP_KERNEL);
1029	if (s->stats == NULL) {
1030		kfree(s);
1031		return -ENOMEM;
1032	}
1033	spin_lock(&journal->j_history_lock);
1034	memcpy(s->stats, &journal->j_stats, size);
1035	s->journal = journal;
1036	spin_unlock(&journal->j_history_lock);
1037
1038	rc = seq_open(file, &jbd2_seq_info_ops);
1039	if (rc == 0) {
1040		struct seq_file *m = file->private_data;
1041		m->private = s;
1042	} else {
1043		kfree(s->stats);
1044		kfree(s);
1045	}
1046	return rc;
1047
1048}
1049
1050static int jbd2_seq_info_release(struct inode *inode, struct file *file)
1051{
1052	struct seq_file *seq = file->private_data;
1053	struct jbd2_stats_proc_session *s = seq->private;
1054	kfree(s->stats);
1055	kfree(s);
1056	return seq_release(inode, file);
1057}
1058
1059static const struct file_operations jbd2_seq_info_fops = {
1060	.owner		= THIS_MODULE,
1061	.open           = jbd2_seq_info_open,
1062	.read           = seq_read,
1063	.llseek         = seq_lseek,
1064	.release        = jbd2_seq_info_release,
1065};
1066
1067static struct proc_dir_entry *proc_jbd2_stats;
1068
1069static void jbd2_stats_proc_init(journal_t *journal)
1070{
1071	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
1072	if (journal->j_proc_entry) {
1073		proc_create_data("info", S_IRUGO, journal->j_proc_entry,
1074				 &jbd2_seq_info_fops, journal);
1075	}
1076}
1077
1078static void jbd2_stats_proc_exit(journal_t *journal)
1079{
1080	remove_proc_entry("info", journal->j_proc_entry);
1081	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
1082}
1083
 
 
 
 
 
 
 
 
 
 
1084/*
1085 * Management for journal control blocks: functions to create and
1086 * destroy journal_t structures, and to initialise and read existing
1087 * journal blocks from disk.  */
1088
1089/* First: create and setup a journal_t object in memory.  We initialise
1090 * very few fields yet: that has to wait until we have created the
1091 * journal structures from from scratch, or loaded them from disk. */
1092
1093static journal_t *journal_init_common(struct block_device *bdev,
1094			struct block_device *fs_dev,
1095			unsigned long long start, int len, int blocksize)
1096{
1097	static struct lock_class_key jbd2_trans_commit_key;
1098	journal_t *journal;
1099	int err;
1100	struct buffer_head *bh;
1101	int n;
1102
1103	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
1104	if (!journal)
1105		return NULL;
1106
1107	init_waitqueue_head(&journal->j_wait_transaction_locked);
1108	init_waitqueue_head(&journal->j_wait_done_commit);
1109	init_waitqueue_head(&journal->j_wait_commit);
1110	init_waitqueue_head(&journal->j_wait_updates);
1111	init_waitqueue_head(&journal->j_wait_reserved);
 
1112	mutex_init(&journal->j_barrier);
1113	mutex_init(&journal->j_checkpoint_mutex);
1114	spin_lock_init(&journal->j_revoke_lock);
1115	spin_lock_init(&journal->j_list_lock);
1116	rwlock_init(&journal->j_state_lock);
1117
1118	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1119	journal->j_min_batch_time = 0;
1120	journal->j_max_batch_time = 15000; /* 15ms */
1121	atomic_set(&journal->j_reserved_credits, 0);
1122
1123	/* The journal is marked for error until we succeed with recovery! */
1124	journal->j_flags = JBD2_ABORT;
1125
1126	/* Set up a default-sized revoke table for the new mount. */
1127	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
1128	if (err)
1129		goto err_cleanup;
1130
1131	spin_lock_init(&journal->j_history_lock);
1132
1133	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
1134			 &jbd2_trans_commit_key, 0);
1135
1136	/* journal descriptor can store up to n blocks -bzzz */
1137	journal->j_blocksize = blocksize;
1138	journal->j_dev = bdev;
1139	journal->j_fs_dev = fs_dev;
1140	journal->j_blk_offset = start;
1141	journal->j_maxlen = len;
1142	n = journal->j_blocksize / sizeof(journal_block_tag_t);
 
1143	journal->j_wbufsize = n;
1144	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
1145					GFP_KERNEL);
1146	if (!journal->j_wbuf)
1147		goto err_cleanup;
1148
1149	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
1150	if (!bh) {
1151		pr_err("%s: Cannot get buffer for journal superblock\n",
1152			__func__);
1153		goto err_cleanup;
1154	}
1155	journal->j_sb_buffer = bh;
1156	journal->j_superblock = (journal_superblock_t *)bh->b_data;
1157
1158	return journal;
1159
1160err_cleanup:
1161	kfree(journal->j_wbuf);
1162	jbd2_journal_destroy_revoke(journal);
1163	kfree(journal);
1164	return NULL;
1165}
1166
1167/* jbd2_journal_init_dev and jbd2_journal_init_inode:
1168 *
1169 * Create a journal structure assigned some fixed set of disk blocks to
1170 * the journal.  We don't actually touch those disk blocks yet, but we
1171 * need to set up all of the mapping information to tell the journaling
1172 * system where the journal blocks are.
1173 *
1174 */
1175
1176/**
1177 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
1178 *  @bdev: Block device on which to create the journal
1179 *  @fs_dev: Device which hold journalled filesystem for this journal.
1180 *  @start: Block nr Start of journal.
1181 *  @len:  Length of the journal in blocks.
1182 *  @blocksize: blocksize of journalling device
1183 *
1184 *  Returns: a newly created journal_t *
1185 *
1186 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
1187 *  range of blocks on an arbitrary block device.
1188 *
1189 */
1190journal_t *jbd2_journal_init_dev(struct block_device *bdev,
1191			struct block_device *fs_dev,
1192			unsigned long long start, int len, int blocksize)
1193{
1194	journal_t *journal;
1195
1196	journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
1197	if (!journal)
1198		return NULL;
1199
1200	bdevname(journal->j_dev, journal->j_devname);
1201	strreplace(journal->j_devname, '/', '!');
1202	jbd2_stats_proc_init(journal);
1203
1204	return journal;
1205}
1206
1207/**
1208 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
1209 *  @inode: An inode to create the journal in
1210 *
1211 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
1212 * the journal.  The inode must exist already, must support bmap() and
1213 * must have all data blocks preallocated.
1214 */
1215journal_t *jbd2_journal_init_inode(struct inode *inode)
1216{
1217	journal_t *journal;
 
1218	char *p;
1219	unsigned long long blocknr;
 
 
 
1220
1221	blocknr = bmap(inode, 0);
1222	if (!blocknr) {
1223		pr_err("%s: Cannot locate journal superblock\n",
1224			__func__);
1225		return NULL;
1226	}
1227
1228	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
1229		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
1230		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
1231
1232	journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
1233			blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
1234			inode->i_sb->s_blocksize);
1235	if (!journal)
1236		return NULL;
1237
1238	journal->j_inode = inode;
1239	bdevname(journal->j_dev, journal->j_devname);
1240	p = strreplace(journal->j_devname, '/', '!');
1241	sprintf(p, "-%lu", journal->j_inode->i_ino);
1242	jbd2_stats_proc_init(journal);
1243
1244	return journal;
1245}
1246
1247/*
1248 * If the journal init or create aborts, we need to mark the journal
1249 * superblock as being NULL to prevent the journal destroy from writing
1250 * back a bogus superblock.
1251 */
1252static void journal_fail_superblock (journal_t *journal)
1253{
1254	struct buffer_head *bh = journal->j_sb_buffer;
1255	brelse(bh);
1256	journal->j_sb_buffer = NULL;
1257}
1258
1259/*
1260 * Given a journal_t structure, initialise the various fields for
1261 * startup of a new journaling session.  We use this both when creating
1262 * a journal, and after recovering an old journal to reset it for
1263 * subsequent use.
1264 */
1265
1266static int journal_reset(journal_t *journal)
1267{
1268	journal_superblock_t *sb = journal->j_superblock;
1269	unsigned long long first, last;
1270
1271	first = be32_to_cpu(sb->s_first);
1272	last = be32_to_cpu(sb->s_maxlen);
1273	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1274		printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
1275		       first, last);
1276		journal_fail_superblock(journal);
1277		return -EINVAL;
1278	}
1279
1280	journal->j_first = first;
1281	journal->j_last = last;
1282
1283	journal->j_head = first;
1284	journal->j_tail = first;
1285	journal->j_free = last - first;
1286
1287	journal->j_tail_sequence = journal->j_transaction_sequence;
1288	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
1289	journal->j_commit_request = journal->j_commit_sequence;
1290
1291	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1292
1293	/*
1294	 * As a special case, if the on-disk copy is already marked as needing
1295	 * no recovery (s_start == 0), then we can safely defer the superblock
1296	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
1297	 * attempting a write to a potential-readonly device.
1298	 */
1299	if (sb->s_start == 0) {
1300		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1301			"(start %ld, seq %d, errno %d)\n",
1302			journal->j_tail, journal->j_tail_sequence,
1303			journal->j_errno);
1304		journal->j_flags |= JBD2_FLUSHED;
1305	} else {
1306		/* Lock here to make assertions happy... */
1307		mutex_lock(&journal->j_checkpoint_mutex);
1308		/*
1309		 * Update log tail information. We use REQ_FUA since new
1310		 * transaction will start reusing journal space and so we
1311		 * must make sure information about current log tail is on
1312		 * disk before that.
1313		 */
1314		jbd2_journal_update_sb_log_tail(journal,
1315						journal->j_tail_sequence,
1316						journal->j_tail,
1317						REQ_FUA);
1318		mutex_unlock(&journal->j_checkpoint_mutex);
1319	}
1320	return jbd2_journal_start_thread(journal);
1321}
1322
 
 
 
 
1323static int jbd2_write_superblock(journal_t *journal, int write_flags)
1324{
1325	struct buffer_head *bh = journal->j_sb_buffer;
1326	journal_superblock_t *sb = journal->j_superblock;
1327	int ret;
1328
 
 
 
 
 
 
1329	trace_jbd2_write_superblock(journal, write_flags);
1330	if (!(journal->j_flags & JBD2_BARRIER))
1331		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
1332	lock_buffer(bh);
1333	if (buffer_write_io_error(bh)) {
1334		/*
1335		 * Oh, dear.  A previous attempt to write the journal
1336		 * superblock failed.  This could happen because the
1337		 * USB device was yanked out.  Or it could happen to
1338		 * be a transient write error and maybe the block will
1339		 * be remapped.  Nothing we can do but to retry the
1340		 * write and hope for the best.
1341		 */
1342		printk(KERN_ERR "JBD2: previous I/O error detected "
1343		       "for journal superblock update for %s.\n",
1344		       journal->j_devname);
1345		clear_buffer_write_io_error(bh);
1346		set_buffer_uptodate(bh);
1347	}
1348	jbd2_superblock_csum_set(journal, sb);
 
1349	get_bh(bh);
1350	bh->b_end_io = end_buffer_write_sync;
1351	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
1352	wait_on_buffer(bh);
1353	if (buffer_write_io_error(bh)) {
1354		clear_buffer_write_io_error(bh);
1355		set_buffer_uptodate(bh);
1356		ret = -EIO;
1357	}
1358	if (ret) {
1359		printk(KERN_ERR "JBD2: Error %d detected when updating "
1360		       "journal superblock for %s.\n", ret,
1361		       journal->j_devname);
1362		jbd2_journal_abort(journal, ret);
 
1363	}
1364
1365	return ret;
1366}
1367
1368/**
1369 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1370 * @journal: The journal to update.
1371 * @tail_tid: TID of the new transaction at the tail of the log
1372 * @tail_block: The first block of the transaction at the tail of the log
1373 * @write_op: With which operation should we write the journal sb
1374 *
1375 * Update a journal's superblock information about log tail and write it to
1376 * disk, waiting for the IO to complete.
1377 */
1378int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1379				     unsigned long tail_block, int write_op)
1380{
1381	journal_superblock_t *sb = journal->j_superblock;
1382	int ret;
1383
 
 
 
1384	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1385	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1386		  tail_block, tail_tid);
1387
 
1388	sb->s_sequence = cpu_to_be32(tail_tid);
1389	sb->s_start    = cpu_to_be32(tail_block);
1390
1391	ret = jbd2_write_superblock(journal, write_op);
1392	if (ret)
1393		goto out;
1394
1395	/* Log is no longer empty */
1396	write_lock(&journal->j_state_lock);
1397	WARN_ON(!sb->s_sequence);
1398	journal->j_flags &= ~JBD2_FLUSHED;
1399	write_unlock(&journal->j_state_lock);
1400
1401out:
1402	return ret;
1403}
1404
1405/**
1406 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1407 * @journal: The journal to update.
1408 * @write_op: With which operation should we write the journal sb
1409 *
1410 * Update a journal's dynamic superblock fields to show that journal is empty.
1411 * Write updated superblock to disk waiting for IO to complete.
1412 */
1413static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
1414{
1415	journal_superblock_t *sb = journal->j_superblock;
1416
1417	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1418	read_lock(&journal->j_state_lock);
1419	/* Is it already empty? */
1420	if (sb->s_start == 0) {
1421		read_unlock(&journal->j_state_lock);
1422		return;
1423	}
1424	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
 
1425		  journal->j_tail_sequence);
1426
1427	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1428	sb->s_start    = cpu_to_be32(0);
1429	read_unlock(&journal->j_state_lock);
1430
1431	jbd2_write_superblock(journal, write_op);
1432
1433	/* Log is no longer empty */
1434	write_lock(&journal->j_state_lock);
1435	journal->j_flags |= JBD2_FLUSHED;
1436	write_unlock(&journal->j_state_lock);
1437}
1438
1439
1440/**
1441 * jbd2_journal_update_sb_errno() - Update error in the journal.
1442 * @journal: The journal to update.
1443 *
1444 * Update a journal's errno.  Write updated superblock to disk waiting for IO
1445 * to complete.
1446 */
1447void jbd2_journal_update_sb_errno(journal_t *journal)
1448{
1449	journal_superblock_t *sb = journal->j_superblock;
 
1450
1451	read_lock(&journal->j_state_lock);
1452	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
1453		  journal->j_errno);
1454	sb->s_errno    = cpu_to_be32(journal->j_errno);
1455	read_unlock(&journal->j_state_lock);
 
1456
1457	jbd2_write_superblock(journal, REQ_FUA);
1458}
1459EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
1460
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1461/*
1462 * Read the superblock for a given journal, performing initial
1463 * validation of the format.
1464 */
1465static int journal_get_superblock(journal_t *journal)
1466{
1467	struct buffer_head *bh;
1468	journal_superblock_t *sb;
1469	int err = -EIO;
1470
1471	bh = journal->j_sb_buffer;
1472
1473	J_ASSERT(bh != NULL);
1474	if (!buffer_uptodate(bh)) {
1475		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1476		wait_on_buffer(bh);
1477		if (!buffer_uptodate(bh)) {
1478			printk(KERN_ERR
1479				"JBD2: IO error reading journal superblock\n");
1480			goto out;
1481		}
1482	}
1483
1484	if (buffer_verified(bh))
1485		return 0;
1486
1487	sb = journal->j_superblock;
1488
1489	err = -EINVAL;
1490
1491	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1492	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1493		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
1494		goto out;
1495	}
1496
1497	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1498	case JBD2_SUPERBLOCK_V1:
1499		journal->j_format_version = 1;
1500		break;
1501	case JBD2_SUPERBLOCK_V2:
1502		journal->j_format_version = 2;
1503		break;
1504	default:
1505		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
1506		goto out;
1507	}
1508
1509	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1510		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1511	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1512		printk(KERN_WARNING "JBD2: journal file too short\n");
1513		goto out;
1514	}
1515
1516	if (be32_to_cpu(sb->s_first) == 0 ||
1517	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1518		printk(KERN_WARNING
1519			"JBD2: Invalid start block of journal: %u\n",
1520			be32_to_cpu(sb->s_first));
1521		goto out;
1522	}
1523
1524	if (jbd2_has_feature_csum2(journal) &&
1525	    jbd2_has_feature_csum3(journal)) {
1526		/* Can't have checksum v2 and v3 at the same time! */
1527		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
1528		       "at the same time!\n");
1529		goto out;
1530	}
1531
1532	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
1533	    jbd2_has_feature_checksum(journal)) {
1534		/* Can't have checksum v1 and v2 on at the same time! */
1535		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
1536		       "at the same time!\n");
1537		goto out;
1538	}
1539
1540	if (!jbd2_verify_csum_type(journal, sb)) {
1541		printk(KERN_ERR "JBD2: Unknown checksum type\n");
1542		goto out;
1543	}
1544
1545	/* Load the checksum driver */
1546	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
1547		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1548		if (IS_ERR(journal->j_chksum_driver)) {
1549			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1550			err = PTR_ERR(journal->j_chksum_driver);
1551			journal->j_chksum_driver = NULL;
1552			goto out;
1553		}
1554	}
1555
1556	/* Check superblock checksum */
1557	if (!jbd2_superblock_csum_verify(journal, sb)) {
1558		printk(KERN_ERR "JBD2: journal checksum error\n");
1559		err = -EFSBADCRC;
1560		goto out;
1561	}
 
1562
1563	/* Precompute checksum seed for all metadata */
1564	if (jbd2_journal_has_csum_v2or3(journal))
1565		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1566						   sizeof(sb->s_uuid));
 
1567
 
 
1568	set_buffer_verified(bh);
1569
1570	return 0;
1571
1572out:
1573	journal_fail_superblock(journal);
1574	return err;
1575}
1576
1577/*
1578 * Load the on-disk journal superblock and read the key fields into the
1579 * journal_t.
1580 */
1581
1582static int load_superblock(journal_t *journal)
1583{
1584	int err;
1585	journal_superblock_t *sb;
1586
1587	err = journal_get_superblock(journal);
1588	if (err)
1589		return err;
1590
1591	sb = journal->j_superblock;
1592
1593	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1594	journal->j_tail = be32_to_cpu(sb->s_start);
1595	journal->j_first = be32_to_cpu(sb->s_first);
1596	journal->j_last = be32_to_cpu(sb->s_maxlen);
1597	journal->j_errno = be32_to_cpu(sb->s_errno);
1598
1599	return 0;
1600}
1601
1602
1603/**
1604 * int jbd2_journal_load() - Read journal from disk.
1605 * @journal: Journal to act on.
1606 *
1607 * Given a journal_t structure which tells us which disk blocks contain
1608 * a journal, read the journal from disk to initialise the in-memory
1609 * structures.
1610 */
1611int jbd2_journal_load(journal_t *journal)
1612{
1613	int err;
1614	journal_superblock_t *sb;
1615
1616	err = load_superblock(journal);
1617	if (err)
1618		return err;
1619
1620	sb = journal->j_superblock;
1621	/* If this is a V2 superblock, then we have to check the
1622	 * features flags on it. */
1623
1624	if (journal->j_format_version >= 2) {
1625		if ((sb->s_feature_ro_compat &
1626		     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1627		    (sb->s_feature_incompat &
1628		     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1629			printk(KERN_WARNING
1630				"JBD2: Unrecognised features on journal\n");
1631			return -EINVAL;
1632		}
1633	}
1634
1635	/*
1636	 * Create a slab for this blocksize
1637	 */
1638	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1639	if (err)
1640		return err;
1641
1642	/* Let the recovery code check whether it needs to recover any
1643	 * data from the journal. */
1644	if (jbd2_journal_recover(journal))
1645		goto recovery_error;
1646
1647	if (journal->j_failed_commit) {
1648		printk(KERN_ERR "JBD2: journal transaction %u on %s "
1649		       "is corrupt.\n", journal->j_failed_commit,
1650		       journal->j_devname);
1651		return -EFSCORRUPTED;
1652	}
 
 
 
 
 
1653
1654	/* OK, we've finished with the dynamic journal bits:
1655	 * reinitialise the dynamic contents of the superblock in memory
1656	 * and reset them on disk. */
1657	if (journal_reset(journal))
1658		goto recovery_error;
1659
1660	journal->j_flags &= ~JBD2_ABORT;
1661	journal->j_flags |= JBD2_LOADED;
1662	return 0;
1663
1664recovery_error:
1665	printk(KERN_WARNING "JBD2: recovery failed\n");
1666	return -EIO;
1667}
1668
1669/**
1670 * void jbd2_journal_destroy() - Release a journal_t structure.
1671 * @journal: Journal to act on.
1672 *
1673 * Release a journal_t structure once it is no longer in use by the
1674 * journaled object.
1675 * Return <0 if we couldn't clean up the journal.
1676 */
1677int jbd2_journal_destroy(journal_t *journal)
1678{
1679	int err = 0;
1680
1681	/* Wait for the commit thread to wake up and die. */
1682	journal_kill_thread(journal);
1683
1684	/* Force a final log commit */
1685	if (journal->j_running_transaction)
1686		jbd2_journal_commit_transaction(journal);
1687
1688	/* Force any old transactions to disk */
1689
1690	/* Totally anal locking here... */
1691	spin_lock(&journal->j_list_lock);
1692	while (journal->j_checkpoint_transactions != NULL) {
1693		spin_unlock(&journal->j_list_lock);
1694		mutex_lock(&journal->j_checkpoint_mutex);
1695		err = jbd2_log_do_checkpoint(journal);
1696		mutex_unlock(&journal->j_checkpoint_mutex);
1697		/*
1698		 * If checkpointing failed, just free the buffers to avoid
1699		 * looping forever
1700		 */
1701		if (err) {
1702			jbd2_journal_destroy_checkpoint(journal);
1703			spin_lock(&journal->j_list_lock);
1704			break;
1705		}
1706		spin_lock(&journal->j_list_lock);
1707	}
1708
1709	J_ASSERT(journal->j_running_transaction == NULL);
1710	J_ASSERT(journal->j_committing_transaction == NULL);
1711	J_ASSERT(journal->j_checkpoint_transactions == NULL);
1712	spin_unlock(&journal->j_list_lock);
1713
1714	if (journal->j_sb_buffer) {
1715		if (!is_journal_aborted(journal)) {
1716			mutex_lock(&journal->j_checkpoint_mutex);
1717
1718			write_lock(&journal->j_state_lock);
1719			journal->j_tail_sequence =
1720				++journal->j_transaction_sequence;
1721			write_unlock(&journal->j_state_lock);
1722
1723			jbd2_mark_journal_empty(journal,
1724					REQ_PREFLUSH | REQ_FUA);
1725			mutex_unlock(&journal->j_checkpoint_mutex);
1726		} else
1727			err = -EIO;
1728		brelse(journal->j_sb_buffer);
1729	}
1730
1731	if (journal->j_proc_entry)
1732		jbd2_stats_proc_exit(journal);
1733	iput(journal->j_inode);
1734	if (journal->j_revoke)
1735		jbd2_journal_destroy_revoke(journal);
1736	if (journal->j_chksum_driver)
1737		crypto_free_shash(journal->j_chksum_driver);
1738	kfree(journal->j_wbuf);
1739	kfree(journal);
1740
1741	return err;
1742}
1743
1744
1745/**
1746 *int jbd2_journal_check_used_features () - Check if features specified are used.
1747 * @journal: Journal to check.
1748 * @compat: bitmask of compatible features
1749 * @ro: bitmask of features that force read-only mount
1750 * @incompat: bitmask of incompatible features
1751 *
1752 * Check whether the journal uses all of a given set of
1753 * features.  Return true (non-zero) if it does.
1754 **/
1755
1756int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
1757				 unsigned long ro, unsigned long incompat)
1758{
1759	journal_superblock_t *sb;
1760
1761	if (!compat && !ro && !incompat)
1762		return 1;
1763	/* Load journal superblock if it is not loaded yet. */
1764	if (journal->j_format_version == 0 &&
1765	    journal_get_superblock(journal) != 0)
1766		return 0;
1767	if (journal->j_format_version == 1)
1768		return 0;
1769
1770	sb = journal->j_superblock;
1771
1772	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1773	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1774	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1775		return 1;
1776
1777	return 0;
1778}
1779
1780/**
1781 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1782 * @journal: Journal to check.
1783 * @compat: bitmask of compatible features
1784 * @ro: bitmask of features that force read-only mount
1785 * @incompat: bitmask of incompatible features
1786 *
1787 * Check whether the journaling code supports the use of
1788 * all of a given set of features on this journal.  Return true
1789 * (non-zero) if it can. */
1790
1791int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
1792				      unsigned long ro, unsigned long incompat)
1793{
1794	if (!compat && !ro && !incompat)
1795		return 1;
1796
1797	/* We can support any known requested features iff the
1798	 * superblock is in version 2.  Otherwise we fail to support any
1799	 * extended sb features. */
1800
1801	if (journal->j_format_version != 2)
1802		return 0;
1803
1804	if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1805	    (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1806	    (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1807		return 1;
1808
1809	return 0;
1810}
1811
1812/**
1813 * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
1814 * @journal: Journal to act on.
1815 * @compat: bitmask of compatible features
1816 * @ro: bitmask of features that force read-only mount
1817 * @incompat: bitmask of incompatible features
1818 *
1819 * Mark a given journal feature as present on the
1820 * superblock.  Returns true if the requested features could be set.
1821 *
1822 */
1823
1824int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1825			  unsigned long ro, unsigned long incompat)
1826{
1827#define INCOMPAT_FEATURE_ON(f) \
1828		((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
1829#define COMPAT_FEATURE_ON(f) \
1830		((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
1831	journal_superblock_t *sb;
1832
1833	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1834		return 1;
1835
1836	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1837		return 0;
1838
1839	/* If enabling v2 checksums, turn on v3 instead */
1840	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1841		incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1842		incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1843	}
1844
1845	/* Asking for checksumming v3 and v1?  Only give them v3. */
1846	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1847	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1848		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1849
1850	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1851		  compat, ro, incompat);
1852
1853	sb = journal->j_superblock;
1854
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1855	/* If enabling v3 checksums, update superblock */
1856	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1857		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1858		sb->s_feature_compat &=
1859			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
1860
1861		/* Load the checksum driver */
1862		if (journal->j_chksum_driver == NULL) {
1863			journal->j_chksum_driver = crypto_alloc_shash("crc32c",
1864								      0, 0);
1865			if (IS_ERR(journal->j_chksum_driver)) {
1866				printk(KERN_ERR "JBD2: Cannot load crc32c "
1867				       "driver.\n");
1868				journal->j_chksum_driver = NULL;
1869				return 0;
1870			}
1871
1872			/* Precompute checksum seed for all metadata */
1873			journal->j_csum_seed = jbd2_chksum(journal, ~0,
1874							   sb->s_uuid,
1875							   sizeof(sb->s_uuid));
1876		}
1877	}
1878
1879	/* If enabling v1 checksums, downgrade superblock */
1880	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1881		sb->s_feature_incompat &=
1882			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1883				     JBD2_FEATURE_INCOMPAT_CSUM_V3);
1884
1885	sb->s_feature_compat    |= cpu_to_be32(compat);
1886	sb->s_feature_ro_compat |= cpu_to_be32(ro);
1887	sb->s_feature_incompat  |= cpu_to_be32(incompat);
 
 
 
1888
1889	return 1;
1890#undef COMPAT_FEATURE_ON
1891#undef INCOMPAT_FEATURE_ON
1892}
1893
1894/*
1895 * jbd2_journal_clear_features () - Clear a given journal feature in the
1896 * 				    superblock
1897 * @journal: Journal to act on.
1898 * @compat: bitmask of compatible features
1899 * @ro: bitmask of features that force read-only mount
1900 * @incompat: bitmask of incompatible features
1901 *
1902 * Clear a given journal feature as present on the
1903 * superblock.
1904 */
1905void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1906				unsigned long ro, unsigned long incompat)
1907{
1908	journal_superblock_t *sb;
1909
1910	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1911		  compat, ro, incompat);
1912
1913	sb = journal->j_superblock;
1914
1915	sb->s_feature_compat    &= ~cpu_to_be32(compat);
1916	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1917	sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
 
 
1918}
1919EXPORT_SYMBOL(jbd2_journal_clear_features);
1920
/**
 * int jbd2_journal_flush () - Flush journal
 * @journal: Journal to act on.
 *
 * Flush all data for a given journal to disk and empty the journal.
 * Filesystems can use this when remounting readonly to ensure that
 * recovery does not need to happen on remount.
 *
 * Returns 0 on success, -EIO if the journal is found aborted once the
 * checkpoint loop has finished, the error reported by
 * jbd2_log_do_checkpoint(), or a negative error from
 * jbd2_cleanup_journal_tail().
 */

int jbd2_journal_flush(journal_t *journal)
{
	int err = 0;
	transaction_t *transaction = NULL;

	write_lock(&journal->j_state_lock);

	/* Force everything buffered to the log... */
	if (journal->j_running_transaction) {
		transaction = journal->j_running_transaction;
		__jbd2_log_start_commit(journal, transaction->t_tid);
	} else if (journal->j_committing_transaction)
		transaction = journal->j_committing_transaction;

	/* Wait for the log commit to complete... */
	if (transaction) {
		/*
		 * Sample the tid while j_state_lock is still held; the
		 * wait itself runs with the lock dropped.
		 */
		tid_t tid = transaction->t_tid;

		write_unlock(&journal->j_state_lock);
		jbd2_log_wait_commit(journal, tid);
	} else {
		write_unlock(&journal->j_state_lock);
	}

	/* ...and flush everything in the log out to disk. */
	/*
	 * The checkpoint list head is only examined under j_list_lock.
	 * That lock is released around each jbd2_log_do_checkpoint()
	 * call, which runs under j_checkpoint_mutex instead.  The loop
	 * stops on the first checkpoint error.
	 */
	spin_lock(&journal->j_list_lock);
	while (!err && journal->j_checkpoint_transactions != NULL) {
		spin_unlock(&journal->j_list_lock);
		mutex_lock(&journal->j_checkpoint_mutex);
		err = jbd2_log_do_checkpoint(journal);
		mutex_unlock(&journal->j_checkpoint_mutex);
		spin_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (is_journal_aborted(journal))
		return -EIO;

	mutex_lock(&journal->j_checkpoint_mutex);
	if (!err) {
		err = jbd2_cleanup_journal_tail(journal);
		if (err < 0) {
			mutex_unlock(&journal->j_checkpoint_mutex);
			goto out;
		}
		/* A non-negative result is not reported as an error. */
		err = 0;
	}

	/* Finally, mark the journal as really needing no recovery.
	 * This sets s_start==0 in the underlying superblock, which is
	 * the magic code for a fully-recovered superblock.  Any future
	 * commits of data to the journal will restore the current
	 * s_start value. */
	jbd2_mark_journal_empty(journal, REQ_FUA);
	mutex_unlock(&journal->j_checkpoint_mutex);
	/* Sanity-check, under j_state_lock, that the journal really is empty. */
	write_lock(&journal->j_state_lock);
	J_ASSERT(!journal->j_running_transaction);
	J_ASSERT(!journal->j_committing_transaction);
	J_ASSERT(!journal->j_checkpoint_transactions);
	J_ASSERT(journal->j_head == journal->j_tail);
	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
	write_unlock(&journal->j_state_lock);
out:
	return err;
}
1995
1996/**
1997 * int jbd2_journal_wipe() - Wipe journal contents
1998 * @journal: Journal to act on.
1999 * @write: flag (see below)
2000 *
2001 * Wipe out all of the contents of a journal, safely.  This will produce
2002 * a warning if the journal contains any valid recovery information.
2003 * Must be called between journal_init_*() and jbd2_journal_load().
2004 *
2005 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
2006 * we merely suppress recovery.
2007 */
2008
2009int jbd2_journal_wipe(journal_t *journal, int write)
2010{
2011	int err = 0;
2012
2013	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
2014
2015	err = load_superblock(journal);
2016	if (err)
2017		return err;
2018
2019	if (!journal->j_tail)
2020		goto no_recovery;
2021
2022	printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
2023		write ? "Clearing" : "Ignoring");
2024
2025	err = jbd2_journal_skip_recovery(journal);
2026	if (write) {
2027		/* Lock to make assertions happy... */
2028		mutex_lock(&journal->j_checkpoint_mutex);
2029		jbd2_mark_journal_empty(journal, REQ_FUA);
2030		mutex_unlock(&journal->j_checkpoint_mutex);
2031	}
2032
2033 no_recovery:
2034	return err;
2035}
2036
/*
 * Journal abort has very specific semantics, which are described in
 * the comment above jbd2_journal_abort().
 *
 * The two internal functions that provide abort to the jbd layer
 * itself are defined here.
 */
2044
2045/*
2046 * Quick version for internal journal use (doesn't lock the journal).
2047 * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
2048 * and don't attempt to make any other journal updates.
2049 */
2050void __jbd2_journal_abort_hard(journal_t *journal)
2051{
2052	transaction_t *transaction;
2053
2054	if (journal->j_flags & JBD2_ABORT)
2055		return;
2056
2057	printk(KERN_ERR "Aborting journal on device %s.\n",
2058	       journal->j_devname);
2059
2060	write_lock(&journal->j_state_lock);
2061	journal->j_flags |= JBD2_ABORT;
2062	transaction = journal->j_running_transaction;
2063	if (transaction)
2064		__jbd2_log_start_commit(journal, transaction->t_tid);
2065	write_unlock(&journal->j_state_lock);
2066}
2067
2068/* Soft abort: record the abort error status in the journal superblock,
2069 * but don't do any other IO. */
2070static void __journal_abort_soft (journal_t *journal, int errno)
2071{
2072	if (journal->j_flags & JBD2_ABORT)
2073		return;
2074
2075	if (!journal->j_errno)
2076		journal->j_errno = errno;
2077
2078	__jbd2_journal_abort_hard(journal);
2079
2080	if (errno) {
2081		jbd2_journal_update_sb_errno(journal);
2082		write_lock(&journal->j_state_lock);
2083		journal->j_flags |= JBD2_REC_ERR;
2084		write_unlock(&journal->j_state_lock);
2085	}
2086}
2087
2088/**
2089 * void jbd2_journal_abort () - Shutdown the journal immediately.
2090 * @journal: the journal to shutdown.
2091 * @errno:   an error number to record in the journal indicating
2092 *           the reason for the shutdown.
2093 *
2094 * Perform a complete, immediate shutdown of the ENTIRE
2095 * journal (not of a single transaction).  This operation cannot be
2096 * undone without closing and reopening the journal.
2097 *
2098 * The jbd2_journal_abort function is intended to support higher level error
2099 * recovery mechanisms such as the ext2/ext3 remount-readonly error
2100 * mode.
2101 *
2102 * Journal abort has very specific semantics.  Any existing dirty,
2103 * unjournaled buffers in the main filesystem will still be written to
2104 * disk by bdflush, but the journaling mechanism will be suspended
2105 * immediately and no further transaction commits will be honoured.
2106 *
2107 * Any dirty, journaled buffers will be written back to disk without
2108 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
2109 * filesystem, but we _do_ attempt to leave as much data as possible
2110 * behind for fsck to use for cleanup.
2111 *
2112 * Any attempt to get a new transaction handle on a journal which is in
2113 * ABORT state will just result in an -EROFS error return.  A
2114 * jbd2_journal_stop on an existing handle will return -EIO if we have
2115 * entered abort state during the update.
2116 *
2117 * Recursive transactions are not disturbed by journal abort until the
2118 * final jbd2_journal_stop, which will receive the -EIO error.
2119 *
2120 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
2121 * which will be recorded (if possible) in the journal superblock.  This
2122 * allows a client to record failure conditions in the middle of a
2123 * transaction without having to complete the transaction to record the
2124 * failure to disk.  ext3_error, for example, now uses this
2125 * functionality.
2126 *
2127 * Errors which originate from within the journaling layer will NOT
2128 * supply an errno; a null errno implies that absolutely no further
2129 * writes are done to the journal (unless there are any already in
2130 * progress).
2131 *
2132 */
2133
void jbd2_journal_abort(journal_t *journal, int errno)
{
	/* Thin entry point: the abort work is done by __journal_abort_soft(). */
	__journal_abort_soft(journal, errno);
}
2138
2139/**
2140 * int jbd2_journal_errno () - returns the journal's error state.
2141 * @journal: journal to examine.
2142 *
2143 * This is the errno number set with jbd2_journal_abort(), the last
2144 * time the journal was mounted - if the journal was stopped
2145 * without calling abort this will be 0.
2146 *
2147 * If the journal has been aborted on this mount time -EROFS will
2148 * be returned.
2149 */
2150int jbd2_journal_errno(journal_t *journal)
2151{
2152	int err;
2153
2154	read_lock(&journal->j_state_lock);
2155	if (journal->j_flags & JBD2_ABORT)
2156		err = -EROFS;
2157	else
2158		err = journal->j_errno;
2159	read_unlock(&journal->j_state_lock);
2160	return err;
2161}
2162
2163/**
2164 * int jbd2_journal_clear_err () - clears the journal's error state
2165 * @journal: journal to act on.
2166 *
2167 * An error must be cleared or acked to take a FS out of readonly
2168 * mode.
2169 */
2170int jbd2_journal_clear_err(journal_t *journal)
2171{
2172	int err = 0;
2173
2174	write_lock(&journal->j_state_lock);
2175	if (journal->j_flags & JBD2_ABORT)
2176		err = -EROFS;
2177	else
2178		journal->j_errno = 0;
2179	write_unlock(&journal->j_state_lock);
2180	return err;
2181}
2182
2183/**
2184 * void jbd2_journal_ack_err() - Ack journal err.
2185 * @journal: journal to act on.
2186 *
2187 * An error must be cleared or acked to take a FS out of readonly
2188 * mode.
2189 */
2190void jbd2_journal_ack_err(journal_t *journal)
2191{
2192	write_lock(&journal->j_state_lock);
2193	if (journal->j_errno)
2194		journal->j_flags |= JBD2_ACK_ERR;
2195	write_unlock(&journal->j_state_lock);
2196}
2197
2198int jbd2_journal_blocks_per_page(struct inode *inode)
2199{
2200	return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
2201}
2202
2203/*
2204 * helper functions to deal with 32 or 64bit block numbers.
2205 */
2206size_t journal_tag_bytes(journal_t *journal)
2207{
2208	size_t sz;
2209
2210	if (jbd2_has_feature_csum3(journal))
2211		return sizeof(journal_block_tag3_t);
2212
2213	sz = sizeof(journal_block_tag_t);
2214
2215	if (jbd2_has_feature_csum2(journal))
2216		sz += sizeof(__u16);
2217
2218	if (jbd2_has_feature_64bit(journal))
2219		return sz;
2220	else
2221		return sz - sizeof(__u32);
2222}
2223
/*
 * JBD memory management
 *
 * These functions are used to allocate block-sized chunks of memory
 * used for making copies of buffer_head data.  Very often it will be
 * page-sized chunks of data, but sometimes it will be in
 * sub-page-size chunks.  (For example, 16k pages on Power systems
 * with a 4k block file system.)  For blocks smaller than a page, we
 * use a SLAB allocator.  There are slab caches for each block size,
 * which are allocated at mount time, if necessary, and we only free
 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
 * this reason we don't need a mutex to protect access to jbd2_slab[]
 * when allocating or releasing memory; a mutex is only needed in
 * jbd2_journal_create_slab().
 */
/* Number of per-block-size slab caches; one slot per entry in jbd2_slab_names. */
#define JBD2_MAX_SLABS 8
/*
 * Cache table indexed by order_base_2(size) - 10 (see get_slab() and
 * jbd2_journal_create_slab()); a NULL slot has not been created yet.
 */
static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];

/* Cache names, in the same slot order as jbd2_slab[]. */
static const char *jbd2_slab_names[JBD2_MAX_SLABS] = {
	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
};
2246
2247
2248static void jbd2_journal_destroy_slabs(void)
2249{
2250	int i;
2251
2252	for (i = 0; i < JBD2_MAX_SLABS; i++) {
2253		if (jbd2_slab[i])
2254			kmem_cache_destroy(jbd2_slab[i]);
2255		jbd2_slab[i] = NULL;
2256	}
2257}
2258
2259static int jbd2_journal_create_slab(size_t size)
2260{
2261	static DEFINE_MUTEX(jbd2_slab_create_mutex);
2262	int i = order_base_2(size) - 10;
2263	size_t slab_size;
2264
2265	if (size == PAGE_SIZE)
2266		return 0;
2267
2268	if (i >= JBD2_MAX_SLABS)
2269		return -EINVAL;
2270
2271	if (unlikely(i < 0))
2272		i = 0;
2273	mutex_lock(&jbd2_slab_create_mutex);
2274	if (jbd2_slab[i]) {
2275		mutex_unlock(&jbd2_slab_create_mutex);
2276		return 0;	/* Already created */
2277	}
2278
2279	slab_size = 1 << (i+10);
2280	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
2281					 slab_size, 0, NULL);
2282	mutex_unlock(&jbd2_slab_create_mutex);
2283	if (!jbd2_slab[i]) {
2284		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
2285		return -ENOMEM;
2286	}
2287	return 0;
2288}
2289
2290static struct kmem_cache *get_slab(size_t size)
2291{
2292	int i = order_base_2(size) - 10;
2293
2294	BUG_ON(i >= JBD2_MAX_SLABS);
2295	if (unlikely(i < 0))
2296		i = 0;
2297	BUG_ON(jbd2_slab[i] == NULL);
2298	return jbd2_slab[i];
2299}
2300
2301void *jbd2_alloc(size_t size, gfp_t flags)
2302{
2303	void *ptr;
2304
2305	BUG_ON(size & (size-1)); /* Must be a power of 2 */
2306
2307	if (size < PAGE_SIZE)
2308		ptr = kmem_cache_alloc(get_slab(size), flags);
2309	else
2310		ptr = (void *)__get_free_pages(flags, get_order(size));
2311
2312	/* Check alignment; SLUB has gotten this wrong in the past,
2313	 * and this can lead to user data corruption! */
2314	BUG_ON(((unsigned long) ptr) & (size-1));
2315
2316	return ptr;
2317}
2318
2319void jbd2_free(void *ptr, size_t size)
2320{
2321	if (size < PAGE_SIZE)
2322		kmem_cache_free(get_slab(size), ptr);
2323	else
2324		free_pages((unsigned long)ptr, get_order(size));
2325};
2326
2327/*
2328 * Journal_head storage management
2329 */
/* Slab cache from which all struct journal_head objects are allocated. */
static struct kmem_cache *jbd2_journal_head_cache;
#ifdef CONFIG_JBD2_DEBUG
/* Debug-only count of live journal_heads; checked for leaks in journal_exit(). */
static atomic_t nr_journal_heads = ATOMIC_INIT(0);
#endif
2334
2335static int jbd2_journal_init_journal_head_cache(void)
2336{
2337	int retval;
2338
2339	J_ASSERT(jbd2_journal_head_cache == NULL);
2340	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
2341				sizeof(struct journal_head),
2342				0,		/* offset */
2343				SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU,
2344				NULL);		/* ctor */
2345	retval = 0;
2346	if (!jbd2_journal_head_cache) {
2347		retval = -ENOMEM;
2348		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
 
2349	}
2350	return retval;
2351}
2352
2353static void jbd2_journal_destroy_journal_head_cache(void)
2354{
2355	if (jbd2_journal_head_cache) {
2356		kmem_cache_destroy(jbd2_journal_head_cache);
2357		jbd2_journal_head_cache = NULL;
2358	}
2359}
2360
2361/*
2362 * journal_head splicing and dicing
2363 */
2364static struct journal_head *journal_alloc_journal_head(void)
2365{
2366	struct journal_head *ret;
2367
2368#ifdef CONFIG_JBD2_DEBUG
2369	atomic_inc(&nr_journal_heads);
2370#endif
2371	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2372	if (!ret) {
2373		jbd_debug(1, "out of memory for journal_head\n");
2374		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2375		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
2376				GFP_NOFS | __GFP_NOFAIL);
2377	}
 
 
2378	return ret;
2379}
2380
2381static void journal_free_journal_head(struct journal_head *jh)
2382{
2383#ifdef CONFIG_JBD2_DEBUG
2384	atomic_dec(&nr_journal_heads);
2385	memset(jh, JBD2_POISON_FREE, sizeof(*jh));
2386#endif
2387	kmem_cache_free(jbd2_journal_head_cache, jh);
2388}
2389
2390/*
2391 * A journal_head is attached to a buffer_head whenever JBD has an
2392 * interest in the buffer.
2393 *
2394 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
2395 * is set.  This bit is tested in core kernel code where we need to take
2396 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
2397 * there.
2398 *
2399 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
2400 *
2401 * When a buffer has its BH_JBD bit set it is immune from being released by
2402 * core kernel code, mainly via ->b_count.
2403 *
2404 * A journal_head is detached from its buffer_head when the journal_head's
2405 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
2406 * transaction (b_cp_transaction) hold their references to b_jcount.
2407 *
2408 * Various places in the kernel want to attach a journal_head to a buffer_head
2409 * _before_ attaching the journal_head to a transaction.  To protect the
2410 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
2411 * journal_head's b_jcount refcount by one.  The caller must call
2412 * jbd2_journal_put_journal_head() to undo this.
2413 *
2414 * So the typical usage would be:
2415 *
2416 *	(Attach a journal_head if needed.  Increments b_jcount)
2417 *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
2418 *	...
2419 *      (Get another reference for transaction)
2420 *	jbd2_journal_grab_journal_head(bh);
2421 *	jh->b_transaction = xxx;
2422 *	(Put original reference)
2423 *	jbd2_journal_put_journal_head(jh);
2424 */
2425
2426/*
2427 * Give a buffer_head a journal_head.
2428 *
2429 * May sleep.
2430 */
2431struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2432{
2433	struct journal_head *jh;
2434	struct journal_head *new_jh = NULL;
2435
2436repeat:
2437	if (!buffer_jbd(bh))
2438		new_jh = journal_alloc_journal_head();
2439
2440	jbd_lock_bh_journal_head(bh);
2441	if (buffer_jbd(bh)) {
2442		jh = bh2jh(bh);
2443	} else {
2444		J_ASSERT_BH(bh,
2445			(atomic_read(&bh->b_count) > 0) ||
2446			(bh->b_page && bh->b_page->mapping));
2447
2448		if (!new_jh) {
2449			jbd_unlock_bh_journal_head(bh);
2450			goto repeat;
2451		}
2452
2453		jh = new_jh;
2454		new_jh = NULL;		/* We consumed it */
2455		set_buffer_jbd(bh);
2456		bh->b_private = jh;
2457		jh->b_bh = bh;
2458		get_bh(bh);
2459		BUFFER_TRACE(bh, "added journal_head");
2460	}
2461	jh->b_jcount++;
2462	jbd_unlock_bh_journal_head(bh);
2463	if (new_jh)
2464		journal_free_journal_head(new_jh);
2465	return bh->b_private;
2466}
2467
2468/*
2469 * Grab a ref against this buffer_head's journal_head.  If it ended up not
2470 * having a journal_head, return NULL
2471 */
2472struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
2473{
2474	struct journal_head *jh = NULL;
2475
2476	jbd_lock_bh_journal_head(bh);
2477	if (buffer_jbd(bh)) {
2478		jh = bh2jh(bh);
2479		jh->b_jcount++;
2480	}
2481	jbd_unlock_bh_journal_head(bh);
2482	return jh;
2483}
2484
/*
 * Detach the journal_head from @bh and free it.
 *
 * The journal_head must no longer be linked to any transaction or
 * list (checked by the assertions below).  Leftover frozen or
 * committed data copies are freed with a warning.  The caller in this
 * file, jbd2_journal_put_journal_head(), invokes this with the bh
 * journal-head lock held.
 */
static void __journal_remove_journal_head(struct buffer_head *bh)
{
	struct journal_head *jh = bh2jh(bh);

	J_ASSERT_JH(jh, jh->b_jcount >= 0);
	J_ASSERT_JH(jh, jh->b_transaction == NULL);
	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
	J_ASSERT_BH(bh, buffer_jbd(bh));
	J_ASSERT_BH(bh, jh2bh(jh) == bh);
	BUFFER_TRACE(bh, "remove journal_head");
	/* Data copies still attached here are unexpected; free them loudly. */
	if (jh->b_frozen_data) {
		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
		jbd2_free(jh->b_frozen_data, bh->b_size);
	}
	if (jh->b_committed_data) {
		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
		jbd2_free(jh->b_committed_data, bh->b_size);
	}
	/* Break the bh <-> jh links before the journal_head is freed. */
	bh->b_private = NULL;
	jh->b_bh = NULL;	/* debug, really */
	clear_buffer_jbd(bh);
	journal_free_journal_head(jh);
}
2510
2511/*
2512 * Drop a reference on the passed journal_head.  If it fell to zero then
2513 * release the journal_head from the buffer_head.
2514 */
2515void jbd2_journal_put_journal_head(struct journal_head *jh)
2516{
2517	struct buffer_head *bh = jh2bh(jh);
2518
2519	jbd_lock_bh_journal_head(bh);
2520	J_ASSERT_JH(jh, jh->b_jcount > 0);
2521	--jh->b_jcount;
2522	if (!jh->b_jcount) {
2523		__journal_remove_journal_head(bh);
2524		jbd_unlock_bh_journal_head(bh);
 
2525		__brelse(bh);
2526	} else
2527		jbd_unlock_bh_journal_head(bh);
 
2528}
2529
2530/*
2531 * Initialize jbd inode head
2532 */
2533void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2534{
2535	jinode->i_transaction = NULL;
2536	jinode->i_next_transaction = NULL;
2537	jinode->i_vfs_inode = inode;
2538	jinode->i_flags = 0;
 
 
2539	INIT_LIST_HEAD(&jinode->i_list);
2540}
2541
2542/*
2543 * Function to be called before we start removing inode from memory (i.e.,
2544 * clear_inode() is a fine place to be called from). It removes inode from
2545 * transaction's lists.
2546 */
2547void jbd2_journal_release_jbd_inode(journal_t *journal,
2548				    struct jbd2_inode *jinode)
2549{
2550	if (!journal)
2551		return;
2552restart:
2553	spin_lock(&journal->j_list_lock);
2554	/* Is commit writing out inode - we have to wait */
2555	if (jinode->i_flags & JI_COMMIT_RUNNING) {
2556		wait_queue_head_t *wq;
2557		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2558		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
2559		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2560		spin_unlock(&journal->j_list_lock);
2561		schedule();
2562		finish_wait(wq, &wait.wait);
2563		goto restart;
2564	}
2565
2566	if (jinode->i_transaction) {
2567		list_del(&jinode->i_list);
2568		jinode->i_transaction = NULL;
2569	}
2570	spin_unlock(&journal->j_list_lock);
2571}
2572
2573
#ifdef CONFIG_PROC_FS

/* Path of the jbd2 statistics directory passed to proc_mkdir(). */
#define JBD2_STATS_PROC_NAME "fs/jbd2"

/* Create the statistics directory and remember it in proc_jbd2_stats. */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

/* Remove the statistics directory, if it was created. */
static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
	if (proc_jbd2_stats)
		remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without CONFIG_PROC_FS both helpers expand to empty statements. */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif
2595
/* Slab caches created in jbd2_journal_init_handle_cache() for jbd2_journal_handle and jbd2_inode objects. */
struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2597
 
 
 
 
 
 
 
 
 
 
 
2598static int __init jbd2_journal_init_handle_cache(void)
2599{
 
2600	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2601	if (jbd2_handle_cache == NULL) {
2602		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2603		return -ENOMEM;
2604	}
2605	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2606	if (jbd2_inode_cache == NULL) {
2607		printk(KERN_EMERG "JBD2: failed to create inode cache\n");
2608		kmem_cache_destroy(jbd2_handle_cache);
2609		return -ENOMEM;
2610	}
2611	return 0;
2612}
2613
2614static void jbd2_journal_destroy_handle_cache(void)
2615{
2616	if (jbd2_handle_cache)
2617		kmem_cache_destroy(jbd2_handle_cache);
2618	if (jbd2_inode_cache)
2619		kmem_cache_destroy(jbd2_inode_cache);
2620
 
 
 
 
2621}
2622
2623/*
2624 * Module startup and shutdown
2625 */
2626
2627static int __init journal_init_caches(void)
2628{
2629	int ret;
2630
2631	ret = jbd2_journal_init_revoke_caches();
 
 
2632	if (ret == 0)
2633		ret = jbd2_journal_init_journal_head_cache();
2634	if (ret == 0)
2635		ret = jbd2_journal_init_handle_cache();
2636	if (ret == 0)
 
 
2637		ret = jbd2_journal_init_transaction_cache();
2638	return ret;
2639}
2640
/*
 * Tear down every cache set up by journal_init_caches(), plus the
 * per-block-size slabs.  Called from journal_exit() and from the
 * journal_init() failure path.
 */
static void jbd2_journal_destroy_caches(void)
{
	jbd2_journal_destroy_revoke_caches();
	jbd2_journal_destroy_journal_head_cache();
	jbd2_journal_destroy_handle_cache();
	jbd2_journal_destroy_transaction_cache();
	jbd2_journal_destroy_slabs();
}
2649
2650static int __init journal_init(void)
2651{
2652	int ret;
2653
2654	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2655
2656	ret = journal_init_caches();
2657	if (ret == 0) {
2658		jbd2_create_jbd_stats_proc_entry();
2659	} else {
2660		jbd2_journal_destroy_caches();
2661	}
2662	return ret;
2663}
2664
2665static void __exit journal_exit(void)
2666{
2667#ifdef CONFIG_JBD2_DEBUG
2668	int n = atomic_read(&nr_journal_heads);
2669	if (n)
2670		printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
2671#endif
2672	jbd2_remove_jbd_stats_proc_entry();
2673	jbd2_journal_destroy_caches();
2674}
2675
2676MODULE_LICENSE("GPL");
2677module_init(journal_init);
2678module_exit(journal_exit);
2679

   1// SPDX-License-Identifier: GPL-2.0+
   2/*
   3 * linux/fs/jbd2/journal.c
   4 *
   5 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   6 *
   7 * Copyright 1998 Red Hat corp --- All Rights Reserved
   8 *
 
 
 
 
   9 * Generic filesystem journal-writing code; part of the ext2fs
  10 * journaling system.
  11 *
  12 * This file manages journals: areas of disk reserved for logging
  13 * transactional updates.  This includes the kernel journaling thread
  14 * which is responsible for scheduling updates to the log.
  15 *
  16 * We do not actually manage the physical storage of the journal in this
  17 * file: that is left to a per-journal policy function, which allows us
  18 * to store the journal within a filesystem-specified area for ext2
  19 * journaling (ext2 can use a reserved inode for storing the log).
  20 */
  21
  22#include <linux/module.h>
  23#include <linux/time.h>
  24#include <linux/fs.h>
  25#include <linux/jbd2.h>
  26#include <linux/errno.h>
  27#include <linux/slab.h>
  28#include <linux/init.h>
  29#include <linux/mm.h>
  30#include <linux/freezer.h>
  31#include <linux/pagemap.h>
  32#include <linux/kthread.h>
  33#include <linux/poison.h>
  34#include <linux/proc_fs.h>
  35#include <linux/seq_file.h>
  36#include <linux/math64.h>
  37#include <linux/hash.h>
  38#include <linux/log2.h>
  39#include <linux/vmalloc.h>
  40#include <linux/backing-dev.h>
  41#include <linux/bitops.h>
  42#include <linux/ratelimit.h>
  43#include <linux/sched/mm.h>
  44
  45#define CREATE_TRACE_POINTS
  46#include <trace/events/jbd2.h>
  47
  48#include <linux/uaccess.h>
  49#include <asm/page.h>
  50
  51#ifdef CONFIG_JBD2_DEBUG
  52ushort jbd2_journal_enable_debug __read_mostly;
  53EXPORT_SYMBOL(jbd2_journal_enable_debug);
  54
  55module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644);
  56MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2");
  57#endif
  58
  59EXPORT_SYMBOL(jbd2_journal_extend);
  60EXPORT_SYMBOL(jbd2_journal_stop);
  61EXPORT_SYMBOL(jbd2_journal_lock_updates);
  62EXPORT_SYMBOL(jbd2_journal_unlock_updates);
  63EXPORT_SYMBOL(jbd2_journal_get_write_access);
  64EXPORT_SYMBOL(jbd2_journal_get_create_access);
  65EXPORT_SYMBOL(jbd2_journal_get_undo_access);
  66EXPORT_SYMBOL(jbd2_journal_set_triggers);
  67EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
  68EXPORT_SYMBOL(jbd2_journal_forget);
 
 
 
  69EXPORT_SYMBOL(jbd2_journal_flush);
  70EXPORT_SYMBOL(jbd2_journal_revoke);
  71
  72EXPORT_SYMBOL(jbd2_journal_init_dev);
  73EXPORT_SYMBOL(jbd2_journal_init_inode);
  74EXPORT_SYMBOL(jbd2_journal_check_used_features);
  75EXPORT_SYMBOL(jbd2_journal_check_available_features);
  76EXPORT_SYMBOL(jbd2_journal_set_features);
  77EXPORT_SYMBOL(jbd2_journal_load);
  78EXPORT_SYMBOL(jbd2_journal_destroy);
  79EXPORT_SYMBOL(jbd2_journal_abort);
  80EXPORT_SYMBOL(jbd2_journal_errno);
  81EXPORT_SYMBOL(jbd2_journal_ack_err);
  82EXPORT_SYMBOL(jbd2_journal_clear_err);
  83EXPORT_SYMBOL(jbd2_log_wait_commit);
  84EXPORT_SYMBOL(jbd2_log_start_commit);
  85EXPORT_SYMBOL(jbd2_journal_start_commit);
  86EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
  87EXPORT_SYMBOL(jbd2_journal_wipe);
  88EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
  89EXPORT_SYMBOL(jbd2_journal_invalidatepage);
  90EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
  91EXPORT_SYMBOL(jbd2_journal_force_commit);
  92EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
  93EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
  94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
  95EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
  96EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
  97EXPORT_SYMBOL(jbd2_inode_cache);
  98
 
  99static int jbd2_journal_create_slab(size_t slab_size);
 100
 101#ifdef CONFIG_JBD2_DEBUG
 102void __jbd2_debug(int level, const char *file, const char *func,
 103		  unsigned int line, const char *fmt, ...)
 104{
 105	struct va_format vaf;
 106	va_list args;
 107
 108	if (level > jbd2_journal_enable_debug)
 109		return;
 110	va_start(args, fmt);
 111	vaf.fmt = fmt;
 112	vaf.va = &args;
 113	printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf);
 114	va_end(args);
 115}
 116EXPORT_SYMBOL(__jbd2_debug);
 117#endif
 118
 119/* Checksumming functions */
 120static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
 121{
 122	if (!jbd2_journal_has_csum_v2or3_feature(j))
 123		return 1;
 124
 125	return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
 126}
 127
 128static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
 129{
 130	__u32 csum;
 131	__be32 old_csum;
 132
 133	old_csum = sb->s_checksum;
 134	sb->s_checksum = 0;
 135	csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
 136	sb->s_checksum = old_csum;
 137
 138	return cpu_to_be32(csum);
 139}
 140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 141/*
 142 * Helper function used to manage commit timeouts
 143 */
 144
 145static void commit_timeout(struct timer_list *t)
 146{
 147	journal_t *journal = from_timer(journal, t, j_commit_timer);
 148
 149	wake_up_process(journal->j_task);
 150}
 151
/*
 * kjournald2: The main thread function used to manage a logging device
 * journal.
 *
 * This kernel thread is responsible for two things:
 *
 * 1) COMMIT:  Every so often we need to commit the current state of the
 *    filesystem to disk.  The journal thread is responsible for writing
 *    all of the metadata buffers to disk.
 *
 * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
 *    of the data in that part of the log has been rewritten elsewhere on
 *    the disk.  Flushing these old buffers to reclaim space in the log is
 *    known as checkpointing, and this thread is responsible for that job.
 *
 * @arg is the journal_t to serve.  The thread runs until JBD2_UNMOUNT
 * is set in the journal flags, then clears j_task and returns 0.
 */

static int kjournald2(void *arg)
{
	journal_t *journal = arg;
	transaction_t *transaction;

	/*
	 * Set up an interval timer which can be used to trigger a commit wakeup
	 * after the commit interval expires
	 */
	timer_setup(&journal->j_commit_timer, commit_timeout, 0);

	set_freezable();

	/* Record that the journal thread is running */
	journal->j_task = current;
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Make sure that no allocations from this kernel thread will ever
	 * recurse to the fs layer because we are responsible for the
	 * transaction commit and any fs involvement might get stuck waiting for
	 * the transaction commit.
	 *
	 * The return value is discarded; no matching restore call appears
	 * in this function.
	 */
	memalloc_nofs_save();

	/*
	 * And now, wait forever for commit wakeup events.
	 */
	write_lock(&journal->j_state_lock);

	/* j_state_lock is held at the top of every loop iteration. */
loop:
	if (journal->j_flags & JBD2_UNMOUNT)
		goto end_loop;

	jbd_debug(1, "commit_sequence=%u, commit_request=%u\n",
		journal->j_commit_sequence, journal->j_commit_request);

	/* A commit has been requested: run it with the state lock dropped. */
	if (journal->j_commit_sequence != journal->j_commit_request) {
		jbd_debug(1, "OK, requests differ\n");
		write_unlock(&journal->j_state_lock);
		del_timer_sync(&journal->j_commit_timer);
		jbd2_journal_commit_transaction(journal);
		write_lock(&journal->j_state_lock);
		goto loop;
	}

	wake_up(&journal->j_wait_done_commit);
	if (freezing(current)) {
		/*
		 * The simpler the better. Flushing journal isn't a
		 * good idea, because that depends on threads that may
		 * be already stopped.
		 */
		jbd_debug(1, "Now suspending kjournald2\n");
		write_unlock(&journal->j_state_lock);
		try_to_freeze();
		write_lock(&journal->j_state_lock);
	} else {
		/*
		 * We assume on resume that commits are already there,
		 * so we don't sleep
		 */
		DEFINE_WAIT(wait);
		int should_sleep = 1;

		/*
		 * Queue on j_wait_commit first, then re-check every wakeup
		 * condition while still holding j_state_lock; sleep only
		 * if none of them is already true.
		 */
		prepare_to_wait(&journal->j_wait_commit, &wait,
				TASK_INTERRUPTIBLE);
		if (journal->j_commit_sequence != journal->j_commit_request)
			should_sleep = 0;
		transaction = journal->j_running_transaction;
		if (transaction && time_after_eq(jiffies,
						transaction->t_expires))
			should_sleep = 0;
		if (journal->j_flags & JBD2_UNMOUNT)
			should_sleep = 0;
		if (should_sleep) {
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
		}
		finish_wait(&journal->j_wait_commit, &wait);
	}

	jbd_debug(1, "kjournald2 wakes\n");

	/*
	 * Were we woken up by a commit wakeup event?
	 */
	transaction = journal->j_running_transaction;
	if (transaction && time_after_eq(jiffies, transaction->t_expires)) {
		/* The running transaction has expired: request its commit. */
		journal->j_commit_request = transaction->t_tid;
		jbd_debug(1, "woke because of timeout\n");
	}
	goto loop;

end_loop:
	del_timer_sync(&journal->j_commit_timer);
	/* Clearing j_task and waking j_wait_done_commit signals thread exit. */
	journal->j_task = NULL;
	wake_up(&journal->j_wait_done_commit);
	jbd_debug(1, "Journal thread exiting.\n");
	write_unlock(&journal->j_state_lock);
	return 0;
}
 271
 272static int jbd2_journal_start_thread(journal_t *journal)
 273{
 274	struct task_struct *t;
 275
 276	t = kthread_run(kjournald2, journal, "jbd2/%s",
 277			journal->j_devname);
 278	if (IS_ERR(t))
 279		return PTR_ERR(t);
 280
 281	wait_event(journal->j_wait_done_commit, journal->j_task != NULL);
 282	return 0;
 283}
 284
 285static void journal_kill_thread(journal_t *journal)
 286{
 287	write_lock(&journal->j_state_lock);
 288	journal->j_flags |= JBD2_UNMOUNT;
 289
 290	while (journal->j_task) {
 291		write_unlock(&journal->j_state_lock);
 292		wake_up(&journal->j_wait_commit);
 293		wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
 294		write_lock(&journal->j_state_lock);
 295	}
 296	write_unlock(&journal->j_state_lock);
 297}
 298
 299/*
 300 * jbd2_journal_write_metadata_buffer: write a metadata buffer to the journal.
 301 *
 302 * Writes a metadata buffer to a given disk block.  The actual IO is not
 303 * performed but a new buffer_head is constructed which labels the data
 304 * to be written with the correct destination disk block.
 305 *
 306 * Any magic-number escaping which needs to be done will cause a
 307 * copy-out here.  If the buffer happens to start with the
 308 * JBD2_MAGIC_NUMBER, then we can't write it to the log directly: the
 309	 * magic number is only written to the log for descriptor blocks.  In
 310 * this case, we copy the data and replace the first word with 0, and we
 311 * return a result code which indicates that this buffer needs to be
 312 * marked as an escaped buffer in the corresponding log descriptor
 313 * block.  The missing word can then be restored when the block is read
 314 * during recovery.
 315 *
 316 * If the source buffer has already been modified by a new transaction
 317 * since we took the last commit snapshot, we use the frozen copy of
 318 * that data for IO. If we end up using the existing buffer_head's data
 319 * for the write, then we have to make sure nobody modifies it while the
 320 * IO is in progress. do_get_write_access() handles this.
 321 *
 322 * The function returns a pointer to the buffer_head to be used for IO.
 323 *
 324 *
 325 * Return value:
 326 *  <0: Error
 327 * >=0: Finished OK
 328 *
 329 * On success:
 330 * Bit 0 set == escape performed on the data
 331 * Bit 1 set == buffer copy-out performed (kfree the data after IO)
 332 */
 333
 334int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 335				  struct journal_head  *jh_in,
 336				  struct buffer_head **bh_out,
 337				  sector_t blocknr)
 338{
 339	int need_copy_out = 0;
 340	int done_copy_out = 0;
 341	int do_escape = 0;
 342	char *mapped_data;
 343	struct buffer_head *new_bh;
 344	struct page *new_page;
 345	unsigned int new_offset;
 346	struct buffer_head *bh_in = jh2bh(jh_in);
 347	journal_t *journal = transaction->t_journal;
 348
 349	/*
 350	 * The buffer really shouldn't be locked: only the current committing
 351	 * transaction is allowed to write it, so nobody else is allowed
 352	 * to do any IO.
 353	 *
 354	 * akpm: except if we're journalling data, and write() output is
 355	 * also part of a shared mapping, and another thread has
 356	 * decided to launch a writepage() against this buffer.
 357	 */
 358	J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
 359
 360	new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL);
 361
 362	/* keep subsequent assertions sane */
 363	atomic_set(&new_bh->b_count, 1);
 364
 365	spin_lock(&jh_in->b_state_lock);
 366repeat:
 367	/*
 368	 * If a new transaction has already done a buffer copy-out, then
 369	 * we use that version of the data for the commit.
 370	 */
 371	if (jh_in->b_frozen_data) {
 372		done_copy_out = 1;
 373		new_page = virt_to_page(jh_in->b_frozen_data);
 374		new_offset = offset_in_page(jh_in->b_frozen_data);
 375	} else {
 376		new_page = jh2bh(jh_in)->b_page;
 377		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
 378	}
 379
 380	mapped_data = kmap_atomic(new_page);
 381	/*
 382	 * Fire data frozen trigger if data already wasn't frozen.  Do this
 383	 * before checking for escaping, as the trigger may modify the magic
 384	 * offset.  If a copy-out happens afterwards, it will have the correct
 385	 * data in the buffer.
 386	 */
 387	if (!done_copy_out)
 388		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
 389					   jh_in->b_triggers);
 390
 391	/*
 392	 * Check for escaping
 393	 */
 394	if (*((__be32 *)(mapped_data + new_offset)) ==
 395				cpu_to_be32(JBD2_MAGIC_NUMBER)) {
 396		need_copy_out = 1;
 397		do_escape = 1;
 398	}
 399	kunmap_atomic(mapped_data);
 400
 401	/*
 402	 * Do we need to do a data copy?
 403	 */
 404	if (need_copy_out && !done_copy_out) {
 405		char *tmp;
 406
 407		spin_unlock(&jh_in->b_state_lock);
 408		tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
 409		if (!tmp) {
 410			brelse(new_bh);
 411			return -ENOMEM;
 412		}
 413		spin_lock(&jh_in->b_state_lock);
 414		if (jh_in->b_frozen_data) {
 415			jbd2_free(tmp, bh_in->b_size);
 416			goto repeat;
 417		}
 418
 419		jh_in->b_frozen_data = tmp;
 420		mapped_data = kmap_atomic(new_page);
 421		memcpy(tmp, mapped_data + new_offset, bh_in->b_size);
 422		kunmap_atomic(mapped_data);
 423
 424		new_page = virt_to_page(tmp);
 425		new_offset = offset_in_page(tmp);
 426		done_copy_out = 1;
 427
 428		/*
 429		 * This isn't strictly necessary, as we're using frozen
 430		 * data for the escaping, but it keeps consistency with
 431		 * b_frozen_data usage.
 432		 */
 433		jh_in->b_frozen_triggers = jh_in->b_triggers;
 434	}
 435
 436	/*
 437	 * Did we need to do an escaping?  Now we've done all the
 438	 * copying, we can finally do so.
 439	 */
 440	if (do_escape) {
 441		mapped_data = kmap_atomic(new_page);
 442		*((unsigned int *)(mapped_data + new_offset)) = 0;
 443		kunmap_atomic(mapped_data);
 444	}
 445
 446	set_bh_page(new_bh, new_page, new_offset);
 447	new_bh->b_size = bh_in->b_size;
 448	new_bh->b_bdev = journal->j_dev;
 449	new_bh->b_blocknr = blocknr;
 450	new_bh->b_private = bh_in;
 451	set_buffer_mapped(new_bh);
 452	set_buffer_dirty(new_bh);
 453
 454	*bh_out = new_bh;
 455
 456	/*
 457	 * The to-be-written buffer needs to get moved to the io queue,
 458	 * and the original buffer whose contents we are shadowing or
 459	 * copying is moved to the transaction's shadow queue.
 460	 */
 461	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
 462	spin_lock(&journal->j_list_lock);
 463	__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
 464	spin_unlock(&journal->j_list_lock);
 465	set_buffer_shadow(bh_in);
 466	spin_unlock(&jh_in->b_state_lock);
 467
 468	return do_escape | (done_copy_out << 1);
 469}
 470
 471/*
 472 * Allocation code for the journal file.  Manage the space left in the
 473 * journal, so that we can begin checkpointing when appropriate.
 474 */
 475
 476/*
 477 * Called with j_state_lock locked for writing.
 478 * Returns true if a transaction commit was started.
 479 */
 480int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 481{
 482	/* Return if the txn has already requested to be committed */
 483	if (journal->j_commit_request == target)
 484		return 0;
 485
 486	/*
 487	 * The only transaction we can possibly wait upon is the
 488	 * currently running transaction (if it exists).  Otherwise,
 489	 * the target tid must be an old one.
 490	 */
 491	if (journal->j_running_transaction &&
 492	    journal->j_running_transaction->t_tid == target) {
 493		/*
 494		 * We want a new commit: OK, mark the request and wakeup the
 495		 * commit thread.  We do _not_ do the commit ourselves.
 496		 */
 497
 498		journal->j_commit_request = target;
 499		jbd_debug(1, "JBD2: requesting commit %u/%u\n",
 500			  journal->j_commit_request,
 501			  journal->j_commit_sequence);
 502		journal->j_running_transaction->t_requested = jiffies;
 503		wake_up(&journal->j_wait_commit);
 504		return 1;
 505	} else if (!tid_geq(journal->j_commit_request, target))
 506		/* This should never happen, but if it does, preserve
 507		   the evidence before kjournald goes into a loop and
 508		   increments j_commit_sequence beyond all recognition. */
 509		WARN_ONCE(1, "JBD2: bad log_start_commit: %u %u %u %u\n",
 510			  journal->j_commit_request,
 511			  journal->j_commit_sequence,
 512			  target, journal->j_running_transaction ?
 513			  journal->j_running_transaction->t_tid : 0);
 514	return 0;
 515}
 516
 517int jbd2_log_start_commit(journal_t *journal, tid_t tid)
 518{
 519	int ret;
 520
 521	write_lock(&journal->j_state_lock);
 522	ret = __jbd2_log_start_commit(journal, tid);
 523	write_unlock(&journal->j_state_lock);
 524	return ret;
 525}
 526
 527/*
 528 * Force and wait any uncommitted transactions.  We can only force the running
 529 * transaction if we don't have an active handle, otherwise, we will deadlock.
 530 * Returns: <0 in case of error,
 531 *           0 if nothing to commit,
 532 *           1 if transaction was successfully committed.
 533 */
 534static int __jbd2_journal_force_commit(journal_t *journal)
 535{
 536	transaction_t *transaction = NULL;
 537	tid_t tid;
 538	int need_to_start = 0, ret = 0;
 539
 540	read_lock(&journal->j_state_lock);
 541	if (journal->j_running_transaction && !current->journal_info) {
 542		transaction = journal->j_running_transaction;
 543		if (!tid_geq(journal->j_commit_request, transaction->t_tid))
 544			need_to_start = 1;
 545	} else if (journal->j_committing_transaction)
 546		transaction = journal->j_committing_transaction;
 547
 548	if (!transaction) {
 549		/* Nothing to commit */
 550		read_unlock(&journal->j_state_lock);
 551		return 0;
 552	}
 553	tid = transaction->t_tid;
 554	read_unlock(&journal->j_state_lock);
 555	if (need_to_start)
 556		jbd2_log_start_commit(journal, tid);
 557	ret = jbd2_log_wait_commit(journal, tid);
 558	if (!ret)
 559		ret = 1;
 560
 561	return ret;
 562}
 563
 564/**
 565 * Force and wait upon a commit if the calling process is not within
 566 * transaction.  This is used for forcing out undo-protected data which contains
 567 * bitmaps, when the fs is running out of space.
 568 *
 569 * @journal: journal to force
 570 * Returns true if progress was made.
 571 */
 572int jbd2_journal_force_commit_nested(journal_t *journal)
 573{
 574	int ret;
 575
 576	ret = __jbd2_journal_force_commit(journal);
 577	return ret > 0;
 578}
 579
 580/**
 581	 * jbd2_journal_force_commit() - force any uncommitted transactions
 582 * @journal: journal to force
 583 *
 584	 * Caller wants an unconditional commit. We can only force the running transaction
 585 * if we don't have an active handle, otherwise, we will deadlock.
 586 */
 587int jbd2_journal_force_commit(journal_t *journal)
 588{
 589	int ret;
 590
 591	J_ASSERT(!current->journal_info);
 592	ret = __jbd2_journal_force_commit(journal);
 593	if (ret > 0)
 594		ret = 0;
 595	return ret;
 596}
 597
 598/*
 599 * Start a commit of the current running transaction (if any).  Returns true
 600 * if a transaction is going to be committed (or is currently already
 601 * committing), and fills its tid in at *ptid
 602 */
 603int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 604{
 605	int ret = 0;
 606
 607	write_lock(&journal->j_state_lock);
 608	if (journal->j_running_transaction) {
 609		tid_t tid = journal->j_running_transaction->t_tid;
 610
 611		__jbd2_log_start_commit(journal, tid);
 612		/* There's a running transaction and we've just made sure
 613		 * it's commit has been scheduled. */
 614		if (ptid)
 615			*ptid = tid;
 616		ret = 1;
 617	} else if (journal->j_committing_transaction) {
 618		/*
 619		 * If commit has been started, then we have to wait for
 620		 * completion of that transaction.
 621		 */
 622		if (ptid)
 623			*ptid = journal->j_committing_transaction->t_tid;
 624		ret = 1;
 625	}
 626	write_unlock(&journal->j_state_lock);
 627	return ret;
 628}
 629
 630/*
 631 * Return 1 if a given transaction has not yet sent barrier request
 632 * connected with a transaction commit. If 0 is returned, transaction
 633 * may or may not have sent the barrier. Used to avoid sending barrier
 634 * twice in common cases.
 635 */
 636int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
 637{
 638	int ret = 0;
 639	transaction_t *commit_trans;
 640
 641	if (!(journal->j_flags & JBD2_BARRIER))
 642		return 0;
 643	read_lock(&journal->j_state_lock);
 644	/* Transaction already committed? */
 645	if (tid_geq(journal->j_commit_sequence, tid))
 646		goto out;
 647	commit_trans = journal->j_committing_transaction;
 648	if (!commit_trans || commit_trans->t_tid != tid) {
 649		ret = 1;
 650		goto out;
 651	}
 652	/*
 653	 * Transaction is being committed and we already proceeded to
 654	 * submitting a flush to fs partition?
 655	 */
 656	if (journal->j_fs_dev != journal->j_dev) {
 657		if (!commit_trans->t_need_data_flush ||
 658		    commit_trans->t_state >= T_COMMIT_DFLUSH)
 659			goto out;
 660	} else {
 661		if (commit_trans->t_state >= T_COMMIT_JFLUSH)
 662			goto out;
 663	}
 664	ret = 1;
 665out:
 666	read_unlock(&journal->j_state_lock);
 667	return ret;
 668}
 669EXPORT_SYMBOL(jbd2_trans_will_send_data_barrier);
 670
 671/*
 672 * Wait for a specified commit to complete.
 673 * The caller may not hold the journal lock.
 674 */
 675int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 676{
 677	int err = 0;
 678
 
 679	read_lock(&journal->j_state_lock);
 680#ifdef CONFIG_PROVE_LOCKING
 681	/*
 682	 * Some callers make sure transaction is already committing and in that
 683	 * case we cannot block on open handles anymore. So don't warn in that
 684	 * case.
 685	 */
 686	if (tid_gt(tid, journal->j_commit_sequence) &&
 687	    (!journal->j_committing_transaction ||
 688	     journal->j_committing_transaction->t_tid != tid)) {
 689		read_unlock(&journal->j_state_lock);
 690		jbd2_might_wait_for_commit(journal);
 691		read_lock(&journal->j_state_lock);
 692	}
 693#endif
 694#ifdef CONFIG_JBD2_DEBUG
 695	if (!tid_geq(journal->j_commit_request, tid)) {
 696		printk(KERN_ERR
 697		       "%s: error: j_commit_request=%u, tid=%u\n",
 698		       __func__, journal->j_commit_request, tid);
 699	}
 700#endif
 701	while (tid_gt(tid, journal->j_commit_sequence)) {
 702		jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n",
 703				  tid, journal->j_commit_sequence);
 704		read_unlock(&journal->j_state_lock);
 705		wake_up(&journal->j_wait_commit);
 706		wait_event(journal->j_wait_done_commit,
 707				!tid_gt(tid, journal->j_commit_sequence));
 708		read_lock(&journal->j_state_lock);
 709	}
 710	read_unlock(&journal->j_state_lock);
 711
 712	if (unlikely(is_journal_aborted(journal)))
 713		err = -EIO;
 714	return err;
 715}
 716
 717/* Return 1 when transaction with given tid has already committed. */
 718int jbd2_transaction_committed(journal_t *journal, tid_t tid)
 719{
 720	int ret = 1;
 721
 722	read_lock(&journal->j_state_lock);
 723	if (journal->j_running_transaction &&
 724	    journal->j_running_transaction->t_tid == tid)
 725		ret = 0;
 726	if (journal->j_committing_transaction &&
 727	    journal->j_committing_transaction->t_tid == tid)
 728		ret = 0;
 729	read_unlock(&journal->j_state_lock);
 730	return ret;
 731}
 732EXPORT_SYMBOL(jbd2_transaction_committed);
 733
 734/*
 735 * When this function returns the transaction corresponding to tid
 736	 * will be completed.  If the transaction is currently running, start
 737 * committing that transaction before waiting for it to complete.  If
 738 * the transaction id is stale, it is by definition already completed,
 739 * so just return SUCCESS.
 740 */
 741int jbd2_complete_transaction(journal_t *journal, tid_t tid)
 742{
 743	int	need_to_wait = 1;
 744
 745	read_lock(&journal->j_state_lock);
 746	if (journal->j_running_transaction &&
 747	    journal->j_running_transaction->t_tid == tid) {
 748		if (journal->j_commit_request != tid) {
 749			/* transaction not yet started, so request it */
 750			read_unlock(&journal->j_state_lock);
 751			jbd2_log_start_commit(journal, tid);
 752			goto wait_commit;
 753		}
 754	} else if (!(journal->j_committing_transaction &&
 755		     journal->j_committing_transaction->t_tid == tid))
 756		need_to_wait = 0;
 757	read_unlock(&journal->j_state_lock);
 758	if (!need_to_wait)
 759		return 0;
 760wait_commit:
 761	return jbd2_log_wait_commit(journal, tid);
 762}
 763EXPORT_SYMBOL(jbd2_complete_transaction);
 764
 765/*
 766 * Log buffer allocation routines:
 767 */
 768
 769int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp)
 770{
 771	unsigned long blocknr;
 772
 773	write_lock(&journal->j_state_lock);
 774	J_ASSERT(journal->j_free > 1);
 775
 776	blocknr = journal->j_head;
 777	journal->j_head++;
 778	journal->j_free--;
 779	if (journal->j_head == journal->j_last)
 780		journal->j_head = journal->j_first;
 781	write_unlock(&journal->j_state_lock);
 782	return jbd2_journal_bmap(journal, blocknr, retp);
 783}
 784
 785/*
 786 * Conversion of logical to physical block numbers for the journal
 787 *
 788 * On external journals the journal blocks are identity-mapped, so
 789 * this is a no-op.  If needed, we can use j_blk_offset - everything is
 790 * ready.
 791 */
 792int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
 793		 unsigned long long *retp)
 794{
 795	int err = 0;
 796	unsigned long long ret;
 797	sector_t block = 0;
 798
 799	if (journal->j_inode) {
 800		block = blocknr;
 801		ret = bmap(journal->j_inode, &block);
 802
 803		if (ret || !block) {
 804			printk(KERN_ALERT "%s: journal block not found "
 805					"at offset %lu on %s\n",
 806			       __func__, blocknr, journal->j_devname);
 807			err = -EIO;
 808			jbd2_journal_abort(journal, err);
 809		} else {
 810			*retp = block;
 811		}
 812
 813	} else {
 814		*retp = blocknr; /* +journal->j_blk_offset */
 815	}
 816	return err;
 817}
 818
 819/*
 820 * We play buffer_head aliasing tricks to write data/metadata blocks to
 821 * the journal without copying their contents, but for journal
 822 * descriptor blocks we do need to generate bona fide buffers.
 823 *
 824 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
 825 * the buffer's contents they really should run flush_dcache_page(bh->b_page).
 826 * But we don't bother doing that, so there will be coherency problems with
 827 * mmaps of blockdevs which hold live JBD-controlled filesystems.
 828 */
 829struct buffer_head *
 830jbd2_journal_get_descriptor_buffer(transaction_t *transaction, int type)
 831{
 832	journal_t *journal = transaction->t_journal;
 833	struct buffer_head *bh;
 834	unsigned long long blocknr;
 835	journal_header_t *header;
 836	int err;
 837
 838	err = jbd2_journal_next_log_block(journal, &blocknr);
 839
 840	if (err)
 841		return NULL;
 842
 843	bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
 844	if (!bh)
 845		return NULL;
 846	atomic_dec(&transaction->t_outstanding_credits);
 847	lock_buffer(bh);
 848	memset(bh->b_data, 0, journal->j_blocksize);
 849	header = (journal_header_t *)bh->b_data;
 850	header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 851	header->h_blocktype = cpu_to_be32(type);
 852	header->h_sequence = cpu_to_be32(transaction->t_tid);
 853	set_buffer_uptodate(bh);
 854	unlock_buffer(bh);
 855	BUFFER_TRACE(bh, "return this buffer");
 856	return bh;
 857}
 858
 859void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
 860{
 861	struct jbd2_journal_block_tail *tail;
 862	__u32 csum;
 863
 864	if (!jbd2_journal_has_csum_v2or3(j))
 865		return;
 866
 867	tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
 868			sizeof(struct jbd2_journal_block_tail));
 869	tail->t_checksum = 0;
 870	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 871	tail->t_checksum = cpu_to_be32(csum);
 872}
 873
 874/*
 875 * Return tid of the oldest transaction in the journal and block in the journal
 876 * where the transaction starts.
 877 *
 878 * If the journal is now empty, return which will be the next transaction ID
 879 * we will write and where will that transaction start.
 880 *
 881 * The return value is 0 if journal tail cannot be pushed any further, 1 if
 882 * it can.
 883 */
 884int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
 885			      unsigned long *block)
 886{
 887	transaction_t *transaction;
 888	int ret;
 889
 890	read_lock(&journal->j_state_lock);
 891	spin_lock(&journal->j_list_lock);
 892	transaction = journal->j_checkpoint_transactions;
 893	if (transaction) {
 894		*tid = transaction->t_tid;
 895		*block = transaction->t_log_start;
 896	} else if ((transaction = journal->j_committing_transaction) != NULL) {
 897		*tid = transaction->t_tid;
 898		*block = transaction->t_log_start;
 899	} else if ((transaction = journal->j_running_transaction) != NULL) {
 900		*tid = transaction->t_tid;
 901		*block = journal->j_head;
 902	} else {
 903		*tid = journal->j_transaction_sequence;
 904		*block = journal->j_head;
 905	}
 906	ret = tid_gt(*tid, journal->j_tail_sequence);
 907	spin_unlock(&journal->j_list_lock);
 908	read_unlock(&journal->j_state_lock);
 909
 910	return ret;
 911}
 912
 913/*
 914 * Update information in journal structure and in on disk journal superblock
 915 * about log tail. This function does not check whether information passed in
 916 * really pushes log tail further. It's responsibility of the caller to make
 917 * sure provided log tail information is valid (e.g. by holding
 918 * j_checkpoint_mutex all the time between computing log tail and calling this
 919 * function as is the case with jbd2_cleanup_journal_tail()).
 920 *
 921 * Requires j_checkpoint_mutex
 922 */
 923int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 924{
 925	unsigned long freed;
 926	int ret;
 927
 928	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 929
 930	/*
 931	 * We cannot afford for write to remain in drive's caches since as
 932	 * soon as we update j_tail, next transaction can start reusing journal
 933	 * space and if we lose sb update during power failure we'd replay
 934	 * old transaction with possibly newly overwritten data.
 935	 */
 936	ret = jbd2_journal_update_sb_log_tail(journal, tid, block,
 937					      REQ_SYNC | REQ_FUA);
 938	if (ret)
 939		goto out;
 940
 941	write_lock(&journal->j_state_lock);
 942	freed = block - journal->j_tail;
 943	if (block < journal->j_tail)
 944		freed += journal->j_last - journal->j_first;
 945
 946	trace_jbd2_update_log_tail(journal, tid, block, freed);
 947	jbd_debug(1,
 948		  "Cleaning journal tail from %u to %u (offset %lu), "
 949		  "freeing %lu\n",
 950		  journal->j_tail_sequence, tid, block, freed);
 951
 952	journal->j_free += freed;
 953	journal->j_tail_sequence = tid;
 954	journal->j_tail = block;
 955	write_unlock(&journal->j_state_lock);
 956
 957out:
 958	return ret;
 959}
 960
 961/*
 962 * This is a variation of __jbd2_update_log_tail which checks for validity of
 963 * provided log tail and locks j_checkpoint_mutex. So it is safe against races
 964 * with other threads updating log tail.
 965 */
 966void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 967{
 968	mutex_lock_io(&journal->j_checkpoint_mutex);
 969	if (tid_gt(tid, journal->j_tail_sequence))
 970		__jbd2_update_log_tail(journal, tid, block);
 971	mutex_unlock(&journal->j_checkpoint_mutex);
 972}
 973
 974struct jbd2_stats_proc_session {
 975	journal_t *journal;
 976	struct transaction_stats_s *stats;
 977	int start;
 978	int max;
 979};
 980
 981static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
 982{
 983	return *pos ? NULL : SEQ_START_TOKEN;
 984}
 985
 986static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
 987{
 988	(*pos)++;
 989	return NULL;
 990}
 991
 992static int jbd2_seq_info_show(struct seq_file *seq, void *v)
 993{
 994	struct jbd2_stats_proc_session *s = seq->private;
 995
 996	if (v != SEQ_START_TOKEN)
 997		return 0;
 998	seq_printf(seq, "%lu transactions (%lu requested), "
 999		   "each up to %u blocks\n",
1000		   s->stats->ts_tid, s->stats->ts_requested,
1001		   s->journal->j_max_transaction_buffers);
1002	if (s->stats->ts_tid == 0)
1003		return 0;
1004	seq_printf(seq, "average: \n  %ums waiting for transaction\n",
1005	    jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid));
1006	seq_printf(seq, "  %ums request delay\n",
1007	    (s->stats->ts_requested == 0) ? 0 :
1008	    jiffies_to_msecs(s->stats->run.rs_request_delay /
1009			     s->stats->ts_requested));
1010	seq_printf(seq, "  %ums running transaction\n",
1011	    jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid));
1012	seq_printf(seq, "  %ums transaction was being locked\n",
1013	    jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid));
1014	seq_printf(seq, "  %ums flushing data (in ordered mode)\n",
1015	    jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid));
1016	seq_printf(seq, "  %ums logging transaction\n",
1017	    jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid));
1018	seq_printf(seq, "  %lluus average transaction commit time\n",
1019		   div_u64(s->journal->j_average_commit_time, 1000));
1020	seq_printf(seq, "  %lu handles per transaction\n",
1021	    s->stats->run.rs_handle_count / s->stats->ts_tid);
1022	seq_printf(seq, "  %lu blocks per transaction\n",
1023	    s->stats->run.rs_blocks / s->stats->ts_tid);
1024	seq_printf(seq, "  %lu logged blocks per transaction\n",
1025	    s->stats->run.rs_blocks_logged / s->stats->ts_tid);
1026	return 0;
1027}
1028
/* seq_file ->stop: ->start acquires nothing, so there is nothing to undo. */
static void jbd2_seq_info_stop(struct seq_file *seq, void *v)
{
}
1032
1033static const struct seq_operations jbd2_seq_info_ops = {
1034	.start  = jbd2_seq_info_start,
1035	.next   = jbd2_seq_info_next,
1036	.stop   = jbd2_seq_info_stop,
1037	.show   = jbd2_seq_info_show,
1038};
1039
1040static int jbd2_seq_info_open(struct inode *inode, struct file *file)
1041{
1042	journal_t *journal = PDE_DATA(inode);
1043	struct jbd2_stats_proc_session *s;
1044	int rc, size;
1045
1046	s = kmalloc(sizeof(*s), GFP_KERNEL);
1047	if (s == NULL)
1048		return -ENOMEM;
1049	size = sizeof(struct transaction_stats_s);
1050	s->stats = kmalloc(size, GFP_KERNEL);
1051	if (s->stats == NULL) {
1052		kfree(s);
1053		return -ENOMEM;
1054	}
1055	spin_lock(&journal->j_history_lock);
1056	memcpy(s->stats, &journal->j_stats, size);
1057	s->journal = journal;
1058	spin_unlock(&journal->j_history_lock);
1059
1060	rc = seq_open(file, &jbd2_seq_info_ops);
1061	if (rc == 0) {
1062		struct seq_file *m = file->private_data;
1063		m->private = s;
1064	} else {
1065		kfree(s->stats);
1066		kfree(s);
1067	}
1068	return rc;
1069
1070}
1071
1072static int jbd2_seq_info_release(struct inode *inode, struct file *file)
1073{
1074	struct seq_file *seq = file->private_data;
1075	struct jbd2_stats_proc_session *s = seq->private;
1076	kfree(s->stats);
1077	kfree(s);
1078	return seq_release(inode, file);
1079}
1080
1081static const struct proc_ops jbd2_info_proc_ops = {
1082	.proc_open	= jbd2_seq_info_open,
1083	.proc_read	= seq_read,
1084	.proc_lseek	= seq_lseek,
1085	.proc_release	= jbd2_seq_info_release,
 
1086};
1087
1088static struct proc_dir_entry *proc_jbd2_stats;
1089
1090static void jbd2_stats_proc_init(journal_t *journal)
1091{
1092	journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
1093	if (journal->j_proc_entry) {
1094		proc_create_data("info", S_IRUGO, journal->j_proc_entry,
1095				 &jbd2_info_proc_ops, journal);
1096	}
1097}
1098
1099static void jbd2_stats_proc_exit(journal_t *journal)
1100{
1101	remove_proc_entry("info", journal->j_proc_entry);
1102	remove_proc_entry(journal->j_devname, proc_jbd2_stats);
1103}
1104
1105/* Minimum size of descriptor tag */
1106static int jbd2_min_tag_size(void)
1107{
1108	/*
1109	 * Tag with 32-bit block numbers does not use last four bytes of the
1110	 * structure
1111	 */
1112	return sizeof(journal_block_tag_t) - 4;
1113}
1114
1115/*
1116 * Management for journal control blocks: functions to create and
1117 * destroy journal_t structures, and to initialise and read existing
1118 * journal blocks from disk.  */
1119
1120/* First: create and setup a journal_t object in memory.  We initialise
1121 * very few fields yet: that has to wait until we have created the
1122	 * journal structures from scratch, or loaded them from disk. */
1123
1124static journal_t *journal_init_common(struct block_device *bdev,
1125			struct block_device *fs_dev,
1126			unsigned long long start, int len, int blocksize)
1127{
1128	static struct lock_class_key jbd2_trans_commit_key;
1129	journal_t *journal;
1130	int err;
1131	struct buffer_head *bh;
1132	int n;
1133
1134	journal = kzalloc(sizeof(*journal), GFP_KERNEL);
1135	if (!journal)
1136		return NULL;
1137
1138	init_waitqueue_head(&journal->j_wait_transaction_locked);
1139	init_waitqueue_head(&journal->j_wait_done_commit);
1140	init_waitqueue_head(&journal->j_wait_commit);
1141	init_waitqueue_head(&journal->j_wait_updates);
1142	init_waitqueue_head(&journal->j_wait_reserved);
1143	mutex_init(&journal->j_abort_mutex);
1144	mutex_init(&journal->j_barrier);
1145	mutex_init(&journal->j_checkpoint_mutex);
1146	spin_lock_init(&journal->j_revoke_lock);
1147	spin_lock_init(&journal->j_list_lock);
1148	rwlock_init(&journal->j_state_lock);
1149
1150	journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
1151	journal->j_min_batch_time = 0;
1152	journal->j_max_batch_time = 15000; /* 15ms */
1153	atomic_set(&journal->j_reserved_credits, 0);
1154
1155	/* The journal is marked for error until we succeed with recovery! */
1156	journal->j_flags = JBD2_ABORT;
1157
1158	/* Set up a default-sized revoke table for the new mount. */
1159	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
1160	if (err)
1161		goto err_cleanup;
1162
1163	spin_lock_init(&journal->j_history_lock);
1164
1165	lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle",
1166			 &jbd2_trans_commit_key, 0);
1167
1168	/* journal descriptor can store up to n blocks -bzzz */
1169	journal->j_blocksize = blocksize;
1170	journal->j_dev = bdev;
1171	journal->j_fs_dev = fs_dev;
1172	journal->j_blk_offset = start;
1173	journal->j_maxlen = len;
1174	/* We need enough buffers to write out full descriptor block. */
1175	n = journal->j_blocksize / jbd2_min_tag_size();
1176	journal->j_wbufsize = n;
1177	journal->j_wbuf = kmalloc_array(n, sizeof(struct buffer_head *),
1178					GFP_KERNEL);
1179	if (!journal->j_wbuf)
1180		goto err_cleanup;
1181
1182	bh = getblk_unmovable(journal->j_dev, start, journal->j_blocksize);
1183	if (!bh) {
1184		pr_err("%s: Cannot get buffer for journal superblock\n",
1185			__func__);
1186		goto err_cleanup;
1187	}
1188	journal->j_sb_buffer = bh;
1189	journal->j_superblock = (journal_superblock_t *)bh->b_data;
1190
1191	return journal;
1192
1193err_cleanup:
1194	kfree(journal->j_wbuf);
1195	jbd2_journal_destroy_revoke(journal);
1196	kfree(journal);
1197	return NULL;
1198}
1199
1200/* jbd2_journal_init_dev and jbd2_journal_init_inode:
1201 *
1202 * Create a journal structure assigned some fixed set of disk blocks to
1203 * the journal.  We don't actually touch those disk blocks yet, but we
1204 * need to set up all of the mapping information to tell the journaling
1205 * system where the journal blocks are.
1206 *
1207 */
1208
1209/**
1210 *  journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure
1211 *  @bdev: Block device on which to create the journal
1212 *  @fs_dev: Device which hold journalled filesystem for this journal.
1213 *  @start: Block nr Start of journal.
1214 *  @len:  Length of the journal in blocks.
1215 *  @blocksize: blocksize of journalling device
1216 *
1217 *  Returns: a newly created journal_t *
1218 *
1219 *  jbd2_journal_init_dev creates a journal which maps a fixed contiguous
1220 *  range of blocks on an arbitrary block device.
1221 *
1222 */
1223journal_t *jbd2_journal_init_dev(struct block_device *bdev,
1224			struct block_device *fs_dev,
1225			unsigned long long start, int len, int blocksize)
1226{
1227	journal_t *journal;
1228
1229	journal = journal_init_common(bdev, fs_dev, start, len, blocksize);
1230	if (!journal)
1231		return NULL;
1232
1233	bdevname(journal->j_dev, journal->j_devname);
1234	strreplace(journal->j_devname, '/', '!');
1235	jbd2_stats_proc_init(journal);
1236
1237	return journal;
1238}
1239
1240/**
1241 *  journal_t * jbd2_journal_init_inode () - creates a journal which maps to a inode.
1242 *  @inode: An inode to create the journal in
1243 *
1244 * jbd2_journal_init_inode creates a journal which maps an on-disk inode as
1245 * the journal.  The inode must exist already, must support bmap() and
1246 * must have all data blocks preallocated.
1247 */
1248journal_t *jbd2_journal_init_inode(struct inode *inode)
1249{
1250	journal_t *journal;
1251	sector_t blocknr;
1252	char *p;
1253	int err = 0;
1254
1255	blocknr = 0;
1256	err = bmap(inode, &blocknr);
1257
1258	if (err || !blocknr) {
 
1259		pr_err("%s: Cannot locate journal superblock\n",
1260			__func__);
1261		return NULL;
1262	}
1263
1264	jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n",
1265		  inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size,
1266		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
1267
1268	journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev,
1269			blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits,
1270			inode->i_sb->s_blocksize);
1271	if (!journal)
1272		return NULL;
1273
1274	journal->j_inode = inode;
1275	bdevname(journal->j_dev, journal->j_devname);
1276	p = strreplace(journal->j_devname, '/', '!');
1277	sprintf(p, "-%lu", journal->j_inode->i_ino);
1278	jbd2_stats_proc_init(journal);
1279
1280	return journal;
1281}
1282
1283/*
1284 * If the journal init or create aborts, we need to mark the journal
1285 * superblock as being NULL to prevent the journal destroy from writing
1286 * back a bogus superblock.
1287 */
1288static void journal_fail_superblock(journal_t *journal)
1289{
1290	struct buffer_head *bh = journal->j_sb_buffer;
1291	brelse(bh);
1292	journal->j_sb_buffer = NULL;
1293}
1294
1295/*
1296 * Given a journal_t structure, initialise the various fields for
1297 * startup of a new journaling session.  We use this both when creating
1298 * a journal, and after recovering an old journal to reset it for
1299 * subsequent use.
1300 */
1301
1302static int journal_reset(journal_t *journal)
1303{
1304	journal_superblock_t *sb = journal->j_superblock;
1305	unsigned long long first, last;
1306
1307	first = be32_to_cpu(sb->s_first);
1308	last = be32_to_cpu(sb->s_maxlen);
1309	if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1310		printk(KERN_ERR "JBD2: Journal too short (blocks %llu-%llu).\n",
1311		       first, last);
1312		journal_fail_superblock(journal);
1313		return -EINVAL;
1314	}
1315
1316	journal->j_first = first;
1317	journal->j_last = last;
1318
1319	journal->j_head = first;
1320	journal->j_tail = first;
1321	journal->j_free = last - first;
1322
1323	journal->j_tail_sequence = journal->j_transaction_sequence;
1324	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
1325	journal->j_commit_request = journal->j_commit_sequence;
1326
1327	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
1328
1329	/*
1330	 * As a special case, if the on-disk copy is already marked as needing
1331	 * no recovery (s_start == 0), then we can safely defer the superblock
1332	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
1333	 * attempting a write to a potential-readonly device.
1334	 */
1335	if (sb->s_start == 0) {
1336		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
1337			"(start %ld, seq %u, errno %d)\n",
1338			journal->j_tail, journal->j_tail_sequence,
1339			journal->j_errno);
1340		journal->j_flags |= JBD2_FLUSHED;
1341	} else {
1342		/* Lock here to make assertions happy... */
1343		mutex_lock_io(&journal->j_checkpoint_mutex);
1344		/*
1345		 * Update log tail information. We use REQ_FUA since new
1346		 * transaction will start reusing journal space and so we
1347		 * must make sure information about current log tail is on
1348		 * disk before that.
1349		 */
1350		jbd2_journal_update_sb_log_tail(journal,
1351						journal->j_tail_sequence,
1352						journal->j_tail,
1353						REQ_SYNC | REQ_FUA);
1354		mutex_unlock(&journal->j_checkpoint_mutex);
1355	}
1356	return jbd2_journal_start_thread(journal);
1357}
1358
1359/*
1360 * This function expects that the caller will have locked the journal
1361 * buffer head, and will return with it unlocked
1362 */
1363static int jbd2_write_superblock(journal_t *journal, int write_flags)
1364{
1365	struct buffer_head *bh = journal->j_sb_buffer;
1366	journal_superblock_t *sb = journal->j_superblock;
1367	int ret;
1368
1369	/* Buffer got discarded which means block device got invalidated */
1370	if (!buffer_mapped(bh)) {
1371		unlock_buffer(bh);
1372		return -EIO;
1373	}
1374
1375	trace_jbd2_write_superblock(journal, write_flags);
1376	if (!(journal->j_flags & JBD2_BARRIER))
1377		write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
 
1378	if (buffer_write_io_error(bh)) {
1379		/*
1380		 * Oh, dear.  A previous attempt to write the journal
1381		 * superblock failed.  This could happen because the
1382		 * USB device was yanked out.  Or it could happen to
1383		 * be a transient write error and maybe the block will
1384		 * be remapped.  Nothing we can do but to retry the
1385		 * write and hope for the best.
1386		 */
1387		printk(KERN_ERR "JBD2: previous I/O error detected "
1388		       "for journal superblock update for %s.\n",
1389		       journal->j_devname);
1390		clear_buffer_write_io_error(bh);
1391		set_buffer_uptodate(bh);
1392	}
1393	if (jbd2_journal_has_csum_v2or3(journal))
1394		sb->s_checksum = jbd2_superblock_csum(journal, sb);
1395	get_bh(bh);
1396	bh->b_end_io = end_buffer_write_sync;
1397	ret = submit_bh(REQ_OP_WRITE, write_flags, bh);
1398	wait_on_buffer(bh);
1399	if (buffer_write_io_error(bh)) {
1400		clear_buffer_write_io_error(bh);
1401		set_buffer_uptodate(bh);
1402		ret = -EIO;
1403	}
1404	if (ret) {
1405		printk(KERN_ERR "JBD2: Error %d detected when updating "
1406		       "journal superblock for %s.\n", ret,
1407		       journal->j_devname);
1408		if (!is_journal_aborted(journal))
1409			jbd2_journal_abort(journal, ret);
1410	}
1411
1412	return ret;
1413}
1414
1415/**
1416 * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
1417 * @journal: The journal to update.
1418 * @tail_tid: TID of the new transaction at the tail of the log
1419 * @tail_block: The first block of the transaction at the tail of the log
1420 * @write_op: With which operation should we write the journal sb
1421 *
1422 * Update a journal's superblock information about log tail and write it to
1423 * disk, waiting for the IO to complete.
1424 */
1425int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
1426				     unsigned long tail_block, int write_op)
1427{
1428	journal_superblock_t *sb = journal->j_superblock;
1429	int ret;
1430
1431	if (is_journal_aborted(journal))
1432		return -EIO;
1433
1434	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1435	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
1436		  tail_block, tail_tid);
1437
1438	lock_buffer(journal->j_sb_buffer);
1439	sb->s_sequence = cpu_to_be32(tail_tid);
1440	sb->s_start    = cpu_to_be32(tail_block);
1441
1442	ret = jbd2_write_superblock(journal, write_op);
1443	if (ret)
1444		goto out;
1445
1446	/* Log is no longer empty */
1447	write_lock(&journal->j_state_lock);
1448	WARN_ON(!sb->s_sequence);
1449	journal->j_flags &= ~JBD2_FLUSHED;
1450	write_unlock(&journal->j_state_lock);
1451
1452out:
1453	return ret;
1454}
1455
1456/**
1457 * jbd2_mark_journal_empty() - Mark on disk journal as empty.
1458 * @journal: The journal to update.
1459 * @write_op: With which operation should we write the journal sb
1460 *
1461 * Update a journal's dynamic superblock fields to show that journal is empty.
1462 * Write updated superblock to disk waiting for IO to complete.
1463 */
1464static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
1465{
1466	journal_superblock_t *sb = journal->j_superblock;
1467
1468	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
1469	lock_buffer(journal->j_sb_buffer);
1470	if (sb->s_start == 0) {		/* Is it already empty? */
1471		unlock_buffer(journal->j_sb_buffer);
 
1472		return;
1473	}
1474
1475	jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n",
1476		  journal->j_tail_sequence);
1477
1478	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
1479	sb->s_start    = cpu_to_be32(0);
 
1480
1481	jbd2_write_superblock(journal, write_op);
1482
1483	/* Log is no longer empty */
1484	write_lock(&journal->j_state_lock);
1485	journal->j_flags |= JBD2_FLUSHED;
1486	write_unlock(&journal->j_state_lock);
1487}
1488
1489
1490/**
1491 * jbd2_journal_update_sb_errno() - Update error in the journal.
1492 * @journal: The journal to update.
1493 *
1494 * Update a journal's errno.  Write updated superblock to disk waiting for IO
1495 * to complete.
1496 */
1497void jbd2_journal_update_sb_errno(journal_t *journal)
1498{
1499	journal_superblock_t *sb = journal->j_superblock;
1500	int errcode;
1501
1502	lock_buffer(journal->j_sb_buffer);
1503	errcode = journal->j_errno;
1504	if (errcode == -ESHUTDOWN)
1505		errcode = 0;
1506	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode);
1507	sb->s_errno    = cpu_to_be32(errcode);
1508
1509	jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA);
1510}
1511EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
1512
1513static int journal_revoke_records_per_block(journal_t *journal)
1514{
1515	int record_size;
1516	int space = journal->j_blocksize - sizeof(jbd2_journal_revoke_header_t);
1517
1518	if (jbd2_has_feature_64bit(journal))
1519		record_size = 8;
1520	else
1521		record_size = 4;
1522
1523	if (jbd2_journal_has_csum_v2or3(journal))
1524		space -= sizeof(struct jbd2_journal_block_tail);
1525	return space / record_size;
1526}
1527
1528/*
1529 * Read the superblock for a given journal, performing initial
1530 * validation of the format.
1531 */
1532static int journal_get_superblock(journal_t *journal)
1533{
1534	struct buffer_head *bh;
1535	journal_superblock_t *sb;
1536	int err = -EIO;
1537
1538	bh = journal->j_sb_buffer;
1539
1540	J_ASSERT(bh != NULL);
1541	if (!buffer_uptodate(bh)) {
1542		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
1543		wait_on_buffer(bh);
1544		if (!buffer_uptodate(bh)) {
1545			printk(KERN_ERR
1546				"JBD2: IO error reading journal superblock\n");
1547			goto out;
1548		}
1549	}
1550
1551	if (buffer_verified(bh))
1552		return 0;
1553
1554	sb = journal->j_superblock;
1555
1556	err = -EINVAL;
1557
1558	if (sb->s_header.h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER) ||
1559	    sb->s_blocksize != cpu_to_be32(journal->j_blocksize)) {
1560		printk(KERN_WARNING "JBD2: no valid journal superblock found\n");
1561		goto out;
1562	}
1563
1564	switch(be32_to_cpu(sb->s_header.h_blocktype)) {
1565	case JBD2_SUPERBLOCK_V1:
1566		journal->j_format_version = 1;
1567		break;
1568	case JBD2_SUPERBLOCK_V2:
1569		journal->j_format_version = 2;
1570		break;
1571	default:
1572		printk(KERN_WARNING "JBD2: unrecognised superblock format ID\n");
1573		goto out;
1574	}
1575
1576	if (be32_to_cpu(sb->s_maxlen) < journal->j_maxlen)
1577		journal->j_maxlen = be32_to_cpu(sb->s_maxlen);
1578	else if (be32_to_cpu(sb->s_maxlen) > journal->j_maxlen) {
1579		printk(KERN_WARNING "JBD2: journal file too short\n");
1580		goto out;
1581	}
1582
1583	if (be32_to_cpu(sb->s_first) == 0 ||
1584	    be32_to_cpu(sb->s_first) >= journal->j_maxlen) {
1585		printk(KERN_WARNING
1586			"JBD2: Invalid start block of journal: %u\n",
1587			be32_to_cpu(sb->s_first));
1588		goto out;
1589	}
1590
1591	if (jbd2_has_feature_csum2(journal) &&
1592	    jbd2_has_feature_csum3(journal)) {
1593		/* Can't have checksum v2 and v3 at the same time! */
1594		printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
1595		       "at the same time!\n");
1596		goto out;
1597	}
1598
1599	if (jbd2_journal_has_csum_v2or3_feature(journal) &&
1600	    jbd2_has_feature_checksum(journal)) {
1601		/* Can't have checksum v1 and v2 on at the same time! */
1602		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
1603		       "at the same time!\n");
1604		goto out;
1605	}
1606
1607	if (!jbd2_verify_csum_type(journal, sb)) {
1608		printk(KERN_ERR "JBD2: Unknown checksum type\n");
1609		goto out;
1610	}
1611
1612	/* Load the checksum driver */
1613	if (jbd2_journal_has_csum_v2or3_feature(journal)) {
1614		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1615		if (IS_ERR(journal->j_chksum_driver)) {
1616			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1617			err = PTR_ERR(journal->j_chksum_driver);
1618			journal->j_chksum_driver = NULL;
1619			goto out;
1620		}
1621	}
1622
1623	if (jbd2_journal_has_csum_v2or3(journal)) {
1624		/* Check superblock checksum */
1625		if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
1626			printk(KERN_ERR "JBD2: journal checksum error\n");
1627			err = -EFSBADCRC;
1628			goto out;
1629		}
1630
1631		/* Precompute checksum seed for all metadata */
 
1632		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1633						   sizeof(sb->s_uuid));
1634	}
1635
1636	journal->j_revoke_records_per_block =
1637				journal_revoke_records_per_block(journal);
1638	set_buffer_verified(bh);
1639
1640	return 0;
1641
1642out:
1643	journal_fail_superblock(journal);
1644	return err;
1645}
1646
1647/*
1648 * Load the on-disk journal superblock and read the key fields into the
1649 * journal_t.
1650 */
1651
1652static int load_superblock(journal_t *journal)
1653{
1654	int err;
1655	journal_superblock_t *sb;
1656
1657	err = journal_get_superblock(journal);
1658	if (err)
1659		return err;
1660
1661	sb = journal->j_superblock;
1662
1663	journal->j_tail_sequence = be32_to_cpu(sb->s_sequence);
1664	journal->j_tail = be32_to_cpu(sb->s_start);
1665	journal->j_first = be32_to_cpu(sb->s_first);
1666	journal->j_last = be32_to_cpu(sb->s_maxlen);
1667	journal->j_errno = be32_to_cpu(sb->s_errno);
1668
1669	return 0;
1670}
1671
1672
1673/**
1674 * int jbd2_journal_load() - Read journal from disk.
1675 * @journal: Journal to act on.
1676 *
1677 * Given a journal_t structure which tells us which disk blocks contain
1678 * a journal, read the journal from disk to initialise the in-memory
1679 * structures.
1680 */
1681int jbd2_journal_load(journal_t *journal)
1682{
1683	int err;
1684	journal_superblock_t *sb;
1685
1686	err = load_superblock(journal);
1687	if (err)
1688		return err;
1689
1690	sb = journal->j_superblock;
1691	/* If this is a V2 superblock, then we have to check the
1692	 * features flags on it. */
1693
1694	if (journal->j_format_version >= 2) {
1695		if ((sb->s_feature_ro_compat &
1696		     ~cpu_to_be32(JBD2_KNOWN_ROCOMPAT_FEATURES)) ||
1697		    (sb->s_feature_incompat &
1698		     ~cpu_to_be32(JBD2_KNOWN_INCOMPAT_FEATURES))) {
1699			printk(KERN_WARNING
1700				"JBD2: Unrecognised features on journal\n");
1701			return -EINVAL;
1702		}
1703	}
1704
1705	/*
1706	 * Create a slab for this blocksize
1707	 */
1708	err = jbd2_journal_create_slab(be32_to_cpu(sb->s_blocksize));
1709	if (err)
1710		return err;
1711
1712	/* Let the recovery code check whether it needs to recover any
1713	 * data from the journal. */
1714	if (jbd2_journal_recover(journal))
1715		goto recovery_error;
1716
1717	if (journal->j_failed_commit) {
1718		printk(KERN_ERR "JBD2: journal transaction %u on %s "
1719		       "is corrupt.\n", journal->j_failed_commit,
1720		       journal->j_devname);
1721		return -EFSCORRUPTED;
1722	}
1723	/*
1724	 * clear JBD2_ABORT flag initialized in journal_init_common
1725	 * here to update log tail information with the newest seq.
1726	 */
1727	journal->j_flags &= ~JBD2_ABORT;
1728
1729	/* OK, we've finished with the dynamic journal bits:
1730	 * reinitialise the dynamic contents of the superblock in memory
1731	 * and reset them on disk. */
1732	if (journal_reset(journal))
1733		goto recovery_error;
1734
 
1735	journal->j_flags |= JBD2_LOADED;
1736	return 0;
1737
1738recovery_error:
1739	printk(KERN_WARNING "JBD2: recovery failed\n");
1740	return -EIO;
1741}
1742
1743/**
1744 * void jbd2_journal_destroy() - Release a journal_t structure.
1745 * @journal: Journal to act on.
1746 *
1747 * Release a journal_t structure once it is no longer in use by the
1748 * journaled object.
1749 * Return <0 if we couldn't clean up the journal.
1750 */
1751int jbd2_journal_destroy(journal_t *journal)
1752{
1753	int err = 0;
1754
1755	/* Wait for the commit thread to wake up and die. */
1756	journal_kill_thread(journal);
1757
1758	/* Force a final log commit */
1759	if (journal->j_running_transaction)
1760		jbd2_journal_commit_transaction(journal);
1761
1762	/* Force any old transactions to disk */
1763
1764	/* Totally anal locking here... */
1765	spin_lock(&journal->j_list_lock);
1766	while (journal->j_checkpoint_transactions != NULL) {
1767		spin_unlock(&journal->j_list_lock);
1768		mutex_lock_io(&journal->j_checkpoint_mutex);
1769		err = jbd2_log_do_checkpoint(journal);
1770		mutex_unlock(&journal->j_checkpoint_mutex);
1771		/*
1772		 * If checkpointing failed, just free the buffers to avoid
1773		 * looping forever
1774		 */
1775		if (err) {
1776			jbd2_journal_destroy_checkpoint(journal);
1777			spin_lock(&journal->j_list_lock);
1778			break;
1779		}
1780		spin_lock(&journal->j_list_lock);
1781	}
1782
1783	J_ASSERT(journal->j_running_transaction == NULL);
1784	J_ASSERT(journal->j_committing_transaction == NULL);
1785	J_ASSERT(journal->j_checkpoint_transactions == NULL);
1786	spin_unlock(&journal->j_list_lock);
1787
1788	if (journal->j_sb_buffer) {
1789		if (!is_journal_aborted(journal)) {
1790			mutex_lock_io(&journal->j_checkpoint_mutex);
1791
1792			write_lock(&journal->j_state_lock);
1793			journal->j_tail_sequence =
1794				++journal->j_transaction_sequence;
1795			write_unlock(&journal->j_state_lock);
1796
1797			jbd2_mark_journal_empty(journal,
1798					REQ_SYNC | REQ_PREFLUSH | REQ_FUA);
1799			mutex_unlock(&journal->j_checkpoint_mutex);
1800		} else
1801			err = -EIO;
1802		brelse(journal->j_sb_buffer);
1803	}
1804
1805	if (journal->j_proc_entry)
1806		jbd2_stats_proc_exit(journal);
1807	iput(journal->j_inode);
1808	if (journal->j_revoke)
1809		jbd2_journal_destroy_revoke(journal);
1810	if (journal->j_chksum_driver)
1811		crypto_free_shash(journal->j_chksum_driver);
1812	kfree(journal->j_wbuf);
1813	kfree(journal);
1814
1815	return err;
1816}
1817
1818
1819/**
1820 *int jbd2_journal_check_used_features() - Check if features specified are used.
1821 * @journal: Journal to check.
1822 * @compat: bitmask of compatible features
1823 * @ro: bitmask of features that force read-only mount
1824 * @incompat: bitmask of incompatible features
1825 *
1826 * Check whether the journal uses all of a given set of
1827 * features.  Return true (non-zero) if it does.
1828 **/
1829
1830int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
1831				 unsigned long ro, unsigned long incompat)
1832{
1833	journal_superblock_t *sb;
1834
1835	if (!compat && !ro && !incompat)
1836		return 1;
1837	/* Load journal superblock if it is not loaded yet. */
1838	if (journal->j_format_version == 0 &&
1839	    journal_get_superblock(journal) != 0)
1840		return 0;
1841	if (journal->j_format_version == 1)
1842		return 0;
1843
1844	sb = journal->j_superblock;
1845
1846	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
1847	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
1848	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
1849		return 1;
1850
1851	return 0;
1852}
1853
1854/**
1855 * int jbd2_journal_check_available_features() - Check feature set in journalling layer
1856 * @journal: Journal to check.
1857 * @compat: bitmask of compatible features
1858 * @ro: bitmask of features that force read-only mount
1859 * @incompat: bitmask of incompatible features
1860 *
1861 * Check whether the journaling code supports the use of
1862 * all of a given set of features on this journal.  Return true
1863 * (non-zero) if it can. */
1864
1865int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
1866				      unsigned long ro, unsigned long incompat)
1867{
1868	if (!compat && !ro && !incompat)
1869		return 1;
1870
1871	/* We can support any known requested features iff the
1872	 * superblock is in version 2.  Otherwise we fail to support any
1873	 * extended sb features. */
1874
1875	if (journal->j_format_version != 2)
1876		return 0;
1877
1878	if ((compat   & JBD2_KNOWN_COMPAT_FEATURES) == compat &&
1879	    (ro       & JBD2_KNOWN_ROCOMPAT_FEATURES) == ro &&
1880	    (incompat & JBD2_KNOWN_INCOMPAT_FEATURES) == incompat)
1881		return 1;
1882
1883	return 0;
1884}
1885
1886/**
1887 * int jbd2_journal_set_features() - Mark a given journal feature in the superblock
1888 * @journal: Journal to act on.
1889 * @compat: bitmask of compatible features
1890 * @ro: bitmask of features that force read-only mount
1891 * @incompat: bitmask of incompatible features
1892 *
1893 * Mark a given journal feature as present on the
1894 * superblock.  Returns true if the requested features could be set.
1895 *
1896 */
1897
1898int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
1899			  unsigned long ro, unsigned long incompat)
1900{
1901#define INCOMPAT_FEATURE_ON(f) \
1902		((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
1903#define COMPAT_FEATURE_ON(f) \
1904		((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
1905	journal_superblock_t *sb;
1906
1907	if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
1908		return 1;
1909
1910	if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1911		return 0;
1912
1913	/* If enabling v2 checksums, turn on v3 instead */
1914	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1915		incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1916		incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1917	}
1918
1919	/* Asking for checksumming v3 and v1?  Only give them v3. */
1920	if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1921	    compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1922		compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1923
1924	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
1925		  compat, ro, incompat);
1926
1927	sb = journal->j_superblock;
1928
1929	/* Load the checksum driver if necessary */
1930	if ((journal->j_chksum_driver == NULL) &&
1931	    INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1932		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1933		if (IS_ERR(journal->j_chksum_driver)) {
1934			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
1935			journal->j_chksum_driver = NULL;
1936			return 0;
1937		}
1938		/* Precompute checksum seed for all metadata */
1939		journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1940						   sizeof(sb->s_uuid));
1941	}
1942
1943	lock_buffer(journal->j_sb_buffer);
1944
1945	/* If enabling v3 checksums, update superblock */
1946	if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1947		sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1948		sb->s_feature_compat &=
1949			~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1950	}
1951
1952	/* If enabling v1 checksums, downgrade superblock */
1953	if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1954		sb->s_feature_incompat &=
1955			~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1956				     JBD2_FEATURE_INCOMPAT_CSUM_V3);
1957
1958	sb->s_feature_compat    |= cpu_to_be32(compat);
1959	sb->s_feature_ro_compat |= cpu_to_be32(ro);
1960	sb->s_feature_incompat  |= cpu_to_be32(incompat);
1961	unlock_buffer(journal->j_sb_buffer);
1962	journal->j_revoke_records_per_block =
1963				journal_revoke_records_per_block(journal);
1964
1965	return 1;
1966#undef COMPAT_FEATURE_ON
1967#undef INCOMPAT_FEATURE_ON
1968}
1969
1970/*
1971 * jbd2_journal_clear_features () - Clear a given journal feature in the
1972 * 				    superblock
1973 * @journal: Journal to act on.
1974 * @compat: bitmask of compatible features
1975 * @ro: bitmask of features that force read-only mount
1976 * @incompat: bitmask of incompatible features
1977 *
1978 * Clear a given journal feature as present on the
1979 * superblock.
1980 */
1981void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
1982				unsigned long ro, unsigned long incompat)
1983{
1984	journal_superblock_t *sb;
1985
1986	jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
1987		  compat, ro, incompat);
1988
1989	sb = journal->j_superblock;
1990
1991	sb->s_feature_compat    &= ~cpu_to_be32(compat);
1992	sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
1993	sb->s_feature_incompat  &= ~cpu_to_be32(incompat);
1994	journal->j_revoke_records_per_block =
1995				journal_revoke_records_per_block(journal);
1996}
1997EXPORT_SYMBOL(jbd2_journal_clear_features);
1998
1999/**
2000 * int jbd2_journal_flush () - Flush journal
2001 * @journal: Journal to act on.
2002 *
2003 * Flush all data for a given journal to disk and empty the journal.
2004 * Filesystems can use this when remounting readonly to ensure that
2005 * recovery does not need to happen on remount.
2006 */
2007
2008int jbd2_journal_flush(journal_t *journal)
2009{
2010	int err = 0;
2011	transaction_t *transaction = NULL;
2012
2013	write_lock(&journal->j_state_lock);
2014
2015	/* Force everything buffered to the log... */
2016	if (journal->j_running_transaction) {
2017		transaction = journal->j_running_transaction;
2018		__jbd2_log_start_commit(journal, transaction->t_tid);
2019	} else if (journal->j_committing_transaction)
2020		transaction = journal->j_committing_transaction;
2021
2022	/* Wait for the log commit to complete... */
2023	if (transaction) {
2024		tid_t tid = transaction->t_tid;
2025
2026		write_unlock(&journal->j_state_lock);
2027		jbd2_log_wait_commit(journal, tid);
2028	} else {
2029		write_unlock(&journal->j_state_lock);
2030	}
2031
2032	/* ...and flush everything in the log out to disk. */
2033	spin_lock(&journal->j_list_lock);
2034	while (!err && journal->j_checkpoint_transactions != NULL) {
2035		spin_unlock(&journal->j_list_lock);
2036		mutex_lock_io(&journal->j_checkpoint_mutex);
2037		err = jbd2_log_do_checkpoint(journal);
2038		mutex_unlock(&journal->j_checkpoint_mutex);
2039		spin_lock(&journal->j_list_lock);
2040	}
2041	spin_unlock(&journal->j_list_lock);
2042
2043	if (is_journal_aborted(journal))
2044		return -EIO;
2045
2046	mutex_lock_io(&journal->j_checkpoint_mutex);
2047	if (!err) {
2048		err = jbd2_cleanup_journal_tail(journal);
2049		if (err < 0) {
2050			mutex_unlock(&journal->j_checkpoint_mutex);
2051			goto out;
2052		}
2053		err = 0;
2054	}
2055
2056	/* Finally, mark the journal as really needing no recovery.
2057	 * This sets s_start==0 in the underlying superblock, which is
2058	 * the magic code for a fully-recovered superblock.  Any future
2059	 * commits of data to the journal will restore the current
2060	 * s_start value. */
2061	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
2062	mutex_unlock(&journal->j_checkpoint_mutex);
2063	write_lock(&journal->j_state_lock);
2064	J_ASSERT(!journal->j_running_transaction);
2065	J_ASSERT(!journal->j_committing_transaction);
2066	J_ASSERT(!journal->j_checkpoint_transactions);
2067	J_ASSERT(journal->j_head == journal->j_tail);
2068	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
2069	write_unlock(&journal->j_state_lock);
2070out:
2071	return err;
2072}
2073
2074/**
2075 * int jbd2_journal_wipe() - Wipe journal contents
2076 * @journal: Journal to act on.
2077 * @write: flag (see below)
2078 *
2079 * Wipe out all of the contents of a journal, safely.  This will produce
2080 * a warning if the journal contains any valid recovery information.
2081 * Must be called between journal_init_*() and jbd2_journal_load().
2082 *
2083 * If 'write' is non-zero, then we wipe out the journal on disk; otherwise
2084 * we merely suppress recovery.
2085 */
2086
2087int jbd2_journal_wipe(journal_t *journal, int write)
2088{
2089	int err = 0;
2090
2091	J_ASSERT (!(journal->j_flags & JBD2_LOADED));
2092
2093	err = load_superblock(journal);
2094	if (err)
2095		return err;
2096
2097	if (!journal->j_tail)
2098		goto no_recovery;
2099
2100	printk(KERN_WARNING "JBD2: %s recovery information on journal\n",
2101		write ? "Clearing" : "Ignoring");
2102
2103	err = jbd2_journal_skip_recovery(journal);
2104	if (write) {
2105		/* Lock to make assertions happy... */
2106		mutex_lock_io(&journal->j_checkpoint_mutex);
2107		jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
2108		mutex_unlock(&journal->j_checkpoint_mutex);
2109	}
2110
2111 no_recovery:
2112	return err;
2113}
2114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2115/**
2116 * void jbd2_journal_abort () - Shutdown the journal immediately.
2117 * @journal: the journal to shutdown.
2118 * @errno:   an error number to record in the journal indicating
2119 *           the reason for the shutdown.
2120 *
2121 * Perform a complete, immediate shutdown of the ENTIRE
2122 * journal (not of a single transaction).  This operation cannot be
2123 * undone without closing and reopening the journal.
2124 *
2125 * The jbd2_journal_abort function is intended to support higher level error
2126 * recovery mechanisms such as the ext2/ext3 remount-readonly error
2127 * mode.
2128 *
2129 * Journal abort has very specific semantics.  Any existing dirty,
2130 * unjournaled buffers in the main filesystem will still be written to
2131 * disk by bdflush, but the journaling mechanism will be suspended
2132 * immediately and no further transaction commits will be honoured.
2133 *
2134 * Any dirty, journaled buffers will be written back to disk without
2135 * hitting the journal.  Atomicity cannot be guaranteed on an aborted
2136 * filesystem, but we _do_ attempt to leave as much data as possible
2137 * behind for fsck to use for cleanup.
2138 *
2139 * Any attempt to get a new transaction handle on a journal which is in
2140 * ABORT state will just result in an -EROFS error return.  A
2141 * jbd2_journal_stop on an existing handle will return -EIO if we have
2142 * entered abort state during the update.
2143 *
2144 * Recursive transactions are not disturbed by journal abort until the
2145 * final jbd2_journal_stop, which will receive the -EIO error.
2146 *
2147 * Finally, the jbd2_journal_abort call allows the caller to supply an errno
2148 * which will be recorded (if possible) in the journal superblock.  This
2149 * allows a client to record failure conditions in the middle of a
2150 * transaction without having to complete the transaction to record the
2151 * failure to disk.  ext3_error, for example, now uses this
2152 * functionality.
2153 *
 
 
 
 
 
2154 */
2155
2156void jbd2_journal_abort(journal_t *journal, int errno)
2157{
2158	transaction_t *transaction;
2159
2160	/*
2161	 * Lock the aborting procedure until everything is done, this avoid
2162	 * races between filesystem's error handling flow (e.g. ext4_abort()),
2163	 * ensure panic after the error info is written into journal's
2164	 * superblock.
2165	 */
2166	mutex_lock(&journal->j_abort_mutex);
2167	/*
2168	 * ESHUTDOWN always takes precedence because a file system check
2169	 * caused by any other journal abort error is not required after
2170	 * a shutdown triggered.
2171	 */
2172	write_lock(&journal->j_state_lock);
2173	if (journal->j_flags & JBD2_ABORT) {
2174		int old_errno = journal->j_errno;
2175
2176		write_unlock(&journal->j_state_lock);
2177		if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
2178			journal->j_errno = errno;
2179			jbd2_journal_update_sb_errno(journal);
2180		}
2181		mutex_unlock(&journal->j_abort_mutex);
2182		return;
2183	}
2184
2185	/*
2186	 * Mark the abort as occurred and start current running transaction
2187	 * to release all journaled buffer.
2188	 */
2189	pr_err("Aborting journal on device %s.\n", journal->j_devname);
2190
2191	journal->j_flags |= JBD2_ABORT;
2192	journal->j_errno = errno;
2193	transaction = journal->j_running_transaction;
2194	if (transaction)
2195		__jbd2_log_start_commit(journal, transaction->t_tid);
2196	write_unlock(&journal->j_state_lock);
2197
2198	/*
2199	 * Record errno to the journal super block, so that fsck and jbd2
2200	 * layer could realise that a filesystem check is needed.
2201	 */
2202	jbd2_journal_update_sb_errno(journal);
2203	mutex_unlock(&journal->j_abort_mutex);
2204}
2205
2206/**
2207 * int jbd2_journal_errno () - returns the journal's error state.
2208 * @journal: journal to examine.
2209 *
2210 * This is the errno number set with jbd2_journal_abort(), the last
2211 * time the journal was mounted - if the journal was stopped
2212 * without calling abort this will be 0.
2213 *
2214 * If the journal has been aborted on this mount time -EROFS will
2215 * be returned.
2216 */
2217int jbd2_journal_errno(journal_t *journal)
2218{
2219	int err;
2220
2221	read_lock(&journal->j_state_lock);
2222	if (journal->j_flags & JBD2_ABORT)
2223		err = -EROFS;
2224	else
2225		err = journal->j_errno;
2226	read_unlock(&journal->j_state_lock);
2227	return err;
2228}
2229
2230/**
2231 * int jbd2_journal_clear_err () - clears the journal's error state
2232 * @journal: journal to act on.
2233 *
2234 * An error must be cleared or acked to take a FS out of readonly
2235 * mode.
2236 */
2237int jbd2_journal_clear_err(journal_t *journal)
2238{
2239	int err = 0;
2240
2241	write_lock(&journal->j_state_lock);
2242	if (journal->j_flags & JBD2_ABORT)
2243		err = -EROFS;
2244	else
2245		journal->j_errno = 0;
2246	write_unlock(&journal->j_state_lock);
2247	return err;
2248}
2249
2250/**
2251 * void jbd2_journal_ack_err() - Ack journal err.
2252 * @journal: journal to act on.
2253 *
2254 * An error must be cleared or acked to take a FS out of readonly
2255 * mode.
2256 */
2257void jbd2_journal_ack_err(journal_t *journal)
2258{
2259	write_lock(&journal->j_state_lock);
2260	if (journal->j_errno)
2261		journal->j_flags |= JBD2_ACK_ERR;
2262	write_unlock(&journal->j_state_lock);
2263}
2264
2265int jbd2_journal_blocks_per_page(struct inode *inode)
2266{
2267	return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
2268}
2269
2270/*
2271 * helper functions to deal with 32 or 64bit block numbers.
2272 */
2273size_t journal_tag_bytes(journal_t *journal)
2274{
2275	size_t sz;
2276
2277	if (jbd2_has_feature_csum3(journal))
2278		return sizeof(journal_block_tag3_t);
2279
2280	sz = sizeof(journal_block_tag_t);
2281
2282	if (jbd2_has_feature_csum2(journal))
2283		sz += sizeof(__u16);
2284
2285	if (jbd2_has_feature_64bit(journal))
2286		return sz;
2287	else
2288		return sz - sizeof(__u32);
2289}
2290
2291/*
2292 * JBD memory management
2293 *
2294 * These functions are used to allocate block-sized chunks of memory
2295 * used for making copies of buffer_head data.  Very often it will be
2296 * page-sized chunks of data, but sometimes it will be in
2297 * sub-page-size chunks.  (For example, 16k pages on Power systems
2298 * with a 4k block file system.)  For blocks smaller than a page, we
2299 * use a SLAB allocator.  There are slab caches for each block size,
2300 * which are allocated at mount time, if necessary, and we only free
2301 * (all of) the slab caches when/if the jbd2 module is unloaded.  For
2302 * this reason we don't need a mutex to protect access to
2303 * jbd2_slab[] when allocating or releasing memory; only in
2304 * jbd2_journal_create_slab().
2305 */
/* Number of supported sub-page block sizes: 1k, 2k, ... 128k. */
#define JBD2_MAX_SLABS 8

/*
 * jbd2_slab[i] is the cache for blocks of (1 << (i + 10)) bytes; an entry
 * stays NULL until jbd2_journal_create_slab() has created that cache.
 */
static struct kmem_cache *jbd2_slab[JBD2_MAX_SLABS];

/*
 * Cache names, indexed like jbd2_slab[].  Both the strings and the
 * pointer table are constant, so the table itself is declared const too.
 */
static const char * const jbd2_slab_names[JBD2_MAX_SLABS] = {
	"jbd2_1k", "jbd2_2k", "jbd2_4k", "jbd2_8k",
	"jbd2_16k", "jbd2_32k", "jbd2_64k", "jbd2_128k"
};
2313
2314
2315static void jbd2_journal_destroy_slabs(void)
2316{
2317	int i;
2318
2319	for (i = 0; i < JBD2_MAX_SLABS; i++) {
2320		kmem_cache_destroy(jbd2_slab[i]);
 
2321		jbd2_slab[i] = NULL;
2322	}
2323}
2324
2325static int jbd2_journal_create_slab(size_t size)
2326{
2327	static DEFINE_MUTEX(jbd2_slab_create_mutex);
2328	int i = order_base_2(size) - 10;
2329	size_t slab_size;
2330
2331	if (size == PAGE_SIZE)
2332		return 0;
2333
2334	if (i >= JBD2_MAX_SLABS)
2335		return -EINVAL;
2336
2337	if (unlikely(i < 0))
2338		i = 0;
2339	mutex_lock(&jbd2_slab_create_mutex);
2340	if (jbd2_slab[i]) {
2341		mutex_unlock(&jbd2_slab_create_mutex);
2342		return 0;	/* Already created */
2343	}
2344
2345	slab_size = 1 << (i+10);
2346	jbd2_slab[i] = kmem_cache_create(jbd2_slab_names[i], slab_size,
2347					 slab_size, 0, NULL);
2348	mutex_unlock(&jbd2_slab_create_mutex);
2349	if (!jbd2_slab[i]) {
2350		printk(KERN_EMERG "JBD2: no memory for jbd2_slab cache\n");
2351		return -ENOMEM;
2352	}
2353	return 0;
2354}
2355
2356static struct kmem_cache *get_slab(size_t size)
2357{
2358	int i = order_base_2(size) - 10;
2359
2360	BUG_ON(i >= JBD2_MAX_SLABS);
2361	if (unlikely(i < 0))
2362		i = 0;
2363	BUG_ON(jbd2_slab[i] == NULL);
2364	return jbd2_slab[i];
2365}
2366
2367void *jbd2_alloc(size_t size, gfp_t flags)
2368{
2369	void *ptr;
2370
2371	BUG_ON(size & (size-1)); /* Must be a power of 2 */
2372
2373	if (size < PAGE_SIZE)
2374		ptr = kmem_cache_alloc(get_slab(size), flags);
2375	else
2376		ptr = (void *)__get_free_pages(flags, get_order(size));
2377
2378	/* Check alignment; SLUB has gotten this wrong in the past,
2379	 * and this can lead to user data corruption! */
2380	BUG_ON(((unsigned long) ptr) & (size-1));
2381
2382	return ptr;
2383}
2384
2385void jbd2_free(void *ptr, size_t size)
2386{
2387	if (size < PAGE_SIZE)
2388		kmem_cache_free(get_slab(size), ptr);
2389	else
2390		free_pages((unsigned long)ptr, get_order(size));
2391};
2392
2393/*
2394 * Journal_head storage management
2395 */
/*
 * Slab cache from which every journal_head is allocated; created by
 * jbd2_journal_init_journal_head_cache() and reset to NULL when destroyed.
 */
2396static struct kmem_cache *jbd2_journal_head_cache;
2397#ifdef CONFIG_JBD2_DEBUG
/*
 * Debug-only count of live journal_heads: bumped on allocation, dropped on
 * free, and reported at module exit if it is not zero.
 */
2398static atomic_t nr_journal_heads = ATOMIC_INIT(0);
2399#endif
2400
2401static int __init jbd2_journal_init_journal_head_cache(void)
2402{
2403	J_ASSERT(!jbd2_journal_head_cache);
 
 
2404	jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head",
2405				sizeof(struct journal_head),
2406				0,		/* offset */
2407				SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU,
2408				NULL);		/* ctor */
 
2409	if (!jbd2_journal_head_cache) {
 
2410		printk(KERN_EMERG "JBD2: no memory for journal_head cache\n");
2411		return -ENOMEM;
2412	}
2413	return 0;
2414}
2415
2416static void jbd2_journal_destroy_journal_head_cache(void)
2417{
2418	kmem_cache_destroy(jbd2_journal_head_cache);
2419	jbd2_journal_head_cache = NULL;
 
 
2420}
2421
2422/*
2423 * journal_head splicing and dicing
2424 */
2425static struct journal_head *journal_alloc_journal_head(void)
2426{
2427	struct journal_head *ret;
2428
2429#ifdef CONFIG_JBD2_DEBUG
2430	atomic_inc(&nr_journal_heads);
2431#endif
2432	ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS);
2433	if (!ret) {
2434		jbd_debug(1, "out of memory for journal_head\n");
2435		pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__);
2436		ret = kmem_cache_zalloc(jbd2_journal_head_cache,
2437				GFP_NOFS | __GFP_NOFAIL);
2438	}
2439	if (ret)
2440		spin_lock_init(&ret->b_state_lock);
2441	return ret;
2442}
2443
2444static void journal_free_journal_head(struct journal_head *jh)
2445{
2446#ifdef CONFIG_JBD2_DEBUG
2447	atomic_dec(&nr_journal_heads);
2448	memset(jh, JBD2_POISON_FREE, sizeof(*jh));
2449#endif
2450	kmem_cache_free(jbd2_journal_head_cache, jh);
2451}
2452
2453/*
2454 * A journal_head is attached to a buffer_head whenever JBD has an
2455 * interest in the buffer.
2456 *
2457 * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
2458 * is set.  This bit is tested in core kernel code where we need to take
2459 * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
2460 * there.
2461 *
2462 * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
2463 *
2464 * When a buffer has its BH_JBD bit set it is immune from being released by
2465 * core kernel code, mainly via ->b_count.
2466 *
2467 * A journal_head is detached from its buffer_head when the journal_head's
2468 * b_jcount reaches zero. Running transaction (b_transaction) and checkpoint
2469 * transaction (b_cp_transaction) hold their references to b_jcount.
2470 *
2471 * Various places in the kernel want to attach a journal_head to a buffer_head
2472 * _before_ attaching the journal_head to a transaction.  To protect the
2473 * journal_head in this situation, jbd2_journal_add_journal_head elevates the
2474 * journal_head's b_jcount refcount by one.  The caller must call
2475 * jbd2_journal_put_journal_head() to undo this.
2476 *
2477 * So the typical usage would be:
2478 *
2479 *	(Attach a journal_head if needed.  Increments b_jcount)
2480 *	struct journal_head *jh = jbd2_journal_add_journal_head(bh);
2481 *	...
2482 *      (Get another reference for transaction)
2483 *	jbd2_journal_grab_journal_head(bh);
2484 *	jh->b_transaction = xxx;
2485 *	(Put original reference)
2486 *	jbd2_journal_put_journal_head(jh);
2487 */
2488
2489/*
2490 * Give a buffer_head a journal_head.
2491 *
2492 * May sleep.
2493 */
2494struct journal_head *jbd2_journal_add_journal_head(struct buffer_head *bh)
2495{
2496	struct journal_head *jh;
2497	struct journal_head *new_jh = NULL;
2498
2499repeat:
2500	if (!buffer_jbd(bh))
2501		new_jh = journal_alloc_journal_head();
2502
2503	jbd_lock_bh_journal_head(bh);
2504	if (buffer_jbd(bh)) {
2505		jh = bh2jh(bh);
2506	} else {
2507		J_ASSERT_BH(bh,
2508			(atomic_read(&bh->b_count) > 0) ||
2509			(bh->b_page && bh->b_page->mapping));
2510
2511		if (!new_jh) {
2512			jbd_unlock_bh_journal_head(bh);
2513			goto repeat;
2514		}
2515
2516		jh = new_jh;
2517		new_jh = NULL;		/* We consumed it */
2518		set_buffer_jbd(bh);
2519		bh->b_private = jh;
2520		jh->b_bh = bh;
2521		get_bh(bh);
2522		BUFFER_TRACE(bh, "added journal_head");
2523	}
2524	jh->b_jcount++;
2525	jbd_unlock_bh_journal_head(bh);
2526	if (new_jh)
2527		journal_free_journal_head(new_jh);
2528	return bh->b_private;
2529}
2530
2531/*
2532 * Grab a ref against this buffer_head's journal_head.  If it ended up not
2533 * having a journal_head, return NULL
2534 */
2535struct journal_head *jbd2_journal_grab_journal_head(struct buffer_head *bh)
2536{
2537	struct journal_head *jh = NULL;
2538
2539	jbd_lock_bh_journal_head(bh);
2540	if (buffer_jbd(bh)) {
2541		jh = bh2jh(bh);
2542		jh->b_jcount++;
2543	}
2544	jbd_unlock_bh_journal_head(bh);
2545	return jh;
2546}
2547
/*
 * Detach the journal_head from @bh: clear bh->b_private, the
 * journal_head's back pointer and the buffer's JBD bit.
 *
 * The journal_head must already be off every transaction and list (see
 * the assertions below).  It is not freed here and the buffer reference
 * is not dropped; the visible caller, jbd2_journal_put_journal_head(),
 * does both after calling this with the bh journal-head lock held.
 */
2548static void __journal_remove_journal_head(struct buffer_head *bh)
2549{
2550	struct journal_head *jh = bh2jh(bh);
2551
 
	/* Sanity checks: no transaction may still reference this head. */
2552	J_ASSERT_JH(jh, jh->b_transaction == NULL);
2553	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
2554	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);
2555	J_ASSERT_JH(jh, jh->b_jlist == BJ_None);
2556	J_ASSERT_BH(bh, buffer_jbd(bh));
2557	J_ASSERT_BH(bh, jh2bh(jh) == bh);
2558	BUFFER_TRACE(bh, "remove journal_head");
2559
2560	/* Unlink before dropping the lock */
2561	bh->b_private = NULL;
2562	jh->b_bh = NULL;	/* debug, really */
2563	clear_buffer_jbd(bh);
2564}
2565
2566static void journal_release_journal_head(struct journal_head *jh, size_t b_size)
2567{
2568	if (jh->b_frozen_data) {
2569		printk(KERN_WARNING "%s: freeing b_frozen_data\n", __func__);
2570		jbd2_free(jh->b_frozen_data, b_size);
2571	}
2572	if (jh->b_committed_data) {
2573		printk(KERN_WARNING "%s: freeing b_committed_data\n", __func__);
2574		jbd2_free(jh->b_committed_data, b_size);
2575	}
 
 
 
2576	journal_free_journal_head(jh);
2577}
2578
2579/*
2580 * Drop a reference on the passed journal_head.  If it fell to zero then
2581 * release the journal_head from the buffer_head.
2582 */
2583void jbd2_journal_put_journal_head(struct journal_head *jh)
2584{
2585	struct buffer_head *bh = jh2bh(jh);
2586
2587	jbd_lock_bh_journal_head(bh);
2588	J_ASSERT_JH(jh, jh->b_jcount > 0);
2589	--jh->b_jcount;
2590	if (!jh->b_jcount) {
2591		__journal_remove_journal_head(bh);
2592		jbd_unlock_bh_journal_head(bh);
2593		journal_release_journal_head(jh, bh->b_size);
2594		__brelse(bh);
2595	} else {
2596		jbd_unlock_bh_journal_head(bh);
2597	}
2598}
2599
2600/*
2601 * Initialize jbd inode head
2602 */
2603void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
2604{
2605	jinode->i_transaction = NULL;
2606	jinode->i_next_transaction = NULL;
2607	jinode->i_vfs_inode = inode;
2608	jinode->i_flags = 0;
2609	jinode->i_dirty_start = 0;
2610	jinode->i_dirty_end = 0;
2611	INIT_LIST_HEAD(&jinode->i_list);
2612}
2613
2614/*
2615 * Function to be called before we start removing inode from memory (i.e.,
2616 * clear_inode() is a fine place to be called from). It removes inode from
2617 * transaction's lists.
2618 */
2619void jbd2_journal_release_jbd_inode(journal_t *journal,
2620				    struct jbd2_inode *jinode)
2621{
2622	if (!journal)
2623		return;
2624restart:
2625	spin_lock(&journal->j_list_lock);
2626	/* Is commit writing out inode - we have to wait */
2627	if (jinode->i_flags & JI_COMMIT_RUNNING) {
2628		wait_queue_head_t *wq;
2629		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
2630		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
2631		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
2632		spin_unlock(&journal->j_list_lock);
2633		schedule();
2634		finish_wait(wq, &wait.wq_entry);
2635		goto restart;
2636	}
2637
2638	if (jinode->i_transaction) {
2639		list_del(&jinode->i_list);
2640		jinode->i_transaction = NULL;
2641	}
2642	spin_unlock(&journal->j_list_lock);
2643}
2644
2645
#ifdef CONFIG_PROC_FS

/* Directory (relative to the proc root) holding the jbd2 statistics. */
#define JBD2_STATS_PROC_NAME "fs/jbd2"

/* Create the jbd2 stats directory and remember its entry. */
static void __init jbd2_create_jbd_stats_proc_entry(void)
{
	proc_jbd2_stats = proc_mkdir(JBD2_STATS_PROC_NAME, NULL);
}

/* Remove the jbd2 stats directory, if it was created. */
static void __exit jbd2_remove_jbd_stats_proc_entry(void)
{
	if (!proc_jbd2_stats)
		return;

	remove_proc_entry(JBD2_STATS_PROC_NAME, NULL);
}

#else

/* Without procfs both helpers compile away to nothing. */
#define jbd2_create_jbd_stats_proc_entry() do {} while (0)
#define jbd2_remove_jbd_stats_proc_entry() do {} while (0)

#endif
2667
/*
 * Slab caches for journal handles and jbd2 inodes.  Both have external
 * linkage; they are created by jbd2_journal_init_handle_cache() and
 * jbd2_journal_init_inode_cache() and reset to NULL when destroyed.
 */
2668struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
2669
2670static int __init jbd2_journal_init_inode_cache(void)
2671{
2672	J_ASSERT(!jbd2_inode_cache);
2673	jbd2_inode_cache = KMEM_CACHE(jbd2_inode, 0);
2674	if (!jbd2_inode_cache) {
2675		pr_emerg("JBD2: failed to create inode cache\n");
2676		return -ENOMEM;
2677	}
2678	return 0;
2679}
2680
2681static int __init jbd2_journal_init_handle_cache(void)
2682{
2683	J_ASSERT(!jbd2_handle_cache);
2684	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
2685	if (!jbd2_handle_cache) {
2686		printk(KERN_EMERG "JBD2: failed to create handle cache\n");
2687		return -ENOMEM;
2688	}
 
 
 
 
 
 
2689	return 0;
2690}
2691
2692static void jbd2_journal_destroy_inode_cache(void)
2693{
2694	kmem_cache_destroy(jbd2_inode_cache);
2695	jbd2_inode_cache = NULL;
2696}
 
2697
2698static void jbd2_journal_destroy_handle_cache(void)
2699{
2700	kmem_cache_destroy(jbd2_handle_cache);
2701	jbd2_handle_cache = NULL;
2702}
2703
2704/*
2705 * Module startup and shutdown
2706 */
2707
2708static int __init journal_init_caches(void)
2709{
2710	int ret;
2711
2712	ret = jbd2_journal_init_revoke_record_cache();
2713	if (ret == 0)
2714		ret = jbd2_journal_init_revoke_table_cache();
2715	if (ret == 0)
2716		ret = jbd2_journal_init_journal_head_cache();
2717	if (ret == 0)
2718		ret = jbd2_journal_init_handle_cache();
2719	if (ret == 0)
2720		ret = jbd2_journal_init_inode_cache();
2721	if (ret == 0)
2722		ret = jbd2_journal_init_transaction_cache();
2723	return ret;
2724}
2725
/*
 * Tear down every jbd2 cache, including the per-block-size slabs.
 *
 * Called at module exit and also from journal_init() when cache creation
 * failed part-way, so it may run with some caches never created.
 * NOTE(review): that presumably relies on kmem_cache_destroy() accepting
 * a NULL cache - confirm against the slab API.
 */
2726static void jbd2_journal_destroy_caches(void)
2727{
2728	jbd2_journal_destroy_revoke_record_cache();
2729	jbd2_journal_destroy_revoke_table_cache();
2730	jbd2_journal_destroy_journal_head_cache();
2731	jbd2_journal_destroy_handle_cache();
2732	jbd2_journal_destroy_inode_cache();
2733	jbd2_journal_destroy_transaction_cache();
2734	jbd2_journal_destroy_slabs();
2735}
2736
2737static int __init journal_init(void)
2738{
2739	int ret;
2740
2741	BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024);
2742
2743	ret = journal_init_caches();
2744	if (ret == 0) {
2745		jbd2_create_jbd_stats_proc_entry();
2746	} else {
2747		jbd2_journal_destroy_caches();
2748	}
2749	return ret;
2750}
2751
2752static void __exit journal_exit(void)
2753{
2754#ifdef CONFIG_JBD2_DEBUG
2755	int n = atomic_read(&nr_journal_heads);
2756	if (n)
2757		printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
2758#endif
2759	jbd2_remove_jbd_stats_proc_entry();
2760	jbd2_journal_destroy_caches();
2761}
2762
/*
 * Module metadata and entry points: journal_init() is registered as the
 * init function and journal_exit() as the exit function.
 */
2763MODULE_LICENSE("GPL");
2764module_init(journal_init);
2765module_exit(journal_exit);
2766