fast_commit.c - fs/ext4/fast_commit.c - Linux diff v6.2

   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(). This makes the next fast commit operation fall
 * back to a full commit.
 
 
 
 
 
 
 
 
 
 
 
  73 *
  74 * Atomicity of commits
  75 * --------------------
  76 * In order to guarantee atomicity during the commit operation, fast commit
  77 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  78 * tag contains CRC of the contents and TID of the transaction after which
  79 * this fast commit should be applied. Recovery code replays fast commit
  80 * logs only if there's at least 1 valid tail present. For every fast commit
  81 * operation, there is 1 tail. This means, we may end up with multiple tails
  82 * in the fast commit space. Here's an example:
  83 *
  84 * - Create a new file A and remove existing file B
  85 * - fsync()
  86 * - Append contents to file A
  87 * - Truncate file A
  88 * - fsync()
  89 *
  90 * The fast commit space at the end of above operations would look like this:
  91 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
  92 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
  93 *
  94 * Replay code should thus check for all the valid tails in the FC area.
  95 *
  96 * Fast Commit Replay Idempotence
  97 * ------------------------------
  98 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 100 * certain rules. The guiding principle that the commit path follows while
 101 * committing is that it stores the result of a particular operation instead of
 102 * storing the procedure.
 103 *
 104 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 105 * was associated with inode 10. During fast commit, instead of storing this
 106 * operation as a procedure "rename a to b", we store the resulting file system
 107 * state as a "series" of outcomes:
 108 *
 109 * - Link dirent b to inode 10
 110 * - Unlink dirent a
 111 * - Inode <10> with valid refcount
 112 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 114 * system. This is what guarantees idempotence of fast commit replay.
 115 *
 116 * Let's take an example of a procedure that is not idempotent and see how fast
 117 * commits make it idempotent. Consider following sequence of operations:
 118 *
 119 *     rm A;    mv B A;    read A
 120 *  (x)     (y)        (z)
 121 *
 122 * (x), (y) and (z) are the points at which we can crash. If we store this
 123 * sequence of operations as is then the replay is not idempotent. Let's say
 124 * while in replay, we crash at (z). During the second replay, file A (which was
 125 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 126 * file named A would be absent when we try to read A. So, this sequence of
 127 * operations is not idempotent. However, as mentioned above, instead of storing
 128 * the procedure fast commits store the outcome of each procedure. Thus the fast
 129 * commit log for above procedure would be as follows:
 130 *
 131 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 132 * inode 11 before the replay)
 133 *
 134 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 135 * (w)          (x)                    (y)          (z)
 136 *
 137 * If we crash at (z), we will have file A linked to inode 11. During the second
 138 * replay, we will remove file A (inode 11). But we will create it back and make
 139 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 140 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 141 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 142 * similarly. Thus, by converting a non-idempotent procedure into a series of
 143 * idempotent outcomes, fast commits ensured idempotence during the replay.
 144 *
 145 * TODOs
 146 * -----
 147 *
 148 * 0) Fast commit replay path hardening: Fast commit replay code should use
 149 *    journal handles to make sure all the updates it does during the replay
 150 *    path are atomic. With that if we crash during fast commit replay, after
 151 *    trying to do recovery again, we will find a file system where fast commit
 152 *    area is invalid (because new full commit would be found). In order to deal
 153 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 154 *    superblock state is persisted before starting the replay, so that after
 155 *    the crash, fast commit recovery code can look at that flag and perform
 156 *    fast commit recovery even if that area is invalidated by later full
 157 *    commits.
 158 *
 159 * 1) Fast commit's commit path locks the entire file system during fast
 160 *    commit. This has significant performance penalty. Instead of that, we
 161 *    should use ext4_fc_start/stop_update functions to start inode level
 162 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 163 *    system locking during commit path.
 
 
 164 *
 165 * 2) Handle more ineligible cases.
 166 */
 167
 168#include <trace/events/ext4.h>
 169static struct kmem_cache *ext4_fc_dentry_cachep;
 170
 171static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 172{
 173	BUFFER_TRACE(bh, "");
 174	if (uptodate) {
 175		ext4_debug("%s: Block %lld up-to-date",
 176			   __func__, bh->b_blocknr);
 177		set_buffer_uptodate(bh);
 178	} else {
 179		ext4_debug("%s: Block %lld not up-to-date",
 180			   __func__, bh->b_blocknr);
 181		clear_buffer_uptodate(bh);
 182	}
 183
 184	unlock_buffer(bh);
 185}
 186
 187static inline void ext4_fc_reset_inode(struct inode *inode)
 188{
 189	struct ext4_inode_info *ei = EXT4_I(inode);
 190
 191	ei->i_fc_lblk_start = 0;
 192	ei->i_fc_lblk_len = 0;
 193}
 194
 195void ext4_fc_init_inode(struct inode *inode)
 196{
 197	struct ext4_inode_info *ei = EXT4_I(inode);
 198
 199	ext4_fc_reset_inode(inode);
 200	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 201	INIT_LIST_HEAD(&ei->i_fc_list);
 202	INIT_LIST_HEAD(&ei->i_fc_dilist);
 203	init_waitqueue_head(&ei->i_fc_wait);
 204	atomic_set(&ei->i_fc_updates, 0);
 205}
 206
 207/* This function must be called with sbi->s_fc_lock held. */
 208static void ext4_fc_wait_committing_inode(struct inode *inode)
 209__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 210{
 211	wait_queue_head_t *wq;
 212	struct ext4_inode_info *ei = EXT4_I(inode);
 213
 214#if (BITS_PER_LONG < 64)
 215	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 216			EXT4_STATE_FC_COMMITTING);
 217	wq = bit_waitqueue(&ei->i_state_flags,
 218				EXT4_STATE_FC_COMMITTING);
 219#else
 220	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 221			EXT4_STATE_FC_COMMITTING);
 222	wq = bit_waitqueue(&ei->i_flags,
 223				EXT4_STATE_FC_COMMITTING);
 224#endif
 225	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 226	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 227	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 228	schedule();
 229	finish_wait(wq, &wait.wq_entry);
 230}
 231
 232static bool ext4_fc_disabled(struct super_block *sb)
 233{
 234	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 235		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
 236}
 237
 238/*
 239 * Inform Ext4's fast about start of an inode update
 240 *
 241 * This function is called by the high level call VFS callbacks before
 242 * performing any inode update. This function blocks if there's an ongoing
 243 * fast commit on the inode in question.
 244 */
 245void ext4_fc_start_update(struct inode *inode)
 246{
 247	struct ext4_inode_info *ei = EXT4_I(inode);
 248
 249	if (ext4_fc_disabled(inode->i_sb))
 
 250		return;
 251
 252restart:
 253	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 254	if (list_empty(&ei->i_fc_list))
 255		goto out;
 256
 257	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 258		ext4_fc_wait_committing_inode(inode);
 259		goto restart;
 260	}
 261out:
 262	atomic_inc(&ei->i_fc_updates);
 263	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 264}
 265
 266/*
 267 * Stop inode update and wake up waiting fast commits if any.
 268 */
 269void ext4_fc_stop_update(struct inode *inode)
 270{
 271	struct ext4_inode_info *ei = EXT4_I(inode);
 272
 273	if (ext4_fc_disabled(inode->i_sb))
 
 274		return;
 275
 276	if (atomic_dec_and_test(&ei->i_fc_updates))
 277		wake_up_all(&ei->i_fc_wait);
 278}
 279
 280/*
 281 * Remove inode from fast commit list. If the inode is being committed
 282 * we wait until inode commit is done.
 283 */
 284void ext4_fc_del(struct inode *inode)
 285{
 286	struct ext4_inode_info *ei = EXT4_I(inode);
 287	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 288	struct ext4_fc_dentry_update *fc_dentry;
 289
 290	if (ext4_fc_disabled(inode->i_sb))
 
 291		return;
 292
 293restart:
 294	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 295	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
 296		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 297		return;
 298	}
 299
 300	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 301		ext4_fc_wait_committing_inode(inode);
 302		goto restart;
 303	}
 
 
 
 304
 305	if (!list_empty(&ei->i_fc_list))
 306		list_del_init(&ei->i_fc_list);
 
 
 
 
 
 307
 308	/*
 309	 * Since this inode is getting removed, let's also remove all FC
 310	 * dentry create references, since it is not needed to log it anyways.
 311	 */
 312	if (list_empty(&ei->i_fc_dilist)) {
 313		spin_unlock(&sbi->s_fc_lock);
 314		return;
 315	}
 316
 317	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
 318	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 319	list_del_init(&fc_dentry->fcd_list);
 320	list_del_init(&fc_dentry->fcd_dilist);
 321
 322	WARN_ON(!list_empty(&ei->i_fc_dilist));
 323	spin_unlock(&sbi->s_fc_lock);
 324
 325	if (fc_dentry->fcd_name.name &&
 326		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
 327		kfree(fc_dentry->fcd_name.name);
 328	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
 329
 330	return;
 
 
 331}
 332
 333/*
 334 * Mark file system as fast commit ineligible, and record latest
 335 * ineligible transaction tid. This means until the recorded
 336 * transaction, commit operation would result in a full jbd2 commit.
 337 */
 338void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
 339{
 340	struct ext4_sb_info *sbi = EXT4_SB(sb);
 341	tid_t tid;
 342
 343	if (ext4_fc_disabled(sb))
 
 344		return;
 345
 346	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 347	if (handle && !IS_ERR(handle))
 348		tid = handle->h_transaction->t_tid;
 349	else {
 350		read_lock(&sbi->s_journal->j_state_lock);
 351		tid = sbi->s_journal->j_running_transaction ?
 352				sbi->s_journal->j_running_transaction->t_tid : 0;
 353		read_unlock(&sbi->s_journal->j_state_lock);
 354	}
 355	spin_lock(&sbi->s_fc_lock);
 356	if (sbi->s_fc_ineligible_tid < tid)
 357		sbi->s_fc_ineligible_tid = tid;
 358	spin_unlock(&sbi->s_fc_lock);
 359	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 360	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 361}
 362
 363/*
 364 * Generic fast commit tracking function. If this is the first time this we are
 365 * called after a full commit, we initialize fast commit fields and then call
 366 * __fc_track_fn() with update = 0. If we have already been called after a full
 367 * commit, we pass update = 1. Based on that, the track function can determine
 368 * if it needs to track a field for the first time or if it needs to just
 369 * update the previously tracked value.
 370 *
 371 * If enqueue is set, this function enqueues the inode in fast commit list.
 372 */
 373static int ext4_fc_track_template(
 374	handle_t *handle, struct inode *inode,
 375	int (*__fc_track_fn)(struct inode *, void *, bool),
 376	void *args, int enqueue)
 377{
 378	bool update = false;
 379	struct ext4_inode_info *ei = EXT4_I(inode);
 380	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 381	tid_t tid = 0;
 382	int ret;
 383
 
 
 
 
 
 
 
 384	tid = handle->h_transaction->t_tid;
 385	mutex_lock(&ei->i_fc_lock);
 386	if (tid == ei->i_sync_tid) {
 387		update = true;
 388	} else {
 389		ext4_fc_reset_inode(inode);
 390		ei->i_sync_tid = tid;
 391	}
 392	ret = __fc_track_fn(inode, args, update);
 393	mutex_unlock(&ei->i_fc_lock);
 394
 395	if (!enqueue)
 396		return ret;
 397
 398	spin_lock(&sbi->s_fc_lock);
 399	if (list_empty(&EXT4_I(inode)->i_fc_list))
 400		list_add_tail(&EXT4_I(inode)->i_fc_list,
 401				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 402				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
 403				&sbi->s_fc_q[FC_Q_STAGING] :
 404				&sbi->s_fc_q[FC_Q_MAIN]);
 405	spin_unlock(&sbi->s_fc_lock);
 406
 407	return ret;
 408}
 409
/*
 * Argument bundle handed through ext4_fc_track_template() to
 * __track_dentry_update().
 */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being changed */
	int op;			/* EXT4_FC_TAG_UNLINK, _LINK or _CREAT */
};
 414
 415/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 416static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 417{
 418	struct ext4_fc_dentry_update *node;
 419	struct ext4_inode_info *ei = EXT4_I(inode);
 420	struct __track_dentry_update_args *dentry_update =
 421		(struct __track_dentry_update_args *)arg;
 422	struct dentry *dentry = dentry_update->dentry;
 423	struct inode *dir = dentry->d_parent->d_inode;
 424	struct super_block *sb = inode->i_sb;
 425	struct ext4_sb_info *sbi = EXT4_SB(sb);
 426
 427	mutex_unlock(&ei->i_fc_lock);
 428
 429	if (IS_ENCRYPTED(dir)) {
 430		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
 431					NULL);
 432		mutex_lock(&ei->i_fc_lock);
 433		return -EOPNOTSUPP;
 434	}
 435
 436	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 437	if (!node) {
 438		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
 439		mutex_lock(&ei->i_fc_lock);
 440		return -ENOMEM;
 441	}
 442
 443	node->fcd_op = dentry_update->op;
 444	node->fcd_parent = dir->i_ino;
 445	node->fcd_ino = inode->i_ino;
 446	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 447		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 448		if (!node->fcd_name.name) {
 449			kmem_cache_free(ext4_fc_dentry_cachep, node);
 450			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
 
 451			mutex_lock(&ei->i_fc_lock);
 452			return -ENOMEM;
 453		}
 454		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 455			dentry->d_name.len);
 456	} else {
 457		memcpy(node->fcd_iname, dentry->d_name.name,
 458			dentry->d_name.len);
 459		node->fcd_name.name = node->fcd_iname;
 460	}
 461	node->fcd_name.len = dentry->d_name.len;
 462	INIT_LIST_HEAD(&node->fcd_dilist);
 463	spin_lock(&sbi->s_fc_lock);
 464	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 465		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
 466		list_add_tail(&node->fcd_list,
 467				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 468	else
 469		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 470
 471	/*
 472	 * This helps us keep a track of all fc_dentry updates which is part of
 473	 * this ext4 inode. So in case the inode is getting unlinked, before
 474	 * even we get a chance to fsync, we could remove all fc_dentry
 475	 * references while evicting the inode in ext4_fc_del().
 476	 * Also with this, we don't need to loop over all the inodes in
 477	 * sbi->s_fc_q to get the corresponding inode in
 478	 * ext4_fc_commit_dentry_updates().
 479	 */
 480	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
 481		WARN_ON(!list_empty(&ei->i_fc_dilist));
 482		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
 483	}
 484	spin_unlock(&sbi->s_fc_lock);
 485	mutex_lock(&ei->i_fc_lock);
 486
 487	return 0;
 488}
 489
 490void __ext4_fc_track_unlink(handle_t *handle,
 491		struct inode *inode, struct dentry *dentry)
 492{
 493	struct __track_dentry_update_args args;
 494	int ret;
 495
 496	args.dentry = dentry;
 497	args.op = EXT4_FC_TAG_UNLINK;
 498
 499	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 500					(void *)&args, 0);
 501	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
 502}
 503
 504void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 505{
 506	struct inode *inode = d_inode(dentry);
 507
 508	if (ext4_fc_disabled(inode->i_sb))
 509		return;
 510
 511	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 512		return;
 513
 514	__ext4_fc_track_unlink(handle, inode, dentry);
 515}
 516
 517void __ext4_fc_track_link(handle_t *handle,
 518	struct inode *inode, struct dentry *dentry)
 519{
 520	struct __track_dentry_update_args args;
 521	int ret;
 522
 523	args.dentry = dentry;
 524	args.op = EXT4_FC_TAG_LINK;
 525
 526	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 527					(void *)&args, 0);
 528	trace_ext4_fc_track_link(handle, inode, dentry, ret);
 529}
 530
 531void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 532{
 533	struct inode *inode = d_inode(dentry);
 534
 535	if (ext4_fc_disabled(inode->i_sb))
 536		return;
 537
 538	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 539		return;
 540
 541	__ext4_fc_track_link(handle, inode, dentry);
 542}
 543
 544void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 545			  struct dentry *dentry)
 546{
 547	struct __track_dentry_update_args args;
 548	int ret;
 549
 550	args.dentry = dentry;
 551	args.op = EXT4_FC_TAG_CREAT;
 552
 553	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 554					(void *)&args, 0);
 555	trace_ext4_fc_track_create(handle, inode, dentry, ret);
 556}
 557
 558void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 559{
 560	struct inode *inode = d_inode(dentry);
 561
 562	if (ext4_fc_disabled(inode->i_sb))
 563		return;
 564
 565	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 566		return;
 567
 568	__ext4_fc_track_create(handle, inode, dentry);
 569}
 570
 571/* __track_fn for inode tracking */
 572static int __track_inode(struct inode *inode, void *arg, bool update)
 573{
 574	if (update)
 575		return -EEXIST;
 576
 577	EXT4_I(inode)->i_fc_lblk_len = 0;
 578
 579	return 0;
 580}
 581
 582void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 583{
 584	int ret;
 585
 586	if (S_ISDIR(inode->i_mode))
 587		return;
 588
 589	if (ext4_fc_disabled(inode->i_sb))
 590		return;
 591
 592	if (ext4_should_journal_data(inode)) {
 593		ext4_fc_mark_ineligible(inode->i_sb,
 594					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
 595		return;
 596	}
 597
 598	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 599		return;
 600
 601	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 602	trace_ext4_fc_track_inode(handle, inode, ret);
 603}
 604
 605struct __track_range_args {
 606	ext4_lblk_t start, end;
 607};
 608
 609/* __track_fn for tracking data updates */
 610static int __track_range(struct inode *inode, void *arg, bool update)
 611{
 612	struct ext4_inode_info *ei = EXT4_I(inode);
 613	ext4_lblk_t oldstart;
 614	struct __track_range_args *__arg =
 615		(struct __track_range_args *)arg;
 616
 617	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 618		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 619		return -ECANCELED;
 620	}
 621
 622	oldstart = ei->i_fc_lblk_start;
 623
 624	if (update && ei->i_fc_lblk_len > 0) {
 625		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 626		ei->i_fc_lblk_len =
 627			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 628				ei->i_fc_lblk_start + 1;
 629	} else {
 630		ei->i_fc_lblk_start = __arg->start;
 631		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 632	}
 633
 634	return 0;
 635}
 636
 637void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 638			 ext4_lblk_t end)
 639{
 640	struct __track_range_args args;
 641	int ret;
 642
 643	if (S_ISDIR(inode->i_mode))
 644		return;
 645
 646	if (ext4_fc_disabled(inode->i_sb))
 647		return;
 648
 649	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 650		return;
 651
 652	args.start = start;
 653	args.end = end;
 654
 655	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 656
 657	trace_ext4_fc_track_range(handle, inode, start, end, ret);
 658}
 659
 660static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 661{
 662	blk_opf_t write_flags = REQ_SYNC;
 663	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 664
 665	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 666	if (test_opt(sb, BARRIER) && is_tail)
 667		write_flags |= REQ_FUA | REQ_PREFLUSH;
 668	lock_buffer(bh);
 669	set_buffer_dirty(bh);
 670	set_buffer_uptodate(bh);
 671	bh->b_end_io = ext4_end_buffer_io_sync;
 672	submit_bh(REQ_OP_WRITE | write_flags, bh);
 673	EXT4_SB(sb)->s_fc_bh = NULL;
 674}
 675
 676/* Ext4 commit path routines */
 677
 
 
 
 
 
 
 
 
 
 
 
 
 678/*
 679 * Allocate len bytes on a fast commit buffer.
 680 *
 681 * During the commit time this function is used to manage fast commit
 682 * block space. We don't split a fast commit log onto different
 683 * blocks. So this function makes sure that if there's not enough space
 684 * on the current block, the remaining space in the current block is
 685 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 686 * new block is from jbd2 and CRC is updated to reflect the padding
 687 * we added.
 688 */
 689static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 690{
 691	struct ext4_fc_tl tl;
 692	struct ext4_sb_info *sbi = EXT4_SB(sb);
 693	struct buffer_head *bh;
 694	int bsize = sbi->s_journal->j_blocksize;
 695	int ret, off = sbi->s_fc_bytes % bsize;
 696	int remaining;
 697	u8 *dst;
 698
 699	/*
 700	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
 701	 * cannot fulfill the request.
 702	 */
 703	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
 704		return NULL;
 705
 706	if (!sbi->s_fc_bh) {
 707		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 708		if (ret)
 709			return NULL;
 710		sbi->s_fc_bh = bh;
 711	}
 712	dst = sbi->s_fc_bh->b_data + off;
 713
 714	/*
 715	 * Allocate the bytes in the current block if we can do so while still
 716	 * leaving enough space for a PAD tlv.
 717	 */
 718	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
 719	if (len <= remaining) {
 720		sbi->s_fc_bytes += len;
 721		return dst;
 722	}
 723
 724	/*
 725	 * Else, terminate the current block with a PAD tlv, then allocate a new
 726	 * block and allocate the bytes at the start of that new block.
 727	 */
 728
 729	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 730	tl.fc_len = cpu_to_le16(remaining);
 731	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 732	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
 733	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
 734
 735	ext4_fc_submit_bh(sb, false);
 736
 737	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 738	if (ret)
 739		return NULL;
 740	sbi->s_fc_bh = bh;
 741	sbi->s_fc_bytes += bsize - off + len;
 742	return sbi->s_fc_bh->b_data;
 743}
 744
 
 
 
 
 
 
 
 
 
 745/*
 746 * Complete a fast commit by writing tail tag.
 747 *
 748 * Writing tail tag marks the end of a fast commit. In order to guarantee
 749 * atomicity, after writing tail tag, even if there's space remaining
 750 * in the block, next commit shouldn't use it. That's why tail tag
 751 * has the length as that of the remaining space on the block.
 752 */
 753static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 754{
 755	struct ext4_sb_info *sbi = EXT4_SB(sb);
 756	struct ext4_fc_tl tl;
 757	struct ext4_fc_tail tail;
 758	int off, bsize = sbi->s_journal->j_blocksize;
 759	u8 *dst;
 760
 761	/*
 762	 * ext4_fc_reserve_space takes care of allocating an extra block if
 763	 * there's no enough space on this block for accommodating this tail.
 764	 */
 765	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
 766	if (!dst)
 767		return -ENOSPC;
 768
 769	off = sbi->s_fc_bytes % bsize;
 770
 771	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 772	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
 773	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 774
 775	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 776	dst += EXT4_FC_TAG_BASE_LEN;
 777	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 778	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
 779	dst += sizeof(tail.fc_tid);
 780	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
 781			  dst - (u8 *)sbi->s_fc_bh->b_data);
 782	tail.fc_crc = cpu_to_le32(crc);
 783	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
 784	dst += sizeof(tail.fc_crc);
 785	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
 786
 787	ext4_fc_submit_bh(sb, true);
 788
 789	return 0;
 790}
 791
 792/*
 793 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 794 * Returns false if there's not enough space.
 795 */
 796static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 797			   u32 *crc)
 798{
 799	struct ext4_fc_tl tl;
 800	u8 *dst;
 801
 802	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
 803	if (!dst)
 804		return false;
 805
 806	tl.fc_tag = cpu_to_le16(tag);
 807	tl.fc_len = cpu_to_le16(len);
 808
 809	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 810	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
 811
 812	return true;
 813}
 814
 815/* Same as above, but adds dentry tlv. */
 816static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 817				   struct ext4_fc_dentry_update *fc_dentry)
 
 
 818{
 819	struct ext4_fc_dentry_info fcd;
 820	struct ext4_fc_tl tl;
 821	int dlen = fc_dentry->fcd_name.len;
 822	u8 *dst = ext4_fc_reserve_space(sb,
 823			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
 824
 825	if (!dst)
 826		return false;
 827
 828	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 829	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 830	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 831	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 832	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 833	dst += EXT4_FC_TAG_BASE_LEN;
 834	memcpy(dst, &fcd, sizeof(fcd));
 835	dst += sizeof(fcd);
 836	memcpy(dst, fc_dentry->fcd_name.name, dlen);
 
 837
 838	return true;
 839}
 840
 841/*
 842 * Writes inode in the fast commit space under TLV with tag @tag.
 843 * Returns 0 on success, error on failure.
 844 */
 845static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 846{
 847	struct ext4_inode_info *ei = EXT4_I(inode);
 848	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 849	int ret;
 850	struct ext4_iloc iloc;
 851	struct ext4_fc_inode fc_inode;
 852	struct ext4_fc_tl tl;
 853	u8 *dst;
 854
 855	ret = ext4_get_inode_loc(inode, &iloc);
 856	if (ret)
 857		return ret;
 858
 859	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 860		inode_len = EXT4_INODE_SIZE(inode->i_sb);
 861	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 862		inode_len += ei->i_extra_isize;
 863
 864	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 865	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 866	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 867
 868	ret = -ECANCELED;
 869	dst = ext4_fc_reserve_space(inode->i_sb,
 870		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
 871	if (!dst)
 872		goto err;
 873
 874	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 875	dst += EXT4_FC_TAG_BASE_LEN;
 876	memcpy(dst, &fc_inode, sizeof(fc_inode));
 
 
 877	dst += sizeof(fc_inode);
 878	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
 879	ret = 0;
 880err:
 881	brelse(iloc.bh);
 882	return ret;
 883}
 884
 885/*
 886 * Writes updated data ranges for the inode in question. Updates CRC.
 887 * Returns 0 on success, error otherwise.
 888 */
 889static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 890{
 891	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 892	struct ext4_inode_info *ei = EXT4_I(inode);
 893	struct ext4_map_blocks map;
 894	struct ext4_fc_add_range fc_ext;
 895	struct ext4_fc_del_range lrange;
 896	struct ext4_extent *ex;
 897	int ret;
 898
 899	mutex_lock(&ei->i_fc_lock);
 900	if (ei->i_fc_lblk_len == 0) {
 901		mutex_unlock(&ei->i_fc_lock);
 902		return 0;
 903	}
 904	old_blk_size = ei->i_fc_lblk_start;
 905	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 906	ei->i_fc_lblk_len = 0;
 907	mutex_unlock(&ei->i_fc_lock);
 908
 909	cur_lblk_off = old_blk_size;
 910	ext4_debug("will try writing %d to %d for inode %ld\n",
 911		   cur_lblk_off, new_blk_size, inode->i_ino);
 912
 913	while (cur_lblk_off <= new_blk_size) {
 914		map.m_lblk = cur_lblk_off;
 915		map.m_len = new_blk_size - cur_lblk_off + 1;
 916		ret = ext4_map_blocks(NULL, inode, &map, 0);
 917		if (ret < 0)
 918			return -ECANCELED;
 919
 920		if (map.m_len == 0) {
 921			cur_lblk_off++;
 922			continue;
 923		}
 924
 925		if (ret == 0) {
 926			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 927			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 928			lrange.fc_len = cpu_to_le32(map.m_len);
 929			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 930					    sizeof(lrange), (u8 *)&lrange, crc))
 931				return -ENOSPC;
 932		} else {
 933			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 934				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 935
 936			/* Limit the number of blocks in one extent */
 937			map.m_len = min(max, map.m_len);
 938
 939			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 940			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 941			ex->ee_block = cpu_to_le32(map.m_lblk);
 942			ex->ee_len = cpu_to_le16(map.m_len);
 943			ext4_ext_store_pblock(ex, map.m_pblk);
 944			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 945				ext4_ext_mark_unwritten(ex);
 946			else
 947				ext4_ext_mark_initialized(ex);
 948			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 949					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 950				return -ENOSPC;
 951		}
 952
 953		cur_lblk_off += map.m_len;
 954	}
 955
 956	return 0;
 957}
 958
 959
 960/* Submit data for all the fast commit inodes */
 961static int ext4_fc_submit_inode_data_all(journal_t *journal)
 962{
 963	struct super_block *sb = journal->j_private;
 964	struct ext4_sb_info *sbi = EXT4_SB(sb);
 965	struct ext4_inode_info *ei;
 966	int ret = 0;
 967
 968	spin_lock(&sbi->s_fc_lock);
 
 969	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 970		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 971		while (atomic_read(&ei->i_fc_updates)) {
 972			DEFINE_WAIT(wait);
 973
 974			prepare_to_wait(&ei->i_fc_wait, &wait,
 975						TASK_UNINTERRUPTIBLE);
 976			if (atomic_read(&ei->i_fc_updates)) {
 977				spin_unlock(&sbi->s_fc_lock);
 978				schedule();
 979				spin_lock(&sbi->s_fc_lock);
 980			}
 981			finish_wait(&ei->i_fc_wait, &wait);
 982		}
 983		spin_unlock(&sbi->s_fc_lock);
 984		ret = jbd2_submit_inode_data(journal, ei->jinode);
 985		if (ret)
 986			return ret;
 987		spin_lock(&sbi->s_fc_lock);
 988	}
 989	spin_unlock(&sbi->s_fc_lock);
 990
 991	return ret;
 992}
 993
 994/* Wait for completion of data for all the fast commit inodes */
 995static int ext4_fc_wait_inode_data_all(journal_t *journal)
 996{
 997	struct super_block *sb = journal->j_private;
 998	struct ext4_sb_info *sbi = EXT4_SB(sb);
 999	struct ext4_inode_info *pos, *n;
1000	int ret = 0;
1001
1002	spin_lock(&sbi->s_fc_lock);
1003	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1004		if (!ext4_test_inode_state(&pos->vfs_inode,
1005					   EXT4_STATE_FC_COMMITTING))
1006			continue;
1007		spin_unlock(&sbi->s_fc_lock);
1008
1009		ret = jbd2_wait_inode_data(journal, pos->jinode);
1010		if (ret)
1011			return ret;
1012		spin_lock(&sbi->s_fc_lock);
1013	}
1014	spin_unlock(&sbi->s_fc_lock);
1015
1016	return 0;
1017}
1018
1019/* Commit all the directory entry updates */
1020static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1021__acquires(&sbi->s_fc_lock)
1022__releases(&sbi->s_fc_lock)
1023{
1024	struct super_block *sb = journal->j_private;
1025	struct ext4_sb_info *sbi = EXT4_SB(sb);
1026	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1027	struct inode *inode;
1028	struct ext4_inode_info *ei;
1029	int ret;
1030
1031	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1032		return 0;
1033	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1034				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1035		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1036			spin_unlock(&sbi->s_fc_lock);
1037			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
 
 
 
 
1038				ret = -ENOSPC;
1039				goto lock_and_exit;
1040			}
1041			spin_lock(&sbi->s_fc_lock);
1042			continue;
1043		}
 
 
 
 
 
 
 
 
 
1044		/*
1045		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1046		 * corresponding inode pointer
1047		 */
1048		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1049		ei = list_first_entry(&fc_dentry->fcd_dilist,
1050				struct ext4_inode_info, i_fc_dilist);
1051		inode = &ei->vfs_inode;
1052		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1053
1054		spin_unlock(&sbi->s_fc_lock);
1055
1056		/*
1057		 * We first write the inode and then the create dirent. This
1058		 * allows the recovery code to create an unnamed inode first
1059		 * and then link it to a directory entry. This allows us
1060		 * to use namei.c routines almost as is and simplifies
1061		 * the recovery code.
1062		 */
1063		ret = ext4_fc_write_inode(inode, crc);
1064		if (ret)
1065			goto lock_and_exit;
1066
1067		ret = ext4_fc_write_inode_data(inode, crc);
1068		if (ret)
1069			goto lock_and_exit;
1070
1071		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
 
 
 
 
1072			ret = -ENOSPC;
1073			goto lock_and_exit;
1074		}
1075
1076		spin_lock(&sbi->s_fc_lock);
1077	}
1078	return 0;
1079lock_and_exit:
1080	spin_lock(&sbi->s_fc_lock);
1081	return ret;
1082}
1083
1084static int ext4_fc_perform_commit(journal_t *journal)
1085{
1086	struct super_block *sb = journal->j_private;
1087	struct ext4_sb_info *sbi = EXT4_SB(sb);
1088	struct ext4_inode_info *iter;
1089	struct ext4_fc_head head;
1090	struct inode *inode;
1091	struct blk_plug plug;
1092	int ret = 0;
1093	u32 crc = 0;
1094
1095	ret = ext4_fc_submit_inode_data_all(journal);
1096	if (ret)
1097		return ret;
1098
1099	ret = ext4_fc_wait_inode_data_all(journal);
1100	if (ret)
1101		return ret;
1102
1103	/*
1104	 * If file system device is different from journal device, issue a cache
1105	 * flush before we start writing fast commit blocks.
1106	 */
1107	if (journal->j_fs_dev != journal->j_dev)
1108		blkdev_issue_flush(journal->j_fs_dev);
1109
1110	blk_start_plug(&plug);
1111	if (sbi->s_fc_bytes == 0) {
1112		/*
1113		 * Add a head tag only if this is the first fast commit
1114		 * in this TID.
1115		 */
1116		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1117		head.fc_tid = cpu_to_le32(
1118			sbi->s_journal->j_running_transaction->t_tid);
1119		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1120			(u8 *)&head, &crc)) {
1121			ret = -ENOSPC;
1122			goto out;
1123		}
1124	}
1125
1126	spin_lock(&sbi->s_fc_lock);
1127	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1128	if (ret) {
1129		spin_unlock(&sbi->s_fc_lock);
1130		goto out;
1131	}
1132
1133	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1134		inode = &iter->vfs_inode;
1135		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1136			continue;
1137
1138		spin_unlock(&sbi->s_fc_lock);
1139		ret = ext4_fc_write_inode_data(inode, &crc);
1140		if (ret)
1141			goto out;
1142		ret = ext4_fc_write_inode(inode, &crc);
1143		if (ret)
1144			goto out;
1145		spin_lock(&sbi->s_fc_lock);
1146	}
1147	spin_unlock(&sbi->s_fc_lock);
1148
1149	ret = ext4_fc_write_tail(sb, crc);
1150
1151out:
1152	blk_finish_plug(&plug);
1153	return ret;
1154}
1155
1156static void ext4_fc_update_stats(struct super_block *sb, int status,
1157				 u64 commit_time, int nblks, tid_t commit_tid)
1158{
1159	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1160
1161	ext4_debug("Fast commit ended with status = %d for tid %u",
1162			status, commit_tid);
1163	if (status == EXT4_FC_STATUS_OK) {
1164		stats->fc_num_commits++;
1165		stats->fc_numblks += nblks;
1166		if (likely(stats->s_fc_avg_commit_time))
1167			stats->s_fc_avg_commit_time =
1168				(commit_time +
1169				 stats->s_fc_avg_commit_time * 3) / 4;
1170		else
1171			stats->s_fc_avg_commit_time = commit_time;
1172	} else if (status == EXT4_FC_STATUS_FAILED ||
1173		   status == EXT4_FC_STATUS_INELIGIBLE) {
1174		if (status == EXT4_FC_STATUS_FAILED)
1175			stats->fc_failed_commits++;
1176		stats->fc_ineligible_commits++;
1177	} else {
1178		stats->fc_skipped_commits++;
1179	}
1180	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1181}
1182
1183/*
1184 * The main commit entry point. Performs a fast commit for transaction
1185 * commit_tid if needed. If it's not possible to perform a fast commit
1186 * due to various reasons, we fall back to full commit. Returns 0
1187 * on success, error otherwise.
1188 */
1189int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1190{
1191	struct super_block *sb = journal->j_private;
1192	struct ext4_sb_info *sbi = EXT4_SB(sb);
1193	int nblks = 0, ret, bsize = journal->j_blocksize;
1194	int subtid = atomic_read(&sbi->s_fc_subtid);
1195	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1196	ktime_t start_time, commit_time;
1197
1198	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1199		return jbd2_complete_transaction(journal, commit_tid);
1200
1201	trace_ext4_fc_commit_start(sb, commit_tid);
1202
1203	start_time = ktime_get();
1204
 
 
 
 
 
 
1205restart_fc:
1206	ret = jbd2_fc_begin_commit(journal, commit_tid);
1207	if (ret == -EALREADY) {
1208		/* There was an ongoing commit, check if we need to restart */
1209		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1210			commit_tid > journal->j_commit_sequence)
1211			goto restart_fc;
1212		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1213				commit_tid);
1214		return 0;
1215	} else if (ret) {
1216		/*
1217		 * Commit couldn't start. Just update stats and perform a
1218		 * full commit.
1219		 */
1220		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1221				commit_tid);
1222		return jbd2_complete_transaction(journal, commit_tid);
1223	}
1224
1225	/*
1226	 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1227	 * if we are fast commit ineligible.
1228	 */
1229	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1230		status = EXT4_FC_STATUS_INELIGIBLE;
1231		goto fallback;
1232	}
1233
1234	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1235	ret = ext4_fc_perform_commit(journal);
1236	if (ret < 0) {
1237		status = EXT4_FC_STATUS_FAILED;
1238		goto fallback;
 
1239	}
1240	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1241	ret = jbd2_fc_wait_bufs(journal, nblks);
1242	if (ret < 0) {
1243		status = EXT4_FC_STATUS_FAILED;
1244		goto fallback;
 
1245	}
1246	atomic_inc(&sbi->s_fc_subtid);
1247	ret = jbd2_fc_end_commit(journal);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1248	/*
1249	 * weight the commit time higher than the average time so we
1250	 * don't react too strongly to vast changes in the commit time
1251	 */
1252	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1253	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1254	return ret;
1255
1256fallback:
1257	ret = jbd2_fc_end_commit_fallback(journal);
1258	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1259	return ret;
 
 
 
 
 
 
1260}
1261
1262/*
1263 * Fast commit cleanup routine. This is called after every fast commit and
1264 * full commit. full is true if we are called after a full commit.
1265 */
1266static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1267{
1268	struct super_block *sb = journal->j_private;
1269	struct ext4_sb_info *sbi = EXT4_SB(sb);
1270	struct ext4_inode_info *iter, *iter_n;
1271	struct ext4_fc_dentry_update *fc_dentry;
1272
1273	if (full && sbi->s_fc_bh)
1274		sbi->s_fc_bh = NULL;
1275
1276	trace_ext4_fc_cleanup(journal, full, tid);
1277	jbd2_fc_release_bufs(journal);
1278
1279	spin_lock(&sbi->s_fc_lock);
1280	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1281				 i_fc_list) {
1282		list_del_init(&iter->i_fc_list);
1283		ext4_clear_inode_state(&iter->vfs_inode,
1284				       EXT4_STATE_FC_COMMITTING);
1285		if (iter->i_sync_tid <= tid)
1286			ext4_fc_reset_inode(&iter->vfs_inode);
1287		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1288		smp_mb();
1289#if (BITS_PER_LONG < 64)
1290		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1291#else
1292		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1293#endif
1294	}
1295
1296	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1297		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1298					     struct ext4_fc_dentry_update,
1299					     fcd_list);
1300		list_del_init(&fc_dentry->fcd_list);
1301		list_del_init(&fc_dentry->fcd_dilist);
1302		spin_unlock(&sbi->s_fc_lock);
1303
1304		if (fc_dentry->fcd_name.name &&
1305			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1306			kfree(fc_dentry->fcd_name.name);
1307		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1308		spin_lock(&sbi->s_fc_lock);
1309	}
1310
1311	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1312				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1313	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1314				&sbi->s_fc_q[FC_Q_MAIN]);
1315
1316	if (tid >= sbi->s_fc_ineligible_tid) {
1317		sbi->s_fc_ineligible_tid = 0;
1318		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1319	}
1320
1321	if (full)
1322		sbi->s_fc_bytes = 0;
1323	spin_unlock(&sbi->s_fc_lock);
1324	trace_ext4_fc_stats(sb);
1325}
1326
1327/* Ext4 Replay Path Routines */
1328
/*
 * Helper struct for dentry replay routines: the host-endian fields of a
 * dentry tag as filled in by tl_to_darg().
 */
struct dentry_info_args {
	int parent_ino;
	int dname_len;
	int ino;
	int inode_len;
	char *dname;
};
1334
1335static inline void tl_to_darg(struct dentry_info_args *darg,
1336			      struct ext4_fc_tl *tl, u8 *val)
1337{
1338	struct ext4_fc_dentry_info fcd;
1339
1340	memcpy(&fcd, val, sizeof(fcd));
1341
1342	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1343	darg->ino = le32_to_cpu(fcd.fc_ino);
1344	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1345	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1346}
1347
1348static inline void ext4_fc_get_tl(struct ext4_fc_tl *tl, u8 *val)
1349{
1350	memcpy(tl, val, EXT4_FC_TAG_BASE_LEN);
1351	tl->fc_len = le16_to_cpu(tl->fc_len);
1352	tl->fc_tag = le16_to_cpu(tl->fc_tag);
1353}
1354
1355/* Unlink replay function */
1356static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1357				 u8 *val)
1358{
1359	struct inode *inode, *old_parent;
1360	struct qstr entry;
1361	struct dentry_info_args darg;
1362	int ret = 0;
1363
1364	tl_to_darg(&darg, tl, val);
1365
1366	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1367			darg.parent_ino, darg.dname_len);
1368
1369	entry.name = darg.dname;
1370	entry.len = darg.dname_len;
1371	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1372
1373	if (IS_ERR(inode)) {
1374		ext4_debug("Inode %d not found", darg.ino);
1375		return 0;
1376	}
1377
1378	old_parent = ext4_iget(sb, darg.parent_ino,
1379				EXT4_IGET_NORMAL);
1380	if (IS_ERR(old_parent)) {
1381		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1382		iput(inode);
1383		return 0;
1384	}
1385
1386	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1387	/* -ENOENT ok coz it might not exist anymore. */
1388	if (ret == -ENOENT)
1389		ret = 0;
1390	iput(old_parent);
1391	iput(inode);
1392	return ret;
1393}
1394
1395static int ext4_fc_replay_link_internal(struct super_block *sb,
1396				struct dentry_info_args *darg,
1397				struct inode *inode)
1398{
1399	struct inode *dir = NULL;
1400	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1401	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1402	int ret = 0;
1403
1404	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1405	if (IS_ERR(dir)) {
1406		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1407		dir = NULL;
1408		goto out;
1409	}
1410
1411	dentry_dir = d_obtain_alias(dir);
1412	if (IS_ERR(dentry_dir)) {
1413		ext4_debug("Failed to obtain dentry");
1414		dentry_dir = NULL;
1415		goto out;
1416	}
1417
1418	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1419	if (!dentry_inode) {
1420		ext4_debug("Inode dentry not created.");
1421		ret = -ENOMEM;
1422		goto out;
1423	}
1424
1425	ret = __ext4_link(dir, inode, dentry_inode);
1426	/*
1427	 * It's possible that link already existed since data blocks
1428	 * for the dir in question got persisted before we crashed OR
1429	 * we replayed this tag and crashed before the entire replay
1430	 * could complete.
1431	 */
1432	if (ret && ret != -EEXIST) {
1433		ext4_debug("Failed to link\n");
1434		goto out;
1435	}
1436
1437	ret = 0;
1438out:
1439	if (dentry_dir) {
1440		d_drop(dentry_dir);
1441		dput(dentry_dir);
1442	} else if (dir) {
1443		iput(dir);
1444	}
1445	if (dentry_inode) {
1446		d_drop(dentry_inode);
1447		dput(dentry_inode);
1448	}
1449
1450	return ret;
1451}
1452
1453/* Link replay function */
1454static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1455			       u8 *val)
1456{
1457	struct inode *inode;
1458	struct dentry_info_args darg;
1459	int ret = 0;
1460
1461	tl_to_darg(&darg, tl, val);
1462	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1463			darg.parent_ino, darg.dname_len);
1464
1465	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1466	if (IS_ERR(inode)) {
1467		ext4_debug("Inode not found.");
1468		return 0;
1469	}
1470
1471	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1472	iput(inode);
1473	return ret;
1474}
1475
1476/*
1477 * Record all the modified inodes during replay. We use this later to setup
1478 * block bitmaps correctly.
1479 */
1480static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1481{
1482	struct ext4_fc_replay_state *state;
1483	int i;
1484
1485	state = &EXT4_SB(sb)->s_fc_replay_state;
1486	for (i = 0; i < state->fc_modified_inodes_used; i++)
1487		if (state->fc_modified_inodes[i] == ino)
1488			return 0;
1489	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1490		int *fc_modified_inodes;
1491
1492		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1493				sizeof(int) * (state->fc_modified_inodes_size +
1494				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1495				GFP_KERNEL);
1496		if (!fc_modified_inodes)
1497			return -ENOMEM;
1498		state->fc_modified_inodes = fc_modified_inodes;
1499		state->fc_modified_inodes_size +=
1500			EXT4_FC_REPLAY_REALLOC_INCREMENT;
 
 
 
 
 
 
1501	}
1502	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1503	return 0;
1504}
1505
1506/*
1507 * Inode replay function
1508 */
1509static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1510				u8 *val)
1511{
1512	struct ext4_fc_inode fc_inode;
1513	struct ext4_inode *raw_inode;
1514	struct ext4_inode *raw_fc_inode;
1515	struct inode *inode = NULL;
1516	struct ext4_iloc iloc;
1517	int inode_len, ino, ret, tag = tl->fc_tag;
1518	struct ext4_extent_header *eh;
1519	size_t off_gen = offsetof(struct ext4_inode, i_generation);
1520
1521	memcpy(&fc_inode, val, sizeof(fc_inode));
1522
1523	ino = le32_to_cpu(fc_inode.fc_ino);
1524	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1525
1526	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1527	if (!IS_ERR(inode)) {
1528		ext4_ext_clear_bb(inode);
1529		iput(inode);
1530	}
1531	inode = NULL;
1532
1533	ret = ext4_fc_record_modified_inode(sb, ino);
1534	if (ret)
1535		goto out;
1536
1537	raw_fc_inode = (struct ext4_inode *)
1538		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1539	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1540	if (ret)
1541		goto out;
1542
1543	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1544	raw_inode = ext4_raw_inode(&iloc);
1545
1546	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1547	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1548	       inode_len - off_gen);
1549	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1550		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1551		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1552			memset(eh, 0, sizeof(*eh));
1553			eh->eh_magic = EXT4_EXT_MAGIC;
1554			eh->eh_max = cpu_to_le16(
1555				(sizeof(raw_inode->i_block) -
1556				 sizeof(struct ext4_extent_header))
1557				 / sizeof(struct ext4_extent));
1558		}
1559	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1560		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1561			sizeof(raw_inode->i_block));
1562	}
1563
1564	/* Immediately update the inode on disk. */
1565	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1566	if (ret)
1567		goto out;
1568	ret = sync_dirty_buffer(iloc.bh);
1569	if (ret)
1570		goto out;
1571	ret = ext4_mark_inode_used(sb, ino);
1572	if (ret)
1573		goto out;
1574
1575	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1576	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1577	if (IS_ERR(inode)) {
1578		ext4_debug("Inode not found.");
1579		return -EFSCORRUPTED;
1580	}
1581
1582	/*
1583	 * Our allocator could have made different decisions than before
1584	 * crashing. This should be fixed but until then, we calculate
1585	 * the number of blocks the inode.
1586	 */
1587	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1588		ext4_ext_replay_set_iblocks(inode);
1589
1590	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1591	ext4_reset_inode_seed(inode);
1592
1593	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1594	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1595	sync_dirty_buffer(iloc.bh);
1596	brelse(iloc.bh);
1597out:
1598	iput(inode);
1599	if (!ret)
1600		blkdev_issue_flush(sb->s_bdev);
1601
1602	return 0;
1603}
1604
1605/*
1606 * Dentry create replay function.
1607 *
1608 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1609 * inode for which we are trying to create a dentry here, should already have
1610 * been replayed before we start here.
1611 */
1612static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1613				 u8 *val)
1614{
1615	int ret = 0;
1616	struct inode *inode = NULL;
1617	struct inode *dir = NULL;
1618	struct dentry_info_args darg;
1619
1620	tl_to_darg(&darg, tl, val);
1621
1622	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1623			darg.parent_ino, darg.dname_len);
1624
1625	/* This takes care of update group descriptor and other metadata */
1626	ret = ext4_mark_inode_used(sb, darg.ino);
1627	if (ret)
1628		goto out;
1629
1630	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1631	if (IS_ERR(inode)) {
1632		ext4_debug("inode %d not found.", darg.ino);
1633		inode = NULL;
1634		ret = -EINVAL;
1635		goto out;
1636	}
1637
1638	if (S_ISDIR(inode->i_mode)) {
1639		/*
1640		 * If we are creating a directory, we need to make sure that the
1641		 * dot and dot dot dirents are setup properly.
1642		 */
1643		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1644		if (IS_ERR(dir)) {
1645			ext4_debug("Dir %d not found.", darg.ino);
1646			goto out;
1647		}
1648		ret = ext4_init_new_dir(NULL, dir, inode);
1649		iput(dir);
1650		if (ret) {
1651			ret = 0;
1652			goto out;
1653		}
1654	}
1655	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1656	if (ret)
1657		goto out;
1658	set_nlink(inode, 1);
1659	ext4_mark_inode_dirty(NULL, inode);
1660out:
1661	iput(inode);
 
1662	return ret;
1663}
1664
1665/*
1666 * Record physical disk regions which are in use as per fast commit area,
1667 * and used by inodes during replay phase. Our simple replay phase
1668 * allocator excludes these regions from allocation.
1669 */
1670int ext4_fc_record_regions(struct super_block *sb, int ino,
1671		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1672{
1673	struct ext4_fc_replay_state *state;
1674	struct ext4_fc_alloc_region *region;
1675
1676	state = &EXT4_SB(sb)->s_fc_replay_state;
1677	/*
1678	 * during replay phase, the fc_regions_valid may not same as
1679	 * fc_regions_used, update it when do new additions.
1680	 */
1681	if (replay && state->fc_regions_used != state->fc_regions_valid)
1682		state->fc_regions_used = state->fc_regions_valid;
1683	if (state->fc_regions_used == state->fc_regions_size) {
1684		struct ext4_fc_alloc_region *fc_regions;
1685
1686		fc_regions = krealloc(state->fc_regions,
1687				      sizeof(struct ext4_fc_alloc_region) *
1688				      (state->fc_regions_size +
1689				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
1690				      GFP_KERNEL);
1691		if (!fc_regions)
1692			return -ENOMEM;
1693		state->fc_regions_size +=
1694			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1695		state->fc_regions = fc_regions;
 
 
 
 
 
 
1696	}
1697	region = &state->fc_regions[state->fc_regions_used++];
1698	region->ino = ino;
1699	region->lblk = lblk;
1700	region->pblk = pblk;
1701	region->len = len;
1702
1703	if (replay)
1704		state->fc_regions_valid++;
1705
1706	return 0;
1707}
1708
1709/* Replay add range tag */
1710static int ext4_fc_replay_add_range(struct super_block *sb,
1711				    struct ext4_fc_tl *tl, u8 *val)
1712{
1713	struct ext4_fc_add_range fc_add_ex;
1714	struct ext4_extent newex, *ex;
1715	struct inode *inode;
1716	ext4_lblk_t start, cur;
1717	int remaining, len;
1718	ext4_fsblk_t start_pblk;
1719	struct ext4_map_blocks map;
1720	struct ext4_ext_path *path = NULL;
1721	int ret;
1722
1723	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1724	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1725
1726	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1727		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1728		ext4_ext_get_actual_len(ex));
1729
1730	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1731	if (IS_ERR(inode)) {
1732		ext4_debug("Inode not found.");
1733		return 0;
1734	}
1735
1736	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1737	if (ret)
1738		goto out;
1739
1740	start = le32_to_cpu(ex->ee_block);
1741	start_pblk = ext4_ext_pblock(ex);
1742	len = ext4_ext_get_actual_len(ex);
1743
1744	cur = start;
1745	remaining = len;
1746	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1747		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1748		  inode->i_ino);
1749
1750	while (remaining > 0) {
1751		map.m_lblk = cur;
1752		map.m_len = remaining;
1753		map.m_pblk = 0;
1754		ret = ext4_map_blocks(NULL, inode, &map, 0);
1755
1756		if (ret < 0)
1757			goto out;
 
 
1758
1759		if (ret == 0) {
1760			/* Range is not mapped */
1761			path = ext4_find_extent(inode, cur, NULL, 0);
1762			if (IS_ERR(path))
1763				goto out;
 
 
1764			memset(&newex, 0, sizeof(newex));
1765			newex.ee_block = cpu_to_le32(cur);
1766			ext4_ext_store_pblock(
1767				&newex, start_pblk + cur - start);
1768			newex.ee_len = cpu_to_le16(map.m_len);
1769			if (ext4_ext_is_unwritten(ex))
1770				ext4_ext_mark_unwritten(&newex);
1771			down_write(&EXT4_I(inode)->i_data_sem);
1772			ret = ext4_ext_insert_extent(
1773				NULL, inode, &path, &newex, 0);
1774			up_write((&EXT4_I(inode)->i_data_sem));
1775			ext4_free_ext_path(path);
1776			if (ret)
1777				goto out;
 
 
 
1778			goto next;
1779		}
1780
1781		if (start_pblk + cur - start != map.m_pblk) {
1782			/*
1783			 * Logical to physical mapping changed. This can happen
1784			 * if this range was removed and then reallocated to
1785			 * map to new physical blocks during a fast commit.
1786			 */
1787			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1788					ext4_ext_is_unwritten(ex),
1789					start_pblk + cur - start);
1790			if (ret)
1791				goto out;
 
 
1792			/*
1793			 * Mark the old blocks as free since they aren't used
1794			 * anymore. We maintain an array of all the modified
1795			 * inodes. In case these blocks are still used at either
1796			 * a different logical range in the same inode or in
1797			 * some different inode, we will mark them as allocated
1798			 * at the end of the FC replay using our array of
1799			 * modified inodes.
1800			 */
1801			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1802			goto next;
1803		}
1804
1805		/* Range is mapped and needs a state change */
1806		ext4_debug("Converting from %ld to %d %lld",
1807				map.m_flags & EXT4_MAP_UNWRITTEN,
1808			ext4_ext_is_unwritten(ex), map.m_pblk);
1809		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1810					ext4_ext_is_unwritten(ex), map.m_pblk);
1811		if (ret)
1812			goto out;
 
 
1813		/*
1814		 * We may have split the extent tree while toggling the state.
1815		 * Try to shrink the extent tree now.
1816		 */
1817		ext4_ext_replay_shrink_inode(inode, start + len);
1818next:
1819		cur += map.m_len;
1820		remaining -= map.m_len;
1821	}
1822	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1823					sb->s_blocksize_bits);
1824out:
1825	iput(inode);
1826	return 0;
1827}
1828
1829/* Replay DEL_RANGE tag */
1830static int
1831ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1832			 u8 *val)
1833{
1834	struct inode *inode;
1835	struct ext4_fc_del_range lrange;
1836	struct ext4_map_blocks map;
1837	ext4_lblk_t cur, remaining;
1838	int ret;
1839
1840	memcpy(&lrange, val, sizeof(lrange));
1841	cur = le32_to_cpu(lrange.fc_lblk);
1842	remaining = le32_to_cpu(lrange.fc_len);
1843
1844	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1845		le32_to_cpu(lrange.fc_ino), cur, remaining);
1846
1847	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1848	if (IS_ERR(inode)) {
1849		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1850		return 0;
1851	}
1852
1853	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1854	if (ret)
1855		goto out;
1856
1857	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1858			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1859			le32_to_cpu(lrange.fc_len));
1860	while (remaining > 0) {
1861		map.m_lblk = cur;
1862		map.m_len = remaining;
1863
1864		ret = ext4_map_blocks(NULL, inode, &map, 0);
1865		if (ret < 0)
1866			goto out;
 
 
1867		if (ret > 0) {
1868			remaining -= ret;
1869			cur += ret;
1870			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1871		} else {
1872			remaining -= map.m_len;
1873			cur += map.m_len;
1874		}
1875	}
1876
1877	down_write(&EXT4_I(inode)->i_data_sem);
1878	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1879				le32_to_cpu(lrange.fc_lblk) +
1880				le32_to_cpu(lrange.fc_len) - 1);
1881	up_write(&EXT4_I(inode)->i_data_sem);
1882	if (ret)
1883		goto out;
1884	ext4_ext_replay_shrink_inode(inode,
1885		i_size_read(inode) >> sb->s_blocksize_bits);
1886	ext4_mark_inode_dirty(NULL, inode);
1887out:
1888	iput(inode);
 
1889	return 0;
1890}
1891
1892static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1893{
1894	struct ext4_fc_replay_state *state;
1895	struct inode *inode;
1896	struct ext4_ext_path *path = NULL;
1897	struct ext4_map_blocks map;
1898	int i, ret, j;
1899	ext4_lblk_t cur, end;
1900
1901	state = &EXT4_SB(sb)->s_fc_replay_state;
1902	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1903		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1904			EXT4_IGET_NORMAL);
1905		if (IS_ERR(inode)) {
1906			ext4_debug("Inode %d not found.",
1907				state->fc_modified_inodes[i]);
1908			continue;
1909		}
1910		cur = 0;
1911		end = EXT_MAX_BLOCKS;
1912		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1913			iput(inode);
1914			continue;
1915		}
1916		while (cur < end) {
1917			map.m_lblk = cur;
1918			map.m_len = end - cur;
1919
1920			ret = ext4_map_blocks(NULL, inode, &map, 0);
1921			if (ret < 0)
1922				break;
1923
1924			if (ret > 0) {
1925				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1926				if (!IS_ERR(path)) {
1927					for (j = 0; j < path->p_depth; j++)
1928						ext4_mb_mark_bb(inode->i_sb,
1929							path[j].p_block, 1, 1);
1930					ext4_free_ext_path(path);
 
1931				}
1932				cur += ret;
1933				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1934							map.m_len, 1);
1935			} else {
1936				cur = cur + (map.m_len ? map.m_len : 1);
1937			}
1938		}
1939		iput(inode);
1940	}
1941}
1942
1943/*
1944 * Check if block is in excluded regions for block allocation. The simple
1945 * allocator that runs during replay phase is calls this function to see
1946 * if it is okay to use a block.
1947 */
1948bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1949{
1950	int i;
1951	struct ext4_fc_replay_state *state;
1952
1953	state = &EXT4_SB(sb)->s_fc_replay_state;
1954	for (i = 0; i < state->fc_regions_valid; i++) {
1955		if (state->fc_regions[i].ino == 0 ||
1956			state->fc_regions[i].len == 0)
1957			continue;
1958		if (in_range(blk, state->fc_regions[i].pblk,
1959					state->fc_regions[i].len))
1960			return true;
1961	}
1962	return false;
1963}
1964
1965/* Cleanup function called after replay */
1966void ext4_fc_replay_cleanup(struct super_block *sb)
1967{
1968	struct ext4_sb_info *sbi = EXT4_SB(sb);
1969
1970	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1971	kfree(sbi->s_fc_replay_state.fc_regions);
1972	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1973}
1974
1975static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
1976				      int tag, int len)
1977{
1978	switch (tag) {
1979	case EXT4_FC_TAG_ADD_RANGE:
1980		return len == sizeof(struct ext4_fc_add_range);
1981	case EXT4_FC_TAG_DEL_RANGE:
1982		return len == sizeof(struct ext4_fc_del_range);
1983	case EXT4_FC_TAG_CREAT:
1984	case EXT4_FC_TAG_LINK:
1985	case EXT4_FC_TAG_UNLINK:
1986		len -= sizeof(struct ext4_fc_dentry_info);
1987		return len >= 1 && len <= EXT4_NAME_LEN;
1988	case EXT4_FC_TAG_INODE:
1989		len -= sizeof(struct ext4_fc_inode);
1990		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
1991			len <= sbi->s_inode_size;
1992	case EXT4_FC_TAG_PAD:
1993		return true; /* padding can have any length */
1994	case EXT4_FC_TAG_TAIL:
1995		return len >= sizeof(struct ext4_fc_tail);
1996	case EXT4_FC_TAG_HEAD:
1997		return len == sizeof(struct ext4_fc_head);
1998	}
1999	return false;
2000}
2001
2002/*
2003 * Recovery Scan phase handler
2004 *
2005 * This function is called during the scan phase and is responsible
2006 * for doing following things:
2007 * - Make sure the fast commit area has valid tags for replay
2008 * - Count number of tags that need to be replayed by the replay handler
2009 * - Verify CRC
2010 * - Create a list of excluded blocks for allocation during replay phase
2011 *
2012 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2013 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2014 * to indicate that scan has finished and JBD2 can now start replay phase.
2015 * It returns a negative error to indicate that there was an error. At the end
2016 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2017 * to indicate the number of tags that need to replayed during the replay phase.
2018 */
2019static int ext4_fc_replay_scan(journal_t *journal,
2020				struct buffer_head *bh, int off,
2021				tid_t expected_tid)
2022{
2023	struct super_block *sb = journal->j_private;
2024	struct ext4_sb_info *sbi = EXT4_SB(sb);
2025	struct ext4_fc_replay_state *state;
2026	int ret = JBD2_FC_REPLAY_CONTINUE;
2027	struct ext4_fc_add_range ext;
2028	struct ext4_fc_tl tl;
2029	struct ext4_fc_tail tail;
2030	__u8 *start, *end, *cur, *val;
2031	struct ext4_fc_head head;
2032	struct ext4_extent *ex;
2033
2034	state = &sbi->s_fc_replay_state;
2035
2036	start = (u8 *)bh->b_data;
2037	end = start + journal->j_blocksize;
2038
2039	if (state->fc_replay_expected_off == 0) {
2040		state->fc_cur_tag = 0;
2041		state->fc_replay_num_tags = 0;
2042		state->fc_crc = 0;
2043		state->fc_regions = NULL;
2044		state->fc_regions_valid = state->fc_regions_used =
2045			state->fc_regions_size = 0;
2046		/* Check if we can stop early */
2047		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2048			!= EXT4_FC_TAG_HEAD)
2049			return 0;
2050	}
2051
2052	if (off != state->fc_replay_expected_off) {
2053		ret = -EFSCORRUPTED;
2054		goto out_err;
2055	}
2056
2057	state->fc_replay_expected_off++;
2058	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2059	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2060		ext4_fc_get_tl(&tl, cur);
2061		val = cur + EXT4_FC_TAG_BASE_LEN;
2062		if (tl.fc_len > end - val ||
2063		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2064			ret = state->fc_replay_num_tags ?
2065				JBD2_FC_REPLAY_STOP : -ECANCELED;
2066			goto out_err;
2067		}
2068		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2069			   tag2str(tl.fc_tag), bh->b_blocknr);
2070		switch (tl.fc_tag) {
2071		case EXT4_FC_TAG_ADD_RANGE:
2072			memcpy(&ext, val, sizeof(ext));
2073			ex = (struct ext4_extent *)&ext.fc_ex;
2074			ret = ext4_fc_record_regions(sb,
2075				le32_to_cpu(ext.fc_ino),
2076				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2077				ext4_ext_get_actual_len(ex), 0);
2078			if (ret < 0)
2079				break;
2080			ret = JBD2_FC_REPLAY_CONTINUE;
2081			fallthrough;
2082		case EXT4_FC_TAG_DEL_RANGE:
2083		case EXT4_FC_TAG_LINK:
2084		case EXT4_FC_TAG_UNLINK:
2085		case EXT4_FC_TAG_CREAT:
2086		case EXT4_FC_TAG_INODE:
2087		case EXT4_FC_TAG_PAD:
2088			state->fc_cur_tag++;
2089			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2090				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2091			break;
2092		case EXT4_FC_TAG_TAIL:
2093			state->fc_cur_tag++;
2094			memcpy(&tail, val, sizeof(tail));
2095			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2096						EXT4_FC_TAG_BASE_LEN +
2097						offsetof(struct ext4_fc_tail,
2098						fc_crc));
2099			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2100				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2101				state->fc_replay_num_tags = state->fc_cur_tag;
2102				state->fc_regions_valid =
2103					state->fc_regions_used;
2104			} else {
2105				ret = state->fc_replay_num_tags ?
2106					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2107			}
2108			state->fc_crc = 0;
2109			break;
2110		case EXT4_FC_TAG_HEAD:
2111			memcpy(&head, val, sizeof(head));
2112			if (le32_to_cpu(head.fc_features) &
2113				~EXT4_FC_SUPPORTED_FEATURES) {
2114				ret = -EOPNOTSUPP;
2115				break;
2116			}
2117			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2118				ret = JBD2_FC_REPLAY_STOP;
2119				break;
2120			}
2121			state->fc_cur_tag++;
2122			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2123				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2124			break;
2125		default:
2126			ret = state->fc_replay_num_tags ?
2127				JBD2_FC_REPLAY_STOP : -ECANCELED;
2128		}
2129		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2130			break;
2131	}
2132
2133out_err:
2134	trace_ext4_fc_replay_scan(sb, ret, off);
2135	return ret;
2136}
2137
2138/*
2139 * Main recovery path entry point.
2140 * The meaning of return codes is similar as above.
2141 */
2142static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2143				enum passtype pass, int off, tid_t expected_tid)
2144{
2145	struct super_block *sb = journal->j_private;
2146	struct ext4_sb_info *sbi = EXT4_SB(sb);
2147	struct ext4_fc_tl tl;
2148	__u8 *start, *end, *cur, *val;
2149	int ret = JBD2_FC_REPLAY_CONTINUE;
2150	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2151	struct ext4_fc_tail tail;
2152
2153	if (pass == PASS_SCAN) {
2154		state->fc_current_pass = PASS_SCAN;
2155		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2156	}
2157
2158	if (state->fc_current_pass != pass) {
2159		state->fc_current_pass = pass;
2160		sbi->s_mount_state |= EXT4_FC_REPLAY;
2161	}
2162	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2163		ext4_debug("Replay stops\n");
2164		ext4_fc_set_bitmaps_and_counters(sb);
2165		return 0;
2166	}
2167
2168#ifdef CONFIG_EXT4_DEBUG
2169	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2170		pr_warn("Dropping fc block %d because max_replay set\n", off);
2171		return JBD2_FC_REPLAY_STOP;
2172	}
2173#endif
2174
2175	start = (u8 *)bh->b_data;
2176	end = start + journal->j_blocksize;
2177
2178	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2179	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2180		ext4_fc_get_tl(&tl, cur);
2181		val = cur + EXT4_FC_TAG_BASE_LEN;
2182
2183		if (state->fc_replay_num_tags == 0) {
2184			ret = JBD2_FC_REPLAY_STOP;
2185			ext4_fc_set_bitmaps_and_counters(sb);
2186			break;
2187		}
2188
2189		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2190		state->fc_replay_num_tags--;
2191		switch (tl.fc_tag) {
2192		case EXT4_FC_TAG_LINK:
2193			ret = ext4_fc_replay_link(sb, &tl, val);
2194			break;
2195		case EXT4_FC_TAG_UNLINK:
2196			ret = ext4_fc_replay_unlink(sb, &tl, val);
2197			break;
2198		case EXT4_FC_TAG_ADD_RANGE:
2199			ret = ext4_fc_replay_add_range(sb, &tl, val);
2200			break;
2201		case EXT4_FC_TAG_CREAT:
2202			ret = ext4_fc_replay_create(sb, &tl, val);
2203			break;
2204		case EXT4_FC_TAG_DEL_RANGE:
2205			ret = ext4_fc_replay_del_range(sb, &tl, val);
2206			break;
2207		case EXT4_FC_TAG_INODE:
2208			ret = ext4_fc_replay_inode(sb, &tl, val);
2209			break;
2210		case EXT4_FC_TAG_PAD:
2211			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2212					     tl.fc_len, 0);
2213			break;
2214		case EXT4_FC_TAG_TAIL:
2215			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2216					     0, tl.fc_len, 0);
2217			memcpy(&tail, val, sizeof(tail));
2218			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2219			break;
2220		case EXT4_FC_TAG_HEAD:
2221			break;
2222		default:
2223			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
 
2224			ret = -ECANCELED;
2225			break;
2226		}
2227		if (ret < 0)
2228			break;
2229		ret = JBD2_FC_REPLAY_CONTINUE;
2230	}
2231	return ret;
2232}
2233
2234void ext4_fc_init(struct super_block *sb, journal_t *journal)
2235{
2236	/*
2237	 * We set replay callback even if fast commit disabled because we may
2238	 * could still have fast commit blocks that need to be replayed even if
2239	 * fast commit has now been turned off.
2240	 */
2241	journal->j_fc_replay_callback = ext4_fc_replay;
2242	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2243		return;
2244	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2245}
2246
2247static const char * const fc_ineligible_reasons[] = {
2248	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2249	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2250	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2251	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2252	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2253	[EXT4_FC_REASON_RESIZE] = "Resize",
2254	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2255	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2256	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2257	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2258};
2259
2260int ext4_fc_info_show(struct seq_file *seq, void *v)
2261{
2262	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2263	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2264	int i;
2265
2266	if (v != SEQ_START_TOKEN)
2267		return 0;
2268
2269	seq_printf(seq,
2270		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2271		   stats->fc_num_commits, stats->fc_ineligible_commits,
2272		   stats->fc_numblks,
2273		   div_u64(stats->s_fc_avg_commit_time, 1000));
2274	seq_puts(seq, "Ineligible reasons:\n");
2275	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2276		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2277			stats->fc_ineligible_reason_count[i]);
2278
2279	return 0;
2280}
2281
2282int __init ext4_fc_init_dentry_cache(void)
2283{
2284	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2285					   SLAB_RECLAIM_ACCOUNT);
2286
2287	if (ext4_fc_dentry_cachep == NULL)
2288		return -ENOMEM;
2289
2290	return 0;
2291}
2292
2293void ext4_fc_destroy_dentry_cache(void)
2294{
2295	kmem_cache_destroy(ext4_fc_dentry_cachep);
2296}

   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
 
  68 * Not all operations are supported by fast commits today (e.g extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79 *   make one more fast commit to fall back to full commit after stop call so
  80 *   that it guaranteed that the fast commit ineligible operation contained
  81 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82 *   followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commits tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when recovery code runs, it needs to "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at much higher
 172 *    routines. This can be made more fine grained by combining with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
 
 
 
 
 
 
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 
 
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 
 
 
 
 324		return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 
 
 
 
 
 
 
 
 
 
 
 
 
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423	struct dentry *dentry;
 424	int op;
 425};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 
 436
 437	mutex_unlock(&ei->i_fc_lock);
 
 
 
 
 
 
 
 
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517			  struct dentry *dentry)
 518{
 519	struct __track_dentry_update_args args;
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538	if (update)
 539		return -EEXIST;
 540
 541	EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543	return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548	int ret;
 549
 550	if (S_ISDIR(inode->i_mode))
 551		return;
 552
 
 
 
 553	if (ext4_should_journal_data(inode)) {
 554		ext4_fc_mark_ineligible(inode->i_sb,
 555					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556		return;
 557	}
 558
 
 
 
 559	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560	trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564	ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570	struct ext4_inode_info *ei = EXT4_I(inode);
 571	ext4_lblk_t oldstart;
 572	struct __track_range_args *__arg =
 573		(struct __track_range_args *)arg;
 574
 575	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577		return -ECANCELED;
 578	}
 579
 580	oldstart = ei->i_fc_lblk_start;
 581
 582	if (update && ei->i_fc_lblk_len > 0) {
 583		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584		ei->i_fc_lblk_len =
 585			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586				ei->i_fc_lblk_start + 1;
 587	} else {
 588		ei->i_fc_lblk_start = __arg->start;
 589		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590	}
 591
 592	return 0;
 593}
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596			 ext4_lblk_t end)
 597{
 598	struct __track_range_args args;
 599	int ret;
 600
 601	if (S_ISDIR(inode->i_mode))
 602		return;
 603
 
 
 
 
 
 
 604	args.start = start;
 605	args.end = end;
 606
 607	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609	trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614	int write_flags = REQ_SYNC;
 615	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618	if (test_opt(sb, BARRIER) && is_tail)
 619		write_flags |= REQ_FUA | REQ_PREFLUSH;
 620	lock_buffer(bh);
 621	set_buffer_dirty(bh);
 622	set_buffer_uptodate(bh);
 623	bh->b_end_io = ext4_end_buffer_io_sync;
 624	submit_bh(REQ_OP_WRITE, write_flags, bh);
 625	EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632				u32 *crc)
 633{
 634	void *ret;
 635
 636	ret = memset(dst, 0, len);
 637	if (crc)
 638		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639	return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
 645 * During the commit time this function is used to manage fast commit
 646 * block space. We don't split a fast commit log onto different
 647 * blocks. So this function makes sure that if there's not enough space
 648 * on the current block, the remaining space in the current block is
 649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650 * new block is from jbd2 and CRC is updated to reflect the padding
 651 * we added.
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655	struct ext4_fc_tl *tl;
 656	struct ext4_sb_info *sbi = EXT4_SB(sb);
 657	struct buffer_head *bh;
 658	int bsize = sbi->s_journal->j_blocksize;
 659	int ret, off = sbi->s_fc_bytes % bsize;
 660	int pad_len;
 
 661
 662	/*
 663	 * After allocating len, we should have space at least for a 0 byte
 664	 * padding.
 665	 */
 666	if (len + sizeof(struct ext4_fc_tl) > bsize)
 667		return NULL;
 668
 669	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670		/*
 671		 * Only allocate from current buffer if we have enough space for
 672		 * this request AND we have space to add a zero byte padding.
 673		 */
 674		if (!sbi->s_fc_bh) {
 675			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676			if (ret)
 677				return NULL;
 678			sbi->s_fc_bh = bh;
 679		}
 
 
 
 680		sbi->s_fc_bytes += len;
 681		return sbi->s_fc_bh->b_data + off;
 682	}
 683	/* Need to add PAD tag */
 684	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687	tl->fc_len = cpu_to_le16(pad_len);
 688	if (crc)
 689		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690	if (pad_len > 0)
 691		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 
 
 
 692	ext4_fc_submit_bh(sb, false);
 693
 694	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695	if (ret)
 696		return NULL;
 697	sbi->s_fc_bh = bh;
 698	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699	return sbi->s_fc_bh->b_data;
 700}
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704				int len, u32 *crc)
 705{
 706	if (crc)
 707		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708	return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing tail tag marks the end of a fast commit. In order to guarantee
 715 * atomicity, after writing tail tag, even if there's space remaining
 716 * in the block, next commit shouldn't use it. That's why tail tag
 717 * has the length as that of the remaining space on the block.
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721	struct ext4_sb_info *sbi = EXT4_SB(sb);
 722	struct ext4_fc_tl tl;
 723	struct ext4_fc_tail tail;
 724	int off, bsize = sbi->s_journal->j_blocksize;
 725	u8 *dst;
 726
 727	/*
 728	 * ext4_fc_reserve_space takes care of allocating an extra block if
 729	 * there's no enough space on this block for accommodating this tail.
 730	 */
 731	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732	if (!dst)
 733		return -ENOSPC;
 734
 735	off = sbi->s_fc_bytes % bsize;
 736
 737	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742	dst += sizeof(tl);
 743	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745	dst += sizeof(tail.fc_tid);
 
 
 746	tail.fc_crc = cpu_to_le32(crc);
 747	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 
 
 748
 749	ext4_fc_submit_bh(sb, true);
 750
 751	return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759			   u32 *crc)
 760{
 761	struct ext4_fc_tl tl;
 762	u8 *dst;
 763
 764	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765	if (!dst)
 766		return false;
 767
 768	tl.fc_tag = cpu_to_le16(tag);
 769	tl.fc_len = cpu_to_le16(len);
 770
 771	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774	return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779					int parent_ino, int ino, int dlen,
 780					const unsigned char *dname,
 781					u32 *crc)
 782{
 783	struct ext4_fc_dentry_info fcd;
 784	struct ext4_fc_tl tl;
 785	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786					crc);
 
 787
 788	if (!dst)
 789		return false;
 790
 791	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792	fcd.fc_ino = cpu_to_le32(ino);
 793	tl.fc_tag = cpu_to_le16(tag);
 794	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796	dst += sizeof(tl);
 797	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798	dst += sizeof(fcd);
 799	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800	dst += dlen;
 801
 802	return true;
 803}
 804
 805/*
 806 * Writes inode in the fast commit space under TLV with tag @tag.
 807 * Returns 0 on success, error on failure.
 808 */
 809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810{
 811	struct ext4_inode_info *ei = EXT4_I(inode);
 812	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813	int ret;
 814	struct ext4_iloc iloc;
 815	struct ext4_fc_inode fc_inode;
 816	struct ext4_fc_tl tl;
 817	u8 *dst;
 818
 819	ret = ext4_get_inode_loc(inode, &iloc);
 820	if (ret)
 821		return ret;
 822
 823	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 
 
 824		inode_len += ei->i_extra_isize;
 825
 826	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 
 830	dst = ext4_fc_reserve_space(inode->i_sb,
 831			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832	if (!dst)
 833		return -ECANCELED;
 834
 835	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836		return -ECANCELED;
 837	dst += sizeof(tl);
 838	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839		return -ECANCELED;
 840	dst += sizeof(fc_inode);
 841	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842					inode_len, crc))
 843		return -ECANCELED;
 844
 845	return 0;
 846}
 847
 848/*
 849 * Writes updated data ranges for the inode in question. Updates CRC.
 850 * Returns 0 on success, error otherwise.
 851 */
 852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853{
 854	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855	struct ext4_inode_info *ei = EXT4_I(inode);
 856	struct ext4_map_blocks map;
 857	struct ext4_fc_add_range fc_ext;
 858	struct ext4_fc_del_range lrange;
 859	struct ext4_extent *ex;
 860	int ret;
 861
 862	mutex_lock(&ei->i_fc_lock);
 863	if (ei->i_fc_lblk_len == 0) {
 864		mutex_unlock(&ei->i_fc_lock);
 865		return 0;
 866	}
 867	old_blk_size = ei->i_fc_lblk_start;
 868	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869	ei->i_fc_lblk_len = 0;
 870	mutex_unlock(&ei->i_fc_lock);
 871
 872	cur_lblk_off = old_blk_size;
 873	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876	while (cur_lblk_off <= new_blk_size) {
 877		map.m_lblk = cur_lblk_off;
 878		map.m_len = new_blk_size - cur_lblk_off + 1;
 879		ret = ext4_map_blocks(NULL, inode, &map, 0);
 880		if (ret < 0)
 881			return -ECANCELED;
 882
 883		if (map.m_len == 0) {
 884			cur_lblk_off++;
 885			continue;
 886		}
 887
 888		if (ret == 0) {
 889			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891			lrange.fc_len = cpu_to_le32(map.m_len);
 892			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893					    sizeof(lrange), (u8 *)&lrange, crc))
 894				return -ENOSPC;
 895		} else {
 896			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 897				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 898
 899			/* Limit the number of blocks in one extent */
 900			map.m_len = min(max, map.m_len);
 901
 902			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 903			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 904			ex->ee_block = cpu_to_le32(map.m_lblk);
 905			ex->ee_len = cpu_to_le16(map.m_len);
 906			ext4_ext_store_pblock(ex, map.m_pblk);
 907			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 908				ext4_ext_mark_unwritten(ex);
 909			else
 910				ext4_ext_mark_initialized(ex);
 911			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 912					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 913				return -ENOSPC;
 914		}
 915
 916		cur_lblk_off += map.m_len;
 917	}
 918
 919	return 0;
 920}
 921
 922
 923/* Submit data for all the fast commit inodes */
 924static int ext4_fc_submit_inode_data_all(journal_t *journal)
 925{
 926	struct super_block *sb = (struct super_block *)(journal->j_private);
 927	struct ext4_sb_info *sbi = EXT4_SB(sb);
 928	struct ext4_inode_info *ei;
 929	int ret = 0;
 930
 931	spin_lock(&sbi->s_fc_lock);
 932	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 933	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 934		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 935		while (atomic_read(&ei->i_fc_updates)) {
 936			DEFINE_WAIT(wait);
 937
 938			prepare_to_wait(&ei->i_fc_wait, &wait,
 939						TASK_UNINTERRUPTIBLE);
 940			if (atomic_read(&ei->i_fc_updates)) {
 941				spin_unlock(&sbi->s_fc_lock);
 942				schedule();
 943				spin_lock(&sbi->s_fc_lock);
 944			}
 945			finish_wait(&ei->i_fc_wait, &wait);
 946		}
 947		spin_unlock(&sbi->s_fc_lock);
 948		ret = jbd2_submit_inode_data(ei->jinode);
 949		if (ret)
 950			return ret;
 951		spin_lock(&sbi->s_fc_lock);
 952	}
 953	spin_unlock(&sbi->s_fc_lock);
 954
 955	return ret;
 956}
 957
 958/* Wait for completion of data for all the fast commit inodes */
 959static int ext4_fc_wait_inode_data_all(journal_t *journal)
 960{
 961	struct super_block *sb = (struct super_block *)(journal->j_private);
 962	struct ext4_sb_info *sbi = EXT4_SB(sb);
 963	struct ext4_inode_info *pos, *n;
 964	int ret = 0;
 965
 966	spin_lock(&sbi->s_fc_lock);
 967	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 968		if (!ext4_test_inode_state(&pos->vfs_inode,
 969					   EXT4_STATE_FC_COMMITTING))
 970			continue;
 971		spin_unlock(&sbi->s_fc_lock);
 972
 973		ret = jbd2_wait_inode_data(journal, pos->jinode);
 974		if (ret)
 975			return ret;
 976		spin_lock(&sbi->s_fc_lock);
 977	}
 978	spin_unlock(&sbi->s_fc_lock);
 979
 980	return 0;
 981}
 982
 983/* Commit all the directory entry updates */
 984static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 985__acquires(&sbi->s_fc_lock)
 986__releases(&sbi->s_fc_lock)
 987{
 988	struct super_block *sb = (struct super_block *)(journal->j_private);
 989	struct ext4_sb_info *sbi = EXT4_SB(sb);
 990	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 991	struct inode *inode;
 992	struct ext4_inode_info *ei, *ei_n;
 993	int ret;
 994
 995	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 996		return 0;
 997	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 998				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 999		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1000			spin_unlock(&sbi->s_fc_lock);
1001			if (!ext4_fc_add_dentry_tlv(
1002				sb, fc_dentry->fcd_op,
1003				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1004				fc_dentry->fcd_name.len,
1005				fc_dentry->fcd_name.name, crc)) {
1006				ret = -ENOSPC;
1007				goto lock_and_exit;
1008			}
1009			spin_lock(&sbi->s_fc_lock);
1010			continue;
1011		}
1012
1013		inode = NULL;
1014		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1015					 i_fc_list) {
1016			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1017				inode = &ei->vfs_inode;
1018				break;
1019			}
1020		}
1021		/*
1022		 * If we don't find inode in our list, then it was deleted,
1023		 * in which case, we don't need to record it's create tag.
1024		 */
1025		if (!inode)
1026			continue;
 
 
 
 
1027		spin_unlock(&sbi->s_fc_lock);
1028
1029		/*
1030		 * We first write the inode and then the create dirent. This
1031		 * allows the recovery code to create an unnamed inode first
1032		 * and then link it to a directory entry. This allows us
1033		 * to use namei.c routines almost as is and simplifies
1034		 * the recovery code.
1035		 */
1036		ret = ext4_fc_write_inode(inode, crc);
1037		if (ret)
1038			goto lock_and_exit;
1039
1040		ret = ext4_fc_write_inode_data(inode, crc);
1041		if (ret)
1042			goto lock_and_exit;
1043
1044		if (!ext4_fc_add_dentry_tlv(
1045			sb, fc_dentry->fcd_op,
1046			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1047			fc_dentry->fcd_name.len,
1048			fc_dentry->fcd_name.name, crc)) {
1049			ret = -ENOSPC;
1050			goto lock_and_exit;
1051		}
1052
1053		spin_lock(&sbi->s_fc_lock);
1054	}
1055	return 0;
1056lock_and_exit:
1057	spin_lock(&sbi->s_fc_lock);
1058	return ret;
1059}
1060
1061static int ext4_fc_perform_commit(journal_t *journal)
1062{
1063	struct super_block *sb = (struct super_block *)(journal->j_private);
1064	struct ext4_sb_info *sbi = EXT4_SB(sb);
1065	struct ext4_inode_info *iter;
1066	struct ext4_fc_head head;
1067	struct inode *inode;
1068	struct blk_plug plug;
1069	int ret = 0;
1070	u32 crc = 0;
1071
1072	ret = ext4_fc_submit_inode_data_all(journal);
1073	if (ret)
1074		return ret;
1075
1076	ret = ext4_fc_wait_inode_data_all(journal);
1077	if (ret)
1078		return ret;
1079
1080	/*
1081	 * If file system device is different from journal device, issue a cache
1082	 * flush before we start writing fast commit blocks.
1083	 */
1084	if (journal->j_fs_dev != journal->j_dev)
1085		blkdev_issue_flush(journal->j_fs_dev);
1086
1087	blk_start_plug(&plug);
1088	if (sbi->s_fc_bytes == 0) {
1089		/*
1090		 * Add a head tag only if this is the first fast commit
1091		 * in this TID.
1092		 */
1093		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1094		head.fc_tid = cpu_to_le32(
1095			sbi->s_journal->j_running_transaction->t_tid);
1096		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1097			(u8 *)&head, &crc)) {
1098			ret = -ENOSPC;
1099			goto out;
1100		}
1101	}
1102
1103	spin_lock(&sbi->s_fc_lock);
1104	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1105	if (ret) {
1106		spin_unlock(&sbi->s_fc_lock);
1107		goto out;
1108	}
1109
1110	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1111		inode = &iter->vfs_inode;
1112		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1113			continue;
1114
1115		spin_unlock(&sbi->s_fc_lock);
1116		ret = ext4_fc_write_inode_data(inode, &crc);
1117		if (ret)
1118			goto out;
1119		ret = ext4_fc_write_inode(inode, &crc);
1120		if (ret)
1121			goto out;
1122		spin_lock(&sbi->s_fc_lock);
1123	}
1124	spin_unlock(&sbi->s_fc_lock);
1125
1126	ret = ext4_fc_write_tail(sb, crc);
1127
1128out:
1129	blk_finish_plug(&plug);
1130	return ret;
1131}
1132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1133/*
1134 * The main commit entry point. Performs a fast commit for transaction
1135 * commit_tid if needed. If it's not possible to perform a fast commit
1136 * due to various reasons, we fall back to full commit. Returns 0
1137 * on success, error otherwise.
1138 */
1139int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1140{
1141	struct super_block *sb = (struct super_block *)(journal->j_private);
1142	struct ext4_sb_info *sbi = EXT4_SB(sb);
1143	int nblks = 0, ret, bsize = journal->j_blocksize;
1144	int subtid = atomic_read(&sbi->s_fc_subtid);
1145	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1146	ktime_t start_time, commit_time;
1147
1148	trace_ext4_fc_commit_start(sb);
 
 
 
1149
1150	start_time = ktime_get();
1151
1152	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1153		(ext4_fc_is_ineligible(sb))) {
1154		reason = EXT4_FC_REASON_INELIGIBLE;
1155		goto out;
1156	}
1157
1158restart_fc:
1159	ret = jbd2_fc_begin_commit(journal, commit_tid);
1160	if (ret == -EALREADY) {
1161		/* There was an ongoing commit, check if we need to restart */
1162		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1163			commit_tid > journal->j_commit_sequence)
1164			goto restart_fc;
1165		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1166		goto out;
 
1167	} else if (ret) {
1168		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169		reason = EXT4_FC_REASON_FC_START_FAILED;
1170		goto out;
 
 
 
 
 
 
 
 
 
 
 
 
 
1171	}
1172
1173	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1174	ret = ext4_fc_perform_commit(journal);
1175	if (ret < 0) {
1176		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177		reason = EXT4_FC_REASON_FC_FAILED;
1178		goto out;
1179	}
1180	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181	ret = jbd2_fc_wait_bufs(journal, nblks);
1182	if (ret < 0) {
1183		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184		reason = EXT4_FC_REASON_FC_FAILED;
1185		goto out;
1186	}
1187	atomic_inc(&sbi->s_fc_subtid);
1188	jbd2_fc_end_commit(journal);
1189out:
1190	/* Has any ineligible update happened since we started? */
1191	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1192		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1193		reason = EXT4_FC_REASON_INELIGIBLE;
1194	}
1195
1196	spin_lock(&sbi->s_fc_lock);
1197	if (reason != EXT4_FC_REASON_OK &&
1198		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1199		sbi->s_fc_stats.fc_ineligible_commits++;
1200	} else {
1201		sbi->s_fc_stats.fc_num_commits++;
1202		sbi->s_fc_stats.fc_numblks += nblks;
1203	}
1204	spin_unlock(&sbi->s_fc_lock);
1205	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1206	trace_ext4_fc_commit_stop(sb, nblks, reason);
1207	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1208	/*
1209	 * weight the commit time higher than the average time so we don't
1210	 * react too strongly to vast changes in the commit time
1211	 */
1212	if (likely(sbi->s_fc_avg_commit_time))
1213		sbi->s_fc_avg_commit_time = (commit_time +
1214				sbi->s_fc_avg_commit_time * 3) / 4;
1215	else
1216		sbi->s_fc_avg_commit_time = commit_time;
1217	jbd_debug(1,
1218		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1219		nblks, reason, subtid);
1220	if (reason == EXT4_FC_REASON_FC_FAILED)
1221		return jbd2_fc_end_commit_fallback(journal);
1222	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1223		reason == EXT4_FC_REASON_INELIGIBLE)
1224		return jbd2_complete_transaction(journal, commit_tid);
1225	return 0;
1226}
1227
1228/*
1229 * Fast commit cleanup routine. This is called after every fast commit and
1230 * full commit. full is true if we are called after a full commit.
1231 */
1232static void ext4_fc_cleanup(journal_t *journal, int full)
1233{
1234	struct super_block *sb = journal->j_private;
1235	struct ext4_sb_info *sbi = EXT4_SB(sb);
1236	struct ext4_inode_info *iter, *iter_n;
1237	struct ext4_fc_dentry_update *fc_dentry;
1238
1239	if (full && sbi->s_fc_bh)
1240		sbi->s_fc_bh = NULL;
1241
 
1242	jbd2_fc_release_bufs(journal);
1243
1244	spin_lock(&sbi->s_fc_lock);
1245	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1246				 i_fc_list) {
1247		list_del_init(&iter->i_fc_list);
1248		ext4_clear_inode_state(&iter->vfs_inode,
1249				       EXT4_STATE_FC_COMMITTING);
1250		ext4_fc_reset_inode(&iter->vfs_inode);
 
1251		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1252		smp_mb();
1253#if (BITS_PER_LONG < 64)
1254		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1255#else
1256		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1257#endif
1258	}
1259
1260	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1261		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1262					     struct ext4_fc_dentry_update,
1263					     fcd_list);
1264		list_del_init(&fc_dentry->fcd_list);
 
1265		spin_unlock(&sbi->s_fc_lock);
1266
1267		if (fc_dentry->fcd_name.name &&
1268			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1269			kfree(fc_dentry->fcd_name.name);
1270		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1271		spin_lock(&sbi->s_fc_lock);
1272	}
1273
1274	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1275				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1276	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1277				&sbi->s_fc_q[FC_Q_MAIN]);
1278
1279	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1280	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 
 
1281
1282	if (full)
1283		sbi->s_fc_bytes = 0;
1284	spin_unlock(&sbi->s_fc_lock);
1285	trace_ext4_fc_stats(sb);
1286}
1287
1288/* Ext4 Replay Path Routines */
1289
/* Decoded form of a dentry record, shared by the dentry replay routines. */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of the name in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, not NUL terminated by tl_to_darg() */
};
1295
1296static inline void tl_to_darg(struct dentry_info_args *darg,
1297			      struct  ext4_fc_tl *tl, u8 *val)
1298{
1299	struct ext4_fc_dentry_info fcd;
1300
1301	memcpy(&fcd, val, sizeof(fcd));
1302
1303	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1304	darg->ino = le32_to_cpu(fcd.fc_ino);
1305	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1306	darg->dname_len = le16_to_cpu(tl->fc_len) -
1307		sizeof(struct ext4_fc_dentry_info);
 
 
 
 
 
 
1308}
1309
1310/* Unlink replay function */
1311static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1312				 u8 *val)
1313{
1314	struct inode *inode, *old_parent;
1315	struct qstr entry;
1316	struct dentry_info_args darg;
1317	int ret = 0;
1318
1319	tl_to_darg(&darg, tl, val);
1320
1321	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1322			darg.parent_ino, darg.dname_len);
1323
1324	entry.name = darg.dname;
1325	entry.len = darg.dname_len;
1326	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1327
1328	if (IS_ERR(inode)) {
1329		jbd_debug(1, "Inode %d not found", darg.ino);
1330		return 0;
1331	}
1332
1333	old_parent = ext4_iget(sb, darg.parent_ino,
1334				EXT4_IGET_NORMAL);
1335	if (IS_ERR(old_parent)) {
1336		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1337		iput(inode);
1338		return 0;
1339	}
1340
1341	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1342	/* -ENOENT ok coz it might not exist anymore. */
1343	if (ret == -ENOENT)
1344		ret = 0;
1345	iput(old_parent);
1346	iput(inode);
1347	return ret;
1348}
1349
1350static int ext4_fc_replay_link_internal(struct super_block *sb,
1351				struct dentry_info_args *darg,
1352				struct inode *inode)
1353{
1354	struct inode *dir = NULL;
1355	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1356	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1357	int ret = 0;
1358
1359	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1360	if (IS_ERR(dir)) {
1361		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1362		dir = NULL;
1363		goto out;
1364	}
1365
1366	dentry_dir = d_obtain_alias(dir);
1367	if (IS_ERR(dentry_dir)) {
1368		jbd_debug(1, "Failed to obtain dentry");
1369		dentry_dir = NULL;
1370		goto out;
1371	}
1372
1373	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1374	if (!dentry_inode) {
1375		jbd_debug(1, "Inode dentry not created.");
1376		ret = -ENOMEM;
1377		goto out;
1378	}
1379
1380	ret = __ext4_link(dir, inode, dentry_inode);
1381	/*
1382	 * It's possible that link already existed since data blocks
1383	 * for the dir in question got persisted before we crashed OR
1384	 * we replayed this tag and crashed before the entire replay
1385	 * could complete.
1386	 */
1387	if (ret && ret != -EEXIST) {
1388		jbd_debug(1, "Failed to link\n");
1389		goto out;
1390	}
1391
1392	ret = 0;
1393out:
1394	if (dentry_dir) {
1395		d_drop(dentry_dir);
1396		dput(dentry_dir);
1397	} else if (dir) {
1398		iput(dir);
1399	}
1400	if (dentry_inode) {
1401		d_drop(dentry_inode);
1402		dput(dentry_inode);
1403	}
1404
1405	return ret;
1406}
1407
1408/* Link replay function */
1409static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1410			       u8 *val)
1411{
1412	struct inode *inode;
1413	struct dentry_info_args darg;
1414	int ret = 0;
1415
1416	tl_to_darg(&darg, tl, val);
1417	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1418			darg.parent_ino, darg.dname_len);
1419
1420	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421	if (IS_ERR(inode)) {
1422		jbd_debug(1, "Inode not found.");
1423		return 0;
1424	}
1425
1426	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1427	iput(inode);
1428	return ret;
1429}
1430
1431/*
1432 * Record all the modified inodes during replay. We use this later to setup
1433 * block bitmaps correctly.
1434 */
1435static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1436{
1437	struct ext4_fc_replay_state *state;
1438	int i;
1439
1440	state = &EXT4_SB(sb)->s_fc_replay_state;
1441	for (i = 0; i < state->fc_modified_inodes_used; i++)
1442		if (state->fc_modified_inodes[i] == ino)
1443			return 0;
1444	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
 
 
 
 
 
 
 
 
 
1445		state->fc_modified_inodes_size +=
1446			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1447		state->fc_modified_inodes = krealloc(
1448					state->fc_modified_inodes, sizeof(int) *
1449					state->fc_modified_inodes_size,
1450					GFP_KERNEL);
1451		if (!state->fc_modified_inodes)
1452			return -ENOMEM;
1453	}
1454	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1455	return 0;
1456}
1457
1458/*
1459 * Inode replay function
1460 */
1461static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1462				u8 *val)
1463{
1464	struct ext4_fc_inode fc_inode;
1465	struct ext4_inode *raw_inode;
1466	struct ext4_inode *raw_fc_inode;
1467	struct inode *inode = NULL;
1468	struct ext4_iloc iloc;
1469	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1470	struct ext4_extent_header *eh;
 
1471
1472	memcpy(&fc_inode, val, sizeof(fc_inode));
1473
1474	ino = le32_to_cpu(fc_inode.fc_ino);
1475	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1476
1477	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1478	if (!IS_ERR(inode)) {
1479		ext4_ext_clear_bb(inode);
1480		iput(inode);
1481	}
1482	inode = NULL;
1483
1484	ext4_fc_record_modified_inode(sb, ino);
 
 
1485
1486	raw_fc_inode = (struct ext4_inode *)
1487		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1488	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1489	if (ret)
1490		goto out;
1491
1492	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1493	raw_inode = ext4_raw_inode(&iloc);
1494
1495	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1496	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1497		inode_len - offsetof(struct ext4_inode, i_generation));
1498	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1499		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1500		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1501			memset(eh, 0, sizeof(*eh));
1502			eh->eh_magic = EXT4_EXT_MAGIC;
1503			eh->eh_max = cpu_to_le16(
1504				(sizeof(raw_inode->i_block) -
1505				 sizeof(struct ext4_extent_header))
1506				 / sizeof(struct ext4_extent));
1507		}
1508	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1509		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1510			sizeof(raw_inode->i_block));
1511	}
1512
1513	/* Immediately update the inode on disk. */
1514	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515	if (ret)
1516		goto out;
1517	ret = sync_dirty_buffer(iloc.bh);
1518	if (ret)
1519		goto out;
1520	ret = ext4_mark_inode_used(sb, ino);
1521	if (ret)
1522		goto out;
1523
1524	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1525	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1526	if (IS_ERR(inode)) {
1527		jbd_debug(1, "Inode not found.");
1528		return -EFSCORRUPTED;
1529	}
1530
1531	/*
1532	 * Our allocator could have made different decisions than before
1533	 * crashing. This should be fixed but until then, we calculate
1534	 * the number of blocks the inode.
1535	 */
1536	ext4_ext_replay_set_iblocks(inode);
 
1537
1538	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1539	ext4_reset_inode_seed(inode);
1540
1541	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1542	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1543	sync_dirty_buffer(iloc.bh);
1544	brelse(iloc.bh);
1545out:
1546	iput(inode);
1547	if (!ret)
1548		blkdev_issue_flush(sb->s_bdev);
1549
1550	return 0;
1551}
1552
1553/*
1554 * Dentry create replay function.
1555 *
1556 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1557 * inode for which we are trying to create a dentry here, should already have
1558 * been replayed before we start here.
1559 */
1560static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1561				 u8 *val)
1562{
1563	int ret = 0;
1564	struct inode *inode = NULL;
1565	struct inode *dir = NULL;
1566	struct dentry_info_args darg;
1567
1568	tl_to_darg(&darg, tl, val);
1569
1570	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571			darg.parent_ino, darg.dname_len);
1572
1573	/* This takes care of update group descriptor and other metadata */
1574	ret = ext4_mark_inode_used(sb, darg.ino);
1575	if (ret)
1576		goto out;
1577
1578	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579	if (IS_ERR(inode)) {
1580		jbd_debug(1, "inode %d not found.", darg.ino);
1581		inode = NULL;
1582		ret = -EINVAL;
1583		goto out;
1584	}
1585
1586	if (S_ISDIR(inode->i_mode)) {
1587		/*
1588		 * If we are creating a directory, we need to make sure that the
1589		 * dot and dot dot dirents are setup properly.
1590		 */
1591		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592		if (IS_ERR(dir)) {
1593			jbd_debug(1, "Dir %d not found.", darg.ino);
1594			goto out;
1595		}
1596		ret = ext4_init_new_dir(NULL, dir, inode);
1597		iput(dir);
1598		if (ret) {
1599			ret = 0;
1600			goto out;
1601		}
1602	}
1603	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604	if (ret)
1605		goto out;
1606	set_nlink(inode, 1);
1607	ext4_mark_inode_dirty(NULL, inode);
1608out:
1609	if (inode)
1610		iput(inode);
1611	return ret;
1612}
1613
1614/*
1615 * Record physical disk regions which are in use as per fast commit area. Our
1616 * simple replay phase allocator excludes these regions from allocation.
 
1617 */
1618static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620{
1621	struct ext4_fc_replay_state *state;
1622	struct ext4_fc_alloc_region *region;
1623
1624	state = &EXT4_SB(sb)->s_fc_replay_state;
 
 
 
 
 
 
1625	if (state->fc_regions_used == state->fc_regions_size) {
 
 
 
 
 
 
 
 
 
1626		state->fc_regions_size +=
1627			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1628		state->fc_regions = krealloc(
1629					state->fc_regions,
1630					state->fc_regions_size *
1631					sizeof(struct ext4_fc_alloc_region),
1632					GFP_KERNEL);
1633		if (!state->fc_regions)
1634			return -ENOMEM;
1635	}
1636	region = &state->fc_regions[state->fc_regions_used++];
1637	region->ino = ino;
1638	region->lblk = lblk;
1639	region->pblk = pblk;
1640	region->len = len;
1641
 
 
 
1642	return 0;
1643}
1644
1645/* Replay add range tag */
1646static int ext4_fc_replay_add_range(struct super_block *sb,
1647				    struct ext4_fc_tl *tl, u8 *val)
1648{
1649	struct ext4_fc_add_range fc_add_ex;
1650	struct ext4_extent newex, *ex;
1651	struct inode *inode;
1652	ext4_lblk_t start, cur;
1653	int remaining, len;
1654	ext4_fsblk_t start_pblk;
1655	struct ext4_map_blocks map;
1656	struct ext4_ext_path *path = NULL;
1657	int ret;
1658
1659	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1660	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1661
1662	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1664		ext4_ext_get_actual_len(ex));
1665
1666	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1667	if (IS_ERR(inode)) {
1668		jbd_debug(1, "Inode not found.");
1669		return 0;
1670	}
1671
1672	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
 
 
1673
1674	start = le32_to_cpu(ex->ee_block);
1675	start_pblk = ext4_ext_pblock(ex);
1676	len = ext4_ext_get_actual_len(ex);
1677
1678	cur = start;
1679	remaining = len;
1680	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1681		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1682		  inode->i_ino);
1683
1684	while (remaining > 0) {
1685		map.m_lblk = cur;
1686		map.m_len = remaining;
1687		map.m_pblk = 0;
1688		ret = ext4_map_blocks(NULL, inode, &map, 0);
1689
1690		if (ret < 0) {
1691			iput(inode);
1692			return 0;
1693		}
1694
1695		if (ret == 0) {
1696			/* Range is not mapped */
1697			path = ext4_find_extent(inode, cur, NULL, 0);
1698			if (IS_ERR(path)) {
1699				iput(inode);
1700				return 0;
1701			}
1702			memset(&newex, 0, sizeof(newex));
1703			newex.ee_block = cpu_to_le32(cur);
1704			ext4_ext_store_pblock(
1705				&newex, start_pblk + cur - start);
1706			newex.ee_len = cpu_to_le16(map.m_len);
1707			if (ext4_ext_is_unwritten(ex))
1708				ext4_ext_mark_unwritten(&newex);
1709			down_write(&EXT4_I(inode)->i_data_sem);
1710			ret = ext4_ext_insert_extent(
1711				NULL, inode, &path, &newex, 0);
1712			up_write((&EXT4_I(inode)->i_data_sem));
1713			ext4_ext_drop_refs(path);
1714			kfree(path);
1715			if (ret) {
1716				iput(inode);
1717				return 0;
1718			}
1719			goto next;
1720		}
1721
1722		if (start_pblk + cur - start != map.m_pblk) {
1723			/*
1724			 * Logical to physical mapping changed. This can happen
1725			 * if this range was removed and then reallocated to
1726			 * map to new physical blocks during a fast commit.
1727			 */
1728			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729					ext4_ext_is_unwritten(ex),
1730					start_pblk + cur - start);
1731			if (ret) {
1732				iput(inode);
1733				return 0;
1734			}
1735			/*
1736			 * Mark the old blocks as free since they aren't used
1737			 * anymore. We maintain an array of all the modified
1738			 * inodes. In case these blocks are still used at either
1739			 * a different logical range in the same inode or in
1740			 * some different inode, we will mark them as allocated
1741			 * at the end of the FC replay using our array of
1742			 * modified inodes.
1743			 */
1744			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1745			goto next;
1746		}
1747
1748		/* Range is mapped and needs a state change */
1749		jbd_debug(1, "Converting from %ld to %d %lld",
1750				map.m_flags & EXT4_MAP_UNWRITTEN,
1751			ext4_ext_is_unwritten(ex), map.m_pblk);
1752		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1753					ext4_ext_is_unwritten(ex), map.m_pblk);
1754		if (ret) {
1755			iput(inode);
1756			return 0;
1757		}
1758		/*
1759		 * We may have split the extent tree while toggling the state.
1760		 * Try to shrink the extent tree now.
1761		 */
1762		ext4_ext_replay_shrink_inode(inode, start + len);
1763next:
1764		cur += map.m_len;
1765		remaining -= map.m_len;
1766	}
1767	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1768					sb->s_blocksize_bits);
 
1769	iput(inode);
1770	return 0;
1771}
1772
1773/* Replay DEL_RANGE tag */
1774static int
1775ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1776			 u8 *val)
1777{
1778	struct inode *inode;
1779	struct ext4_fc_del_range lrange;
1780	struct ext4_map_blocks map;
1781	ext4_lblk_t cur, remaining;
1782	int ret;
1783
1784	memcpy(&lrange, val, sizeof(lrange));
1785	cur = le32_to_cpu(lrange.fc_lblk);
1786	remaining = le32_to_cpu(lrange.fc_len);
1787
1788	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789		le32_to_cpu(lrange.fc_ino), cur, remaining);
1790
1791	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1792	if (IS_ERR(inode)) {
1793		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1794		return 0;
1795	}
1796
1797	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
 
 
1798
1799	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1800			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1801			le32_to_cpu(lrange.fc_len));
1802	while (remaining > 0) {
1803		map.m_lblk = cur;
1804		map.m_len = remaining;
1805
1806		ret = ext4_map_blocks(NULL, inode, &map, 0);
1807		if (ret < 0) {
1808			iput(inode);
1809			return 0;
1810		}
1811		if (ret > 0) {
1812			remaining -= ret;
1813			cur += ret;
1814			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815		} else {
1816			remaining -= map.m_len;
1817			cur += map.m_len;
1818		}
1819	}
1820
1821	ret = ext4_punch_hole(inode,
1822		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1823		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
 
 
1824	if (ret)
1825		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826	ext4_ext_replay_shrink_inode(inode,
1827		i_size_read(inode) >> sb->s_blocksize_bits);
1828	ext4_mark_inode_dirty(NULL, inode);
 
1829	iput(inode);
1830
1831	return 0;
1832}
1833
1834static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1835{
1836	struct ext4_fc_replay_state *state;
1837	struct inode *inode;
1838	struct ext4_ext_path *path = NULL;
1839	struct ext4_map_blocks map;
1840	int i, ret, j;
1841	ext4_lblk_t cur, end;
1842
1843	state = &EXT4_SB(sb)->s_fc_replay_state;
1844	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1845		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1846			EXT4_IGET_NORMAL);
1847		if (IS_ERR(inode)) {
1848			jbd_debug(1, "Inode %d not found.",
1849				state->fc_modified_inodes[i]);
1850			continue;
1851		}
1852		cur = 0;
1853		end = EXT_MAX_BLOCKS;
 
 
 
 
1854		while (cur < end) {
1855			map.m_lblk = cur;
1856			map.m_len = end - cur;
1857
1858			ret = ext4_map_blocks(NULL, inode, &map, 0);
1859			if (ret < 0)
1860				break;
1861
1862			if (ret > 0) {
1863				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1864				if (!IS_ERR(path)) {
1865					for (j = 0; j < path->p_depth; j++)
1866						ext4_mb_mark_bb(inode->i_sb,
1867							path[j].p_block, 1, 1);
1868					ext4_ext_drop_refs(path);
1869					kfree(path);
1870				}
1871				cur += ret;
1872				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1873							map.m_len, 1);
1874			} else {
1875				cur = cur + (map.m_len ? map.m_len : 1);
1876			}
1877		}
1878		iput(inode);
1879	}
1880}
1881
1882/*
1883 * Check if block is in excluded regions for block allocation. The simple
1884 * allocator that runs during replay phase is calls this function to see
1885 * if it is okay to use a block.
1886 */
1887bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1888{
1889	int i;
1890	struct ext4_fc_replay_state *state;
1891
1892	state = &EXT4_SB(sb)->s_fc_replay_state;
1893	for (i = 0; i < state->fc_regions_valid; i++) {
1894		if (state->fc_regions[i].ino == 0 ||
1895			state->fc_regions[i].len == 0)
1896			continue;
1897		if (blk >= state->fc_regions[i].pblk &&
1898		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1899			return true;
1900	}
1901	return false;
1902}
1903
1904/* Cleanup function called after replay */
1905void ext4_fc_replay_cleanup(struct super_block *sb)
1906{
1907	struct ext4_sb_info *sbi = EXT4_SB(sb);
1908
1909	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1910	kfree(sbi->s_fc_replay_state.fc_regions);
1911	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1912}
1913
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1914/*
1915 * Recovery Scan phase handler
1916 *
1917 * This function is called during the scan phase and is responsible
1918 * for doing following things:
1919 * - Make sure the fast commit area has valid tags for replay
1920 * - Count number of tags that need to be replayed by the replay handler
1921 * - Verify CRC
1922 * - Create a list of excluded blocks for allocation during replay phase
1923 *
1924 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1925 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1926 * to indicate that scan has finished and JBD2 can now start replay phase.
1927 * It returns a negative error to indicate that there was an error. At the end
1928 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1929 * to indicate the number of tags that need to replayed during the replay phase.
1930 */
1931static int ext4_fc_replay_scan(journal_t *journal,
1932				struct buffer_head *bh, int off,
1933				tid_t expected_tid)
1934{
1935	struct super_block *sb = journal->j_private;
1936	struct ext4_sb_info *sbi = EXT4_SB(sb);
1937	struct ext4_fc_replay_state *state;
1938	int ret = JBD2_FC_REPLAY_CONTINUE;
1939	struct ext4_fc_add_range ext;
1940	struct ext4_fc_tl tl;
1941	struct ext4_fc_tail tail;
1942	__u8 *start, *end, *cur, *val;
1943	struct ext4_fc_head head;
1944	struct ext4_extent *ex;
1945
1946	state = &sbi->s_fc_replay_state;
1947
1948	start = (u8 *)bh->b_data;
1949	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1950
1951	if (state->fc_replay_expected_off == 0) {
1952		state->fc_cur_tag = 0;
1953		state->fc_replay_num_tags = 0;
1954		state->fc_crc = 0;
1955		state->fc_regions = NULL;
1956		state->fc_regions_valid = state->fc_regions_used =
1957			state->fc_regions_size = 0;
1958		/* Check if we can stop early */
1959		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1960			!= EXT4_FC_TAG_HEAD)
1961			return 0;
1962	}
1963
1964	if (off != state->fc_replay_expected_off) {
1965		ret = -EFSCORRUPTED;
1966		goto out_err;
1967	}
1968
1969	state->fc_replay_expected_off++;
1970	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1971		memcpy(&tl, cur, sizeof(tl));
1972		val = cur + sizeof(tl);
1973		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1974			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1975		switch (le16_to_cpu(tl.fc_tag)) {
 
 
 
 
 
 
 
1976		case EXT4_FC_TAG_ADD_RANGE:
1977			memcpy(&ext, val, sizeof(ext));
1978			ex = (struct ext4_extent *)&ext.fc_ex;
1979			ret = ext4_fc_record_regions(sb,
1980				le32_to_cpu(ext.fc_ino),
1981				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1982				ext4_ext_get_actual_len(ex));
1983			if (ret < 0)
1984				break;
1985			ret = JBD2_FC_REPLAY_CONTINUE;
1986			fallthrough;
1987		case EXT4_FC_TAG_DEL_RANGE:
1988		case EXT4_FC_TAG_LINK:
1989		case EXT4_FC_TAG_UNLINK:
1990		case EXT4_FC_TAG_CREAT:
1991		case EXT4_FC_TAG_INODE:
1992		case EXT4_FC_TAG_PAD:
1993			state->fc_cur_tag++;
1994			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995					sizeof(tl) + le16_to_cpu(tl.fc_len));
1996			break;
1997		case EXT4_FC_TAG_TAIL:
1998			state->fc_cur_tag++;
1999			memcpy(&tail, val, sizeof(tail));
2000			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2001						sizeof(tl) +
2002						offsetof(struct ext4_fc_tail,
2003						fc_crc));
2004			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2005				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2006				state->fc_replay_num_tags = state->fc_cur_tag;
2007				state->fc_regions_valid =
2008					state->fc_regions_used;
2009			} else {
2010				ret = state->fc_replay_num_tags ?
2011					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2012			}
2013			state->fc_crc = 0;
2014			break;
2015		case EXT4_FC_TAG_HEAD:
2016			memcpy(&head, val, sizeof(head));
2017			if (le32_to_cpu(head.fc_features) &
2018				~EXT4_FC_SUPPORTED_FEATURES) {
2019				ret = -EOPNOTSUPP;
2020				break;
2021			}
2022			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2023				ret = JBD2_FC_REPLAY_STOP;
2024				break;
2025			}
2026			state->fc_cur_tag++;
2027			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2028					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2029			break;
2030		default:
2031			ret = state->fc_replay_num_tags ?
2032				JBD2_FC_REPLAY_STOP : -ECANCELED;
2033		}
2034		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2035			break;
2036	}
2037
2038out_err:
2039	trace_ext4_fc_replay_scan(sb, ret, off);
2040	return ret;
2041}
2042
2043/*
2044 * Main recovery path entry point.
2045 * The meaning of return codes is similar as above.
2046 */
2047static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2048				enum passtype pass, int off, tid_t expected_tid)
2049{
2050	struct super_block *sb = journal->j_private;
2051	struct ext4_sb_info *sbi = EXT4_SB(sb);
2052	struct ext4_fc_tl tl;
2053	__u8 *start, *end, *cur, *val;
2054	int ret = JBD2_FC_REPLAY_CONTINUE;
2055	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2056	struct ext4_fc_tail tail;
2057
2058	if (pass == PASS_SCAN) {
2059		state->fc_current_pass = PASS_SCAN;
2060		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2061	}
2062
2063	if (state->fc_current_pass != pass) {
2064		state->fc_current_pass = pass;
2065		sbi->s_mount_state |= EXT4_FC_REPLAY;
2066	}
2067	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2068		jbd_debug(1, "Replay stops\n");
2069		ext4_fc_set_bitmaps_and_counters(sb);
2070		return 0;
2071	}
2072
2073#ifdef CONFIG_EXT4_DEBUG
2074	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2075		pr_warn("Dropping fc block %d because max_replay set\n", off);
2076		return JBD2_FC_REPLAY_STOP;
2077	}
2078#endif
2079
2080	start = (u8 *)bh->b_data;
2081	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2082
2083	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2084		memcpy(&tl, cur, sizeof(tl));
2085		val = cur + sizeof(tl);
 
2086
2087		if (state->fc_replay_num_tags == 0) {
2088			ret = JBD2_FC_REPLAY_STOP;
2089			ext4_fc_set_bitmaps_and_counters(sb);
2090			break;
2091		}
2092		jbd_debug(3, "Replay phase, tag:%s\n",
2093				tag2str(le16_to_cpu(tl.fc_tag)));
2094		state->fc_replay_num_tags--;
2095		switch (le16_to_cpu(tl.fc_tag)) {
2096		case EXT4_FC_TAG_LINK:
2097			ret = ext4_fc_replay_link(sb, &tl, val);
2098			break;
2099		case EXT4_FC_TAG_UNLINK:
2100			ret = ext4_fc_replay_unlink(sb, &tl, val);
2101			break;
2102		case EXT4_FC_TAG_ADD_RANGE:
2103			ret = ext4_fc_replay_add_range(sb, &tl, val);
2104			break;
2105		case EXT4_FC_TAG_CREAT:
2106			ret = ext4_fc_replay_create(sb, &tl, val);
2107			break;
2108		case EXT4_FC_TAG_DEL_RANGE:
2109			ret = ext4_fc_replay_del_range(sb, &tl, val);
2110			break;
2111		case EXT4_FC_TAG_INODE:
2112			ret = ext4_fc_replay_inode(sb, &tl, val);
2113			break;
2114		case EXT4_FC_TAG_PAD:
2115			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2116					     le16_to_cpu(tl.fc_len), 0);
2117			break;
2118		case EXT4_FC_TAG_TAIL:
2119			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2120					     le16_to_cpu(tl.fc_len), 0);
2121			memcpy(&tail, val, sizeof(tail));
2122			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2123			break;
2124		case EXT4_FC_TAG_HEAD:
2125			break;
2126		default:
2127			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2128					     le16_to_cpu(tl.fc_len), 0);
2129			ret = -ECANCELED;
2130			break;
2131		}
2132		if (ret < 0)
2133			break;
2134		ret = JBD2_FC_REPLAY_CONTINUE;
2135	}
2136	return ret;
2137}
2138
2139void ext4_fc_init(struct super_block *sb, journal_t *journal)
2140{
2141	/*
2142	 * We set replay callback even if fast commit disabled because we may
2143	 * could still have fast commit blocks that need to be replayed even if
2144	 * fast commit has now been turned off.
2145	 */
2146	journal->j_fc_replay_callback = ext4_fc_replay;
2147	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2148		return;
2149	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2150}
2151
/*
 * Human readable names of the fast commit ineligibility reasons, printed by
 * ext4_fc_info_show(). The table is indexed by reason number, so the order of
 * the entries must match the numbering of the reasons. Both the strings and
 * the table itself are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2164
2165int ext4_fc_info_show(struct seq_file *seq, void *v)
2166{
2167	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2168	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2169	int i;
2170
2171	if (v != SEQ_START_TOKEN)
2172		return 0;
2173
2174	seq_printf(seq,
2175		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2176		   stats->fc_num_commits, stats->fc_ineligible_commits,
2177		   stats->fc_numblks,
2178		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2179	seq_puts(seq, "Ineligible reasons:\n");
2180	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2181		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2182			stats->fc_ineligible_reason_count[i]);
2183
2184	return 0;
2185}
2186
2187int __init ext4_fc_init_dentry_cache(void)
2188{
2189	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2190					   SLAB_RECLAIM_ACCOUNT);
2191
2192	if (ext4_fc_dentry_cachep == NULL)
2193		return -ENOMEM;
2194
2195	return 0;
 
 
 
 
 
2196}