fast_commit.c - fs/ext4/fast_commit.c - Linux source code v4.6

Note: File does not exist in v4.6.
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 * Not all operations are supported by fast commits today (e.g extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79 *   make one more fast commit to fall back to full commit after stop call so
  80 *   that it guaranteed that the fast commit ineligible operation contained
  81 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82 *   followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commits tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when recovery code runs, it needs "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at much higher
 172 *    routines. This can be made more fine grained by combining with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324		return;
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423	struct dentry *dentry;
 424	int op;
 425};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437	mutex_unlock(&ei->i_fc_lock);
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517			  struct dentry *dentry)
 518{
 519	struct __track_dentry_update_args args;
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538	if (update)
 539		return -EEXIST;
 540
 541	EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543	return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548	int ret;
 549
 550	if (S_ISDIR(inode->i_mode))
 551		return;
 552
 553	if (ext4_should_journal_data(inode)) {
 554		ext4_fc_mark_ineligible(inode->i_sb,
 555					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556		return;
 557	}
 558
 559	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560	trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564	ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570	struct ext4_inode_info *ei = EXT4_I(inode);
 571	ext4_lblk_t oldstart;
 572	struct __track_range_args *__arg =
 573		(struct __track_range_args *)arg;
 574
 575	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577		return -ECANCELED;
 578	}
 579
 580	oldstart = ei->i_fc_lblk_start;
 581
 582	if (update && ei->i_fc_lblk_len > 0) {
 583		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584		ei->i_fc_lblk_len =
 585			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586				ei->i_fc_lblk_start + 1;
 587	} else {
 588		ei->i_fc_lblk_start = __arg->start;
 589		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590	}
 591
 592	return 0;
 593}
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596			 ext4_lblk_t end)
 597{
 598	struct __track_range_args args;
 599	int ret;
 600
 601	if (S_ISDIR(inode->i_mode))
 602		return;
 603
 604	args.start = start;
 605	args.end = end;
 606
 607	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609	trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614	int write_flags = REQ_SYNC;
 615	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618	if (test_opt(sb, BARRIER) && is_tail)
 619		write_flags |= REQ_FUA | REQ_PREFLUSH;
 620	lock_buffer(bh);
 621	set_buffer_dirty(bh);
 622	set_buffer_uptodate(bh);
 623	bh->b_end_io = ext4_end_buffer_io_sync;
 624	submit_bh(REQ_OP_WRITE, write_flags, bh);
 625	EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632				u32 *crc)
 633{
 634	void *ret;
 635
 636	ret = memset(dst, 0, len);
 637	if (crc)
 638		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639	return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
 645 * During the commit time this function is used to manage fast commit
 646 * block space. We don't split a fast commit log onto different
 647 * blocks. So this function makes sure that if there's not enough space
 648 * on the current block, the remaining space in the current block is
 649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650 * new block is from jbd2 and CRC is updated to reflect the padding
 651 * we added.
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655	struct ext4_fc_tl *tl;
 656	struct ext4_sb_info *sbi = EXT4_SB(sb);
 657	struct buffer_head *bh;
 658	int bsize = sbi->s_journal->j_blocksize;
 659	int ret, off = sbi->s_fc_bytes % bsize;
 660	int pad_len;
 661
 662	/*
 663	 * After allocating len, we should have space at least for a 0 byte
 664	 * padding.
 665	 */
 666	if (len + sizeof(struct ext4_fc_tl) > bsize)
 667		return NULL;
 668
 669	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670		/*
 671		 * Only allocate from current buffer if we have enough space for
 672		 * this request AND we have space to add a zero byte padding.
 673		 */
 674		if (!sbi->s_fc_bh) {
 675			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676			if (ret)
 677				return NULL;
 678			sbi->s_fc_bh = bh;
 679		}
 680		sbi->s_fc_bytes += len;
 681		return sbi->s_fc_bh->b_data + off;
 682	}
 683	/* Need to add PAD tag */
 684	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687	tl->fc_len = cpu_to_le16(pad_len);
 688	if (crc)
 689		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690	if (pad_len > 0)
 691		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692	ext4_fc_submit_bh(sb, false);
 693
 694	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695	if (ret)
 696		return NULL;
 697	sbi->s_fc_bh = bh;
 698	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699	return sbi->s_fc_bh->b_data;
 700}
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704				int len, u32 *crc)
 705{
 706	if (crc)
 707		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708	return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing tail tag marks the end of a fast commit. In order to guarantee
 715 * atomicity, after writing tail tag, even if there's space remaining
 716 * in the block, next commit shouldn't use it. That's why tail tag
 717 * has the length as that of the remaining space on the block.
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721	struct ext4_sb_info *sbi = EXT4_SB(sb);
 722	struct ext4_fc_tl tl;
 723	struct ext4_fc_tail tail;
 724	int off, bsize = sbi->s_journal->j_blocksize;
 725	u8 *dst;
 726
 727	/*
 728	 * ext4_fc_reserve_space takes care of allocating an extra block if
 729	 * there's no enough space on this block for accommodating this tail.
 730	 */
 731	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732	if (!dst)
 733		return -ENOSPC;
 734
 735	off = sbi->s_fc_bytes % bsize;
 736
 737	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742	dst += sizeof(tl);
 743	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745	dst += sizeof(tail.fc_tid);
 746	tail.fc_crc = cpu_to_le32(crc);
 747	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 748
 749	ext4_fc_submit_bh(sb, true);
 750
 751	return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759			   u32 *crc)
 760{
 761	struct ext4_fc_tl tl;
 762	u8 *dst;
 763
 764	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765	if (!dst)
 766		return false;
 767
 768	tl.fc_tag = cpu_to_le16(tag);
 769	tl.fc_len = cpu_to_le16(len);
 770
 771	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774	return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779					int parent_ino, int ino, int dlen,
 780					const unsigned char *dname,
 781					u32 *crc)
 782{
 783	struct ext4_fc_dentry_info fcd;
 784	struct ext4_fc_tl tl;
 785	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786					crc);
 787
 788	if (!dst)
 789		return false;
 790
 791	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792	fcd.fc_ino = cpu_to_le32(ino);
 793	tl.fc_tag = cpu_to_le16(tag);
 794	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796	dst += sizeof(tl);
 797	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798	dst += sizeof(fcd);
 799	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800	dst += dlen;
 801
 802	return true;
 803}
 804
 805/*
 806 * Writes inode in the fast commit space under TLV with tag @tag.
 807 * Returns 0 on success, error on failure.
 808 */
 809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810{
 811	struct ext4_inode_info *ei = EXT4_I(inode);
 812	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813	int ret;
 814	struct ext4_iloc iloc;
 815	struct ext4_fc_inode fc_inode;
 816	struct ext4_fc_tl tl;
 817	u8 *dst;
 818
 819	ret = ext4_get_inode_loc(inode, &iloc);
 820	if (ret)
 821		return ret;
 822
 823	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 824		inode_len += ei->i_extra_isize;
 825
 826	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 830	dst = ext4_fc_reserve_space(inode->i_sb,
 831			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832	if (!dst)
 833		return -ECANCELED;
 834
 835	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836		return -ECANCELED;
 837	dst += sizeof(tl);
 838	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839		return -ECANCELED;
 840	dst += sizeof(fc_inode);
 841	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842					inode_len, crc))
 843		return -ECANCELED;
 844
 845	return 0;
 846}
 847
 848/*
 849 * Writes updated data ranges for the inode in question. Updates CRC.
 850 * Returns 0 on success, error otherwise.
 851 */
 852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853{
 854	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855	struct ext4_inode_info *ei = EXT4_I(inode);
 856	struct ext4_map_blocks map;
 857	struct ext4_fc_add_range fc_ext;
 858	struct ext4_fc_del_range lrange;
 859	struct ext4_extent *ex;
 860	int ret;
 861
 862	mutex_lock(&ei->i_fc_lock);
 863	if (ei->i_fc_lblk_len == 0) {
 864		mutex_unlock(&ei->i_fc_lock);
 865		return 0;
 866	}
 867	old_blk_size = ei->i_fc_lblk_start;
 868	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869	ei->i_fc_lblk_len = 0;
 870	mutex_unlock(&ei->i_fc_lock);
 871
 872	cur_lblk_off = old_blk_size;
 873	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876	while (cur_lblk_off <= new_blk_size) {
 877		map.m_lblk = cur_lblk_off;
 878		map.m_len = new_blk_size - cur_lblk_off + 1;
 879		ret = ext4_map_blocks(NULL, inode, &map, 0);
 880		if (ret < 0)
 881			return -ECANCELED;
 882
 883		if (map.m_len == 0) {
 884			cur_lblk_off++;
 885			continue;
 886		}
 887
 888		if (ret == 0) {
 889			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891			lrange.fc_len = cpu_to_le32(map.m_len);
 892			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893					    sizeof(lrange), (u8 *)&lrange, crc))
 894				return -ENOSPC;
 895		} else {
 896			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 897				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 898
 899			/* Limit the number of blocks in one extent */
 900			map.m_len = min(max, map.m_len);
 901
 902			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 903			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 904			ex->ee_block = cpu_to_le32(map.m_lblk);
 905			ex->ee_len = cpu_to_le16(map.m_len);
 906			ext4_ext_store_pblock(ex, map.m_pblk);
 907			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 908				ext4_ext_mark_unwritten(ex);
 909			else
 910				ext4_ext_mark_initialized(ex);
 911			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 912					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 913				return -ENOSPC;
 914		}
 915
 916		cur_lblk_off += map.m_len;
 917	}
 918
 919	return 0;
 920}
 921
 922
 923/* Submit data for all the fast commit inodes */
 924static int ext4_fc_submit_inode_data_all(journal_t *journal)
 925{
 926	struct super_block *sb = (struct super_block *)(journal->j_private);
 927	struct ext4_sb_info *sbi = EXT4_SB(sb);
 928	struct ext4_inode_info *ei;
 929	int ret = 0;
 930
 931	spin_lock(&sbi->s_fc_lock);
 932	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 933	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 934		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 935		while (atomic_read(&ei->i_fc_updates)) {
 936			DEFINE_WAIT(wait);
 937
 938			prepare_to_wait(&ei->i_fc_wait, &wait,
 939						TASK_UNINTERRUPTIBLE);
 940			if (atomic_read(&ei->i_fc_updates)) {
 941				spin_unlock(&sbi->s_fc_lock);
 942				schedule();
 943				spin_lock(&sbi->s_fc_lock);
 944			}
 945			finish_wait(&ei->i_fc_wait, &wait);
 946		}
 947		spin_unlock(&sbi->s_fc_lock);
 948		ret = jbd2_submit_inode_data(ei->jinode);
 949		if (ret)
 950			return ret;
 951		spin_lock(&sbi->s_fc_lock);
 952	}
 953	spin_unlock(&sbi->s_fc_lock);
 954
 955	return ret;
 956}
 957
 958/* Wait for completion of data for all the fast commit inodes */
 959static int ext4_fc_wait_inode_data_all(journal_t *journal)
 960{
 961	struct super_block *sb = (struct super_block *)(journal->j_private);
 962	struct ext4_sb_info *sbi = EXT4_SB(sb);
 963	struct ext4_inode_info *pos, *n;
 964	int ret = 0;
 965
 966	spin_lock(&sbi->s_fc_lock);
 967	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 968		if (!ext4_test_inode_state(&pos->vfs_inode,
 969					   EXT4_STATE_FC_COMMITTING))
 970			continue;
 971		spin_unlock(&sbi->s_fc_lock);
 972
 973		ret = jbd2_wait_inode_data(journal, pos->jinode);
 974		if (ret)
 975			return ret;
 976		spin_lock(&sbi->s_fc_lock);
 977	}
 978	spin_unlock(&sbi->s_fc_lock);
 979
 980	return 0;
 981}
 982
 983/* Commit all the directory entry updates */
 984static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 985__acquires(&sbi->s_fc_lock)
 986__releases(&sbi->s_fc_lock)
 987{
 988	struct super_block *sb = (struct super_block *)(journal->j_private);
 989	struct ext4_sb_info *sbi = EXT4_SB(sb);
 990	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 991	struct inode *inode;
 992	struct ext4_inode_info *ei, *ei_n;
 993	int ret;
 994
 995	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 996		return 0;
 997	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 998				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 999		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1000			spin_unlock(&sbi->s_fc_lock);
1001			if (!ext4_fc_add_dentry_tlv(
1002				sb, fc_dentry->fcd_op,
1003				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1004				fc_dentry->fcd_name.len,
1005				fc_dentry->fcd_name.name, crc)) {
1006				ret = -ENOSPC;
1007				goto lock_and_exit;
1008			}
1009			spin_lock(&sbi->s_fc_lock);
1010			continue;
1011		}
1012
1013		inode = NULL;
1014		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1015					 i_fc_list) {
1016			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1017				inode = &ei->vfs_inode;
1018				break;
1019			}
1020		}
1021		/*
1022		 * If we don't find inode in our list, then it was deleted,
1023		 * in which case, we don't need to record it's create tag.
1024		 */
1025		if (!inode)
1026			continue;
1027		spin_unlock(&sbi->s_fc_lock);
1028
1029		/*
1030		 * We first write the inode and then the create dirent. This
1031		 * allows the recovery code to create an unnamed inode first
1032		 * and then link it to a directory entry. This allows us
1033		 * to use namei.c routines almost as is and simplifies
1034		 * the recovery code.
1035		 */
1036		ret = ext4_fc_write_inode(inode, crc);
1037		if (ret)
1038			goto lock_and_exit;
1039
1040		ret = ext4_fc_write_inode_data(inode, crc);
1041		if (ret)
1042			goto lock_and_exit;
1043
1044		if (!ext4_fc_add_dentry_tlv(
1045			sb, fc_dentry->fcd_op,
1046			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1047			fc_dentry->fcd_name.len,
1048			fc_dentry->fcd_name.name, crc)) {
1049			ret = -ENOSPC;
1050			goto lock_and_exit;
1051		}
1052
1053		spin_lock(&sbi->s_fc_lock);
1054	}
1055	return 0;
1056lock_and_exit:
1057	spin_lock(&sbi->s_fc_lock);
1058	return ret;
1059}
1060
1061static int ext4_fc_perform_commit(journal_t *journal)
1062{
1063	struct super_block *sb = (struct super_block *)(journal->j_private);
1064	struct ext4_sb_info *sbi = EXT4_SB(sb);
1065	struct ext4_inode_info *iter;
1066	struct ext4_fc_head head;
1067	struct inode *inode;
1068	struct blk_plug plug;
1069	int ret = 0;
1070	u32 crc = 0;
1071
1072	ret = ext4_fc_submit_inode_data_all(journal);
1073	if (ret)
1074		return ret;
1075
1076	ret = ext4_fc_wait_inode_data_all(journal);
1077	if (ret)
1078		return ret;
1079
1080	/*
1081	 * If file system device is different from journal device, issue a cache
1082	 * flush before we start writing fast commit blocks.
1083	 */
1084	if (journal->j_fs_dev != journal->j_dev)
1085		blkdev_issue_flush(journal->j_fs_dev);
1086
1087	blk_start_plug(&plug);
1088	if (sbi->s_fc_bytes == 0) {
1089		/*
1090		 * Add a head tag only if this is the first fast commit
1091		 * in this TID.
1092		 */
1093		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1094		head.fc_tid = cpu_to_le32(
1095			sbi->s_journal->j_running_transaction->t_tid);
1096		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1097			(u8 *)&head, &crc)) {
1098			ret = -ENOSPC;
1099			goto out;
1100		}
1101	}
1102
1103	spin_lock(&sbi->s_fc_lock);
1104	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1105	if (ret) {
1106		spin_unlock(&sbi->s_fc_lock);
1107		goto out;
1108	}
1109
1110	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1111		inode = &iter->vfs_inode;
1112		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1113			continue;
1114
1115		spin_unlock(&sbi->s_fc_lock);
1116		ret = ext4_fc_write_inode_data(inode, &crc);
1117		if (ret)
1118			goto out;
1119		ret = ext4_fc_write_inode(inode, &crc);
1120		if (ret)
1121			goto out;
1122		spin_lock(&sbi->s_fc_lock);
1123	}
1124	spin_unlock(&sbi->s_fc_lock);
1125
1126	ret = ext4_fc_write_tail(sb, crc);
1127
1128out:
1129	blk_finish_plug(&plug);
1130	return ret;
1131}
1132
1133/*
1134 * The main commit entry point. Performs a fast commit for transaction
1135 * commit_tid if needed. If it's not possible to perform a fast commit
1136 * due to various reasons, we fall back to full commit. Returns 0
1137 * on success, error otherwise.
1138 */
1139int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1140{
1141	struct super_block *sb = (struct super_block *)(journal->j_private);
1142	struct ext4_sb_info *sbi = EXT4_SB(sb);
1143	int nblks = 0, ret, bsize = journal->j_blocksize;
1144	int subtid = atomic_read(&sbi->s_fc_subtid);
1145	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1146	ktime_t start_time, commit_time;
1147
1148	trace_ext4_fc_commit_start(sb);
1149
1150	start_time = ktime_get();
1151
1152	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1153		(ext4_fc_is_ineligible(sb))) {
1154		reason = EXT4_FC_REASON_INELIGIBLE;
1155		goto out;
1156	}
1157
1158restart_fc:
1159	ret = jbd2_fc_begin_commit(journal, commit_tid);
1160	if (ret == -EALREADY) {
1161		/* There was an ongoing commit, check if we need to restart */
1162		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1163			commit_tid > journal->j_commit_sequence)
1164			goto restart_fc;
1165		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1166		goto out;
1167	} else if (ret) {
1168		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169		reason = EXT4_FC_REASON_FC_START_FAILED;
1170		goto out;
1171	}
1172
1173	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1174	ret = ext4_fc_perform_commit(journal);
1175	if (ret < 0) {
1176		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177		reason = EXT4_FC_REASON_FC_FAILED;
1178		goto out;
1179	}
1180	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181	ret = jbd2_fc_wait_bufs(journal, nblks);
1182	if (ret < 0) {
1183		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184		reason = EXT4_FC_REASON_FC_FAILED;
1185		goto out;
1186	}
1187	atomic_inc(&sbi->s_fc_subtid);
1188	jbd2_fc_end_commit(journal);
1189out:
1190	/* Has any ineligible update happened since we started? */
1191	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1192		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1193		reason = EXT4_FC_REASON_INELIGIBLE;
1194	}
1195
1196	spin_lock(&sbi->s_fc_lock);
1197	if (reason != EXT4_FC_REASON_OK &&
1198		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1199		sbi->s_fc_stats.fc_ineligible_commits++;
1200	} else {
1201		sbi->s_fc_stats.fc_num_commits++;
1202		sbi->s_fc_stats.fc_numblks += nblks;
1203	}
1204	spin_unlock(&sbi->s_fc_lock);
1205	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1206	trace_ext4_fc_commit_stop(sb, nblks, reason);
1207	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1208	/*
1209	 * weight the commit time higher than the average time so we don't
1210	 * react too strongly to vast changes in the commit time
1211	 */
1212	if (likely(sbi->s_fc_avg_commit_time))
1213		sbi->s_fc_avg_commit_time = (commit_time +
1214				sbi->s_fc_avg_commit_time * 3) / 4;
1215	else
1216		sbi->s_fc_avg_commit_time = commit_time;
1217	jbd_debug(1,
1218		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1219		nblks, reason, subtid);
1220	if (reason == EXT4_FC_REASON_FC_FAILED)
1221		return jbd2_fc_end_commit_fallback(journal);
1222	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1223		reason == EXT4_FC_REASON_INELIGIBLE)
1224		return jbd2_complete_transaction(journal, commit_tid);
1225	return 0;
1226}
1227
1228/*
1229 * Fast commit cleanup routine. This is called after every fast commit and
1230 * full commit. full is true if we are called after a full commit.
1231 */
1232static void ext4_fc_cleanup(journal_t *journal, int full)
1233{
1234	struct super_block *sb = journal->j_private;
1235	struct ext4_sb_info *sbi = EXT4_SB(sb);
1236	struct ext4_inode_info *iter, *iter_n;
1237	struct ext4_fc_dentry_update *fc_dentry;
1238
1239	if (full && sbi->s_fc_bh)
1240		sbi->s_fc_bh = NULL;
1241
1242	jbd2_fc_release_bufs(journal);
1243
1244	spin_lock(&sbi->s_fc_lock);
1245	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1246				 i_fc_list) {
1247		list_del_init(&iter->i_fc_list);
1248		ext4_clear_inode_state(&iter->vfs_inode,
1249				       EXT4_STATE_FC_COMMITTING);
1250		ext4_fc_reset_inode(&iter->vfs_inode);
1251		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1252		smp_mb();
1253#if (BITS_PER_LONG < 64)
1254		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1255#else
1256		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1257#endif
1258	}
1259
1260	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1261		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1262					     struct ext4_fc_dentry_update,
1263					     fcd_list);
1264		list_del_init(&fc_dentry->fcd_list);
1265		spin_unlock(&sbi->s_fc_lock);
1266
1267		if (fc_dentry->fcd_name.name &&
1268			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1269			kfree(fc_dentry->fcd_name.name);
1270		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1271		spin_lock(&sbi->s_fc_lock);
1272	}
1273
1274	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1275				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1276	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1277				&sbi->s_fc_q[FC_Q_MAIN]);
1278
1279	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1280	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1281
1282	if (full)
1283		sbi->s_fc_bytes = 0;
1284	spin_unlock(&sbi->s_fc_lock);
1285	trace_ext4_fc_stats(sb);
1286}
1287
1288/* Ext4 Replay Path Routines */
1289
1290/* Helper struct for dentry replay routines */
1291struct dentry_info_args {
1292	int parent_ino, dname_len, ino, inode_len;
1293	char *dname;
1294};
1295
1296static inline void tl_to_darg(struct dentry_info_args *darg,
1297			      struct  ext4_fc_tl *tl, u8 *val)
1298{
1299	struct ext4_fc_dentry_info fcd;
1300
1301	memcpy(&fcd, val, sizeof(fcd));
1302
1303	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1304	darg->ino = le32_to_cpu(fcd.fc_ino);
1305	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1306	darg->dname_len = le16_to_cpu(tl->fc_len) -
1307		sizeof(struct ext4_fc_dentry_info);
1308}
1309
1310/* Unlink replay function */
1311static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1312				 u8 *val)
1313{
1314	struct inode *inode, *old_parent;
1315	struct qstr entry;
1316	struct dentry_info_args darg;
1317	int ret = 0;
1318
1319	tl_to_darg(&darg, tl, val);
1320
1321	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1322			darg.parent_ino, darg.dname_len);
1323
1324	entry.name = darg.dname;
1325	entry.len = darg.dname_len;
1326	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1327
1328	if (IS_ERR(inode)) {
1329		jbd_debug(1, "Inode %d not found", darg.ino);
1330		return 0;
1331	}
1332
1333	old_parent = ext4_iget(sb, darg.parent_ino,
1334				EXT4_IGET_NORMAL);
1335	if (IS_ERR(old_parent)) {
1336		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1337		iput(inode);
1338		return 0;
1339	}
1340
1341	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1342	/* -ENOENT ok coz it might not exist anymore. */
1343	if (ret == -ENOENT)
1344		ret = 0;
1345	iput(old_parent);
1346	iput(inode);
1347	return ret;
1348}
1349
1350static int ext4_fc_replay_link_internal(struct super_block *sb,
1351				struct dentry_info_args *darg,
1352				struct inode *inode)
1353{
1354	struct inode *dir = NULL;
1355	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1356	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1357	int ret = 0;
1358
1359	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1360	if (IS_ERR(dir)) {
1361		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1362		dir = NULL;
1363		goto out;
1364	}
1365
1366	dentry_dir = d_obtain_alias(dir);
1367	if (IS_ERR(dentry_dir)) {
1368		jbd_debug(1, "Failed to obtain dentry");
1369		dentry_dir = NULL;
1370		goto out;
1371	}
1372
1373	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1374	if (!dentry_inode) {
1375		jbd_debug(1, "Inode dentry not created.");
1376		ret = -ENOMEM;
1377		goto out;
1378	}
1379
1380	ret = __ext4_link(dir, inode, dentry_inode);
1381	/*
1382	 * It's possible that link already existed since data blocks
1383	 * for the dir in question got persisted before we crashed OR
1384	 * we replayed this tag and crashed before the entire replay
1385	 * could complete.
1386	 */
1387	if (ret && ret != -EEXIST) {
1388		jbd_debug(1, "Failed to link\n");
1389		goto out;
1390	}
1391
1392	ret = 0;
1393out:
1394	if (dentry_dir) {
1395		d_drop(dentry_dir);
1396		dput(dentry_dir);
1397	} else if (dir) {
1398		iput(dir);
1399	}
1400	if (dentry_inode) {
1401		d_drop(dentry_inode);
1402		dput(dentry_inode);
1403	}
1404
1405	return ret;
1406}
1407
1408/* Link replay function */
1409static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1410			       u8 *val)
1411{
1412	struct inode *inode;
1413	struct dentry_info_args darg;
1414	int ret = 0;
1415
1416	tl_to_darg(&darg, tl, val);
1417	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1418			darg.parent_ino, darg.dname_len);
1419
1420	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421	if (IS_ERR(inode)) {
1422		jbd_debug(1, "Inode not found.");
1423		return 0;
1424	}
1425
1426	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1427	iput(inode);
1428	return ret;
1429}
1430
1431/*
1432 * Record all the modified inodes during replay. We use this later to setup
1433 * block bitmaps correctly.
1434 */
1435static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1436{
1437	struct ext4_fc_replay_state *state;
1438	int i;
1439
1440	state = &EXT4_SB(sb)->s_fc_replay_state;
1441	for (i = 0; i < state->fc_modified_inodes_used; i++)
1442		if (state->fc_modified_inodes[i] == ino)
1443			return 0;
1444	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1445		state->fc_modified_inodes_size +=
1446			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1447		state->fc_modified_inodes = krealloc(
1448					state->fc_modified_inodes, sizeof(int) *
1449					state->fc_modified_inodes_size,
1450					GFP_KERNEL);
1451		if (!state->fc_modified_inodes)
1452			return -ENOMEM;
1453	}
1454	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1455	return 0;
1456}
1457
1458/*
1459 * Inode replay function
1460 */
1461static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1462				u8 *val)
1463{
1464	struct ext4_fc_inode fc_inode;
1465	struct ext4_inode *raw_inode;
1466	struct ext4_inode *raw_fc_inode;
1467	struct inode *inode = NULL;
1468	struct ext4_iloc iloc;
1469	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1470	struct ext4_extent_header *eh;
1471
1472	memcpy(&fc_inode, val, sizeof(fc_inode));
1473
1474	ino = le32_to_cpu(fc_inode.fc_ino);
1475	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1476
1477	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1478	if (!IS_ERR(inode)) {
1479		ext4_ext_clear_bb(inode);
1480		iput(inode);
1481	}
1482	inode = NULL;
1483
1484	ext4_fc_record_modified_inode(sb, ino);
1485
1486	raw_fc_inode = (struct ext4_inode *)
1487		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1488	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1489	if (ret)
1490		goto out;
1491
1492	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1493	raw_inode = ext4_raw_inode(&iloc);
1494
1495	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1496	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1497		inode_len - offsetof(struct ext4_inode, i_generation));
1498	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1499		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1500		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1501			memset(eh, 0, sizeof(*eh));
1502			eh->eh_magic = EXT4_EXT_MAGIC;
1503			eh->eh_max = cpu_to_le16(
1504				(sizeof(raw_inode->i_block) -
1505				 sizeof(struct ext4_extent_header))
1506				 / sizeof(struct ext4_extent));
1507		}
1508	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1509		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1510			sizeof(raw_inode->i_block));
1511	}
1512
1513	/* Immediately update the inode on disk. */
1514	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515	if (ret)
1516		goto out;
1517	ret = sync_dirty_buffer(iloc.bh);
1518	if (ret)
1519		goto out;
1520	ret = ext4_mark_inode_used(sb, ino);
1521	if (ret)
1522		goto out;
1523
1524	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1525	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1526	if (IS_ERR(inode)) {
1527		jbd_debug(1, "Inode not found.");
1528		return -EFSCORRUPTED;
1529	}
1530
1531	/*
1532	 * Our allocator could have made different decisions than before
1533	 * crashing. This should be fixed but until then, we calculate
1534	 * the number of blocks the inode.
1535	 */
1536	ext4_ext_replay_set_iblocks(inode);
1537
1538	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1539	ext4_reset_inode_seed(inode);
1540
1541	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1542	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1543	sync_dirty_buffer(iloc.bh);
1544	brelse(iloc.bh);
1545out:
1546	iput(inode);
1547	if (!ret)
1548		blkdev_issue_flush(sb->s_bdev);
1549
1550	return 0;
1551}
1552
1553/*
1554 * Dentry create replay function.
1555 *
1556 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1557 * inode for which we are trying to create a dentry here, should already have
1558 * been replayed before we start here.
1559 */
1560static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1561				 u8 *val)
1562{
1563	int ret = 0;
1564	struct inode *inode = NULL;
1565	struct inode *dir = NULL;
1566	struct dentry_info_args darg;
1567
1568	tl_to_darg(&darg, tl, val);
1569
1570	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571			darg.parent_ino, darg.dname_len);
1572
1573	/* This takes care of update group descriptor and other metadata */
1574	ret = ext4_mark_inode_used(sb, darg.ino);
1575	if (ret)
1576		goto out;
1577
1578	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579	if (IS_ERR(inode)) {
1580		jbd_debug(1, "inode %d not found.", darg.ino);
1581		inode = NULL;
1582		ret = -EINVAL;
1583		goto out;
1584	}
1585
1586	if (S_ISDIR(inode->i_mode)) {
1587		/*
1588		 * If we are creating a directory, we need to make sure that the
1589		 * dot and dot dot dirents are setup properly.
1590		 */
1591		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592		if (IS_ERR(dir)) {
1593			jbd_debug(1, "Dir %d not found.", darg.ino);
1594			goto out;
1595		}
1596		ret = ext4_init_new_dir(NULL, dir, inode);
1597		iput(dir);
1598		if (ret) {
1599			ret = 0;
1600			goto out;
1601		}
1602	}
1603	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604	if (ret)
1605		goto out;
1606	set_nlink(inode, 1);
1607	ext4_mark_inode_dirty(NULL, inode);
1608out:
1609	if (inode)
1610		iput(inode);
1611	return ret;
1612}
1613
1614/*
1615 * Record physical disk regions which are in use as per fast commit area. Our
1616 * simple replay phase allocator excludes these regions from allocation.
1617 */
1618static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620{
1621	struct ext4_fc_replay_state *state;
1622	struct ext4_fc_alloc_region *region;
1623
1624	state = &EXT4_SB(sb)->s_fc_replay_state;
1625	if (state->fc_regions_used == state->fc_regions_size) {
1626		state->fc_regions_size +=
1627			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1628		state->fc_regions = krealloc(
1629					state->fc_regions,
1630					state->fc_regions_size *
1631					sizeof(struct ext4_fc_alloc_region),
1632					GFP_KERNEL);
1633		if (!state->fc_regions)
1634			return -ENOMEM;
1635	}
1636	region = &state->fc_regions[state->fc_regions_used++];
1637	region->ino = ino;
1638	region->lblk = lblk;
1639	region->pblk = pblk;
1640	region->len = len;
1641
1642	return 0;
1643}
1644
1645/* Replay add range tag */
1646static int ext4_fc_replay_add_range(struct super_block *sb,
1647				    struct ext4_fc_tl *tl, u8 *val)
1648{
1649	struct ext4_fc_add_range fc_add_ex;
1650	struct ext4_extent newex, *ex;
1651	struct inode *inode;
1652	ext4_lblk_t start, cur;
1653	int remaining, len;
1654	ext4_fsblk_t start_pblk;
1655	struct ext4_map_blocks map;
1656	struct ext4_ext_path *path = NULL;
1657	int ret;
1658
1659	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1660	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1661
1662	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1664		ext4_ext_get_actual_len(ex));
1665
1666	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1667	if (IS_ERR(inode)) {
1668		jbd_debug(1, "Inode not found.");
1669		return 0;
1670	}
1671
1672	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1673
1674	start = le32_to_cpu(ex->ee_block);
1675	start_pblk = ext4_ext_pblock(ex);
1676	len = ext4_ext_get_actual_len(ex);
1677
1678	cur = start;
1679	remaining = len;
1680	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1681		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1682		  inode->i_ino);
1683
1684	while (remaining > 0) {
1685		map.m_lblk = cur;
1686		map.m_len = remaining;
1687		map.m_pblk = 0;
1688		ret = ext4_map_blocks(NULL, inode, &map, 0);
1689
1690		if (ret < 0) {
1691			iput(inode);
1692			return 0;
1693		}
1694
1695		if (ret == 0) {
1696			/* Range is not mapped */
1697			path = ext4_find_extent(inode, cur, NULL, 0);
1698			if (IS_ERR(path)) {
1699				iput(inode);
1700				return 0;
1701			}
1702			memset(&newex, 0, sizeof(newex));
1703			newex.ee_block = cpu_to_le32(cur);
1704			ext4_ext_store_pblock(
1705				&newex, start_pblk + cur - start);
1706			newex.ee_len = cpu_to_le16(map.m_len);
1707			if (ext4_ext_is_unwritten(ex))
1708				ext4_ext_mark_unwritten(&newex);
1709			down_write(&EXT4_I(inode)->i_data_sem);
1710			ret = ext4_ext_insert_extent(
1711				NULL, inode, &path, &newex, 0);
1712			up_write((&EXT4_I(inode)->i_data_sem));
1713			ext4_ext_drop_refs(path);
1714			kfree(path);
1715			if (ret) {
1716				iput(inode);
1717				return 0;
1718			}
1719			goto next;
1720		}
1721
1722		if (start_pblk + cur - start != map.m_pblk) {
1723			/*
1724			 * Logical to physical mapping changed. This can happen
1725			 * if this range was removed and then reallocated to
1726			 * map to new physical blocks during a fast commit.
1727			 */
1728			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729					ext4_ext_is_unwritten(ex),
1730					start_pblk + cur - start);
1731			if (ret) {
1732				iput(inode);
1733				return 0;
1734			}
1735			/*
1736			 * Mark the old blocks as free since they aren't used
1737			 * anymore. We maintain an array of all the modified
1738			 * inodes. In case these blocks are still used at either
1739			 * a different logical range in the same inode or in
1740			 * some different inode, we will mark them as allocated
1741			 * at the end of the FC replay using our array of
1742			 * modified inodes.
1743			 */
1744			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1745			goto next;
1746		}
1747
1748		/* Range is mapped and needs a state change */
1749		jbd_debug(1, "Converting from %ld to %d %lld",
1750				map.m_flags & EXT4_MAP_UNWRITTEN,
1751			ext4_ext_is_unwritten(ex), map.m_pblk);
1752		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1753					ext4_ext_is_unwritten(ex), map.m_pblk);
1754		if (ret) {
1755			iput(inode);
1756			return 0;
1757		}
1758		/*
1759		 * We may have split the extent tree while toggling the state.
1760		 * Try to shrink the extent tree now.
1761		 */
1762		ext4_ext_replay_shrink_inode(inode, start + len);
1763next:
1764		cur += map.m_len;
1765		remaining -= map.m_len;
1766	}
1767	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1768					sb->s_blocksize_bits);
1769	iput(inode);
1770	return 0;
1771}
1772
1773/* Replay DEL_RANGE tag */
1774static int
1775ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1776			 u8 *val)
1777{
1778	struct inode *inode;
1779	struct ext4_fc_del_range lrange;
1780	struct ext4_map_blocks map;
1781	ext4_lblk_t cur, remaining;
1782	int ret;
1783
1784	memcpy(&lrange, val, sizeof(lrange));
1785	cur = le32_to_cpu(lrange.fc_lblk);
1786	remaining = le32_to_cpu(lrange.fc_len);
1787
1788	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789		le32_to_cpu(lrange.fc_ino), cur, remaining);
1790
1791	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1792	if (IS_ERR(inode)) {
1793		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1794		return 0;
1795	}
1796
1797	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1798
1799	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1800			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1801			le32_to_cpu(lrange.fc_len));
1802	while (remaining > 0) {
1803		map.m_lblk = cur;
1804		map.m_len = remaining;
1805
1806		ret = ext4_map_blocks(NULL, inode, &map, 0);
1807		if (ret < 0) {
1808			iput(inode);
1809			return 0;
1810		}
1811		if (ret > 0) {
1812			remaining -= ret;
1813			cur += ret;
1814			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815		} else {
1816			remaining -= map.m_len;
1817			cur += map.m_len;
1818		}
1819	}
1820
1821	ret = ext4_punch_hole(inode,
1822		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1823		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1824	if (ret)
1825		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826	ext4_ext_replay_shrink_inode(inode,
1827		i_size_read(inode) >> sb->s_blocksize_bits);
1828	ext4_mark_inode_dirty(NULL, inode);
1829	iput(inode);
1830
1831	return 0;
1832}
1833
1834static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1835{
1836	struct ext4_fc_replay_state *state;
1837	struct inode *inode;
1838	struct ext4_ext_path *path = NULL;
1839	struct ext4_map_blocks map;
1840	int i, ret, j;
1841	ext4_lblk_t cur, end;
1842
1843	state = &EXT4_SB(sb)->s_fc_replay_state;
1844	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1845		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1846			EXT4_IGET_NORMAL);
1847		if (IS_ERR(inode)) {
1848			jbd_debug(1, "Inode %d not found.",
1849				state->fc_modified_inodes[i]);
1850			continue;
1851		}
1852		cur = 0;
1853		end = EXT_MAX_BLOCKS;
1854		while (cur < end) {
1855			map.m_lblk = cur;
1856			map.m_len = end - cur;
1857
1858			ret = ext4_map_blocks(NULL, inode, &map, 0);
1859			if (ret < 0)
1860				break;
1861
1862			if (ret > 0) {
1863				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1864				if (!IS_ERR(path)) {
1865					for (j = 0; j < path->p_depth; j++)
1866						ext4_mb_mark_bb(inode->i_sb,
1867							path[j].p_block, 1, 1);
1868					ext4_ext_drop_refs(path);
1869					kfree(path);
1870				}
1871				cur += ret;
1872				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1873							map.m_len, 1);
1874			} else {
1875				cur = cur + (map.m_len ? map.m_len : 1);
1876			}
1877		}
1878		iput(inode);
1879	}
1880}
1881
1882/*
1883 * Check if block is in excluded regions for block allocation. The simple
1884 * allocator that runs during replay phase is calls this function to see
1885 * if it is okay to use a block.
1886 */
1887bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1888{
1889	int i;
1890	struct ext4_fc_replay_state *state;
1891
1892	state = &EXT4_SB(sb)->s_fc_replay_state;
1893	for (i = 0; i < state->fc_regions_valid; i++) {
1894		if (state->fc_regions[i].ino == 0 ||
1895			state->fc_regions[i].len == 0)
1896			continue;
1897		if (blk >= state->fc_regions[i].pblk &&
1898		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1899			return true;
1900	}
1901	return false;
1902}
1903
1904/* Cleanup function called after replay */
1905void ext4_fc_replay_cleanup(struct super_block *sb)
1906{
1907	struct ext4_sb_info *sbi = EXT4_SB(sb);
1908
1909	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1910	kfree(sbi->s_fc_replay_state.fc_regions);
1911	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1912}
1913
1914/*
1915 * Recovery Scan phase handler
1916 *
1917 * This function is called during the scan phase and is responsible
1918 * for doing following things:
1919 * - Make sure the fast commit area has valid tags for replay
1920 * - Count number of tags that need to be replayed by the replay handler
1921 * - Verify CRC
1922 * - Create a list of excluded blocks for allocation during replay phase
1923 *
1924 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1925 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1926 * to indicate that scan has finished and JBD2 can now start replay phase.
1927 * It returns a negative error to indicate that there was an error. At the end
1928 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1929 * to indicate the number of tags that need to replayed during the replay phase.
1930 */
1931static int ext4_fc_replay_scan(journal_t *journal,
1932				struct buffer_head *bh, int off,
1933				tid_t expected_tid)
1934{
1935	struct super_block *sb = journal->j_private;
1936	struct ext4_sb_info *sbi = EXT4_SB(sb);
1937	struct ext4_fc_replay_state *state;
1938	int ret = JBD2_FC_REPLAY_CONTINUE;
1939	struct ext4_fc_add_range ext;
1940	struct ext4_fc_tl tl;
1941	struct ext4_fc_tail tail;
1942	__u8 *start, *end, *cur, *val;
1943	struct ext4_fc_head head;
1944	struct ext4_extent *ex;
1945
1946	state = &sbi->s_fc_replay_state;
1947
1948	start = (u8 *)bh->b_data;
1949	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1950
1951	if (state->fc_replay_expected_off == 0) {
1952		state->fc_cur_tag = 0;
1953		state->fc_replay_num_tags = 0;
1954		state->fc_crc = 0;
1955		state->fc_regions = NULL;
1956		state->fc_regions_valid = state->fc_regions_used =
1957			state->fc_regions_size = 0;
1958		/* Check if we can stop early */
1959		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1960			!= EXT4_FC_TAG_HEAD)
1961			return 0;
1962	}
1963
1964	if (off != state->fc_replay_expected_off) {
1965		ret = -EFSCORRUPTED;
1966		goto out_err;
1967	}
1968
1969	state->fc_replay_expected_off++;
1970	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1971		memcpy(&tl, cur, sizeof(tl));
1972		val = cur + sizeof(tl);
1973		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1974			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1975		switch (le16_to_cpu(tl.fc_tag)) {
1976		case EXT4_FC_TAG_ADD_RANGE:
1977			memcpy(&ext, val, sizeof(ext));
1978			ex = (struct ext4_extent *)&ext.fc_ex;
1979			ret = ext4_fc_record_regions(sb,
1980				le32_to_cpu(ext.fc_ino),
1981				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1982				ext4_ext_get_actual_len(ex));
1983			if (ret < 0)
1984				break;
1985			ret = JBD2_FC_REPLAY_CONTINUE;
1986			fallthrough;
1987		case EXT4_FC_TAG_DEL_RANGE:
1988		case EXT4_FC_TAG_LINK:
1989		case EXT4_FC_TAG_UNLINK:
1990		case EXT4_FC_TAG_CREAT:
1991		case EXT4_FC_TAG_INODE:
1992		case EXT4_FC_TAG_PAD:
1993			state->fc_cur_tag++;
1994			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995					sizeof(tl) + le16_to_cpu(tl.fc_len));
1996			break;
1997		case EXT4_FC_TAG_TAIL:
1998			state->fc_cur_tag++;
1999			memcpy(&tail, val, sizeof(tail));
2000			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2001						sizeof(tl) +
2002						offsetof(struct ext4_fc_tail,
2003						fc_crc));
2004			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2005				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2006				state->fc_replay_num_tags = state->fc_cur_tag;
2007				state->fc_regions_valid =
2008					state->fc_regions_used;
2009			} else {
2010				ret = state->fc_replay_num_tags ?
2011					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2012			}
2013			state->fc_crc = 0;
2014			break;
2015		case EXT4_FC_TAG_HEAD:
2016			memcpy(&head, val, sizeof(head));
2017			if (le32_to_cpu(head.fc_features) &
2018				~EXT4_FC_SUPPORTED_FEATURES) {
2019				ret = -EOPNOTSUPP;
2020				break;
2021			}
2022			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2023				ret = JBD2_FC_REPLAY_STOP;
2024				break;
2025			}
2026			state->fc_cur_tag++;
2027			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2028					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2029			break;
2030		default:
2031			ret = state->fc_replay_num_tags ?
2032				JBD2_FC_REPLAY_STOP : -ECANCELED;
2033		}
2034		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2035			break;
2036	}
2037
2038out_err:
2039	trace_ext4_fc_replay_scan(sb, ret, off);
2040	return ret;
2041}
2042
2043/*
2044 * Main recovery path entry point.
2045 * The meaning of return codes is similar as above.
2046 */
2047static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2048				enum passtype pass, int off, tid_t expected_tid)
2049{
2050	struct super_block *sb = journal->j_private;
2051	struct ext4_sb_info *sbi = EXT4_SB(sb);
2052	struct ext4_fc_tl tl;
2053	__u8 *start, *end, *cur, *val;
2054	int ret = JBD2_FC_REPLAY_CONTINUE;
2055	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2056	struct ext4_fc_tail tail;
2057
2058	if (pass == PASS_SCAN) {
2059		state->fc_current_pass = PASS_SCAN;
2060		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2061	}
2062
2063	if (state->fc_current_pass != pass) {
2064		state->fc_current_pass = pass;
2065		sbi->s_mount_state |= EXT4_FC_REPLAY;
2066	}
2067	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2068		jbd_debug(1, "Replay stops\n");
2069		ext4_fc_set_bitmaps_and_counters(sb);
2070		return 0;
2071	}
2072
2073#ifdef CONFIG_EXT4_DEBUG
2074	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2075		pr_warn("Dropping fc block %d because max_replay set\n", off);
2076		return JBD2_FC_REPLAY_STOP;
2077	}
2078#endif
2079
2080	start = (u8 *)bh->b_data;
2081	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2082
2083	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2084		memcpy(&tl, cur, sizeof(tl));
2085		val = cur + sizeof(tl);
2086
2087		if (state->fc_replay_num_tags == 0) {
2088			ret = JBD2_FC_REPLAY_STOP;
2089			ext4_fc_set_bitmaps_and_counters(sb);
2090			break;
2091		}
2092		jbd_debug(3, "Replay phase, tag:%s\n",
2093				tag2str(le16_to_cpu(tl.fc_tag)));
2094		state->fc_replay_num_tags--;
2095		switch (le16_to_cpu(tl.fc_tag)) {
2096		case EXT4_FC_TAG_LINK:
2097			ret = ext4_fc_replay_link(sb, &tl, val);
2098			break;
2099		case EXT4_FC_TAG_UNLINK:
2100			ret = ext4_fc_replay_unlink(sb, &tl, val);
2101			break;
2102		case EXT4_FC_TAG_ADD_RANGE:
2103			ret = ext4_fc_replay_add_range(sb, &tl, val);
2104			break;
2105		case EXT4_FC_TAG_CREAT:
2106			ret = ext4_fc_replay_create(sb, &tl, val);
2107			break;
2108		case EXT4_FC_TAG_DEL_RANGE:
2109			ret = ext4_fc_replay_del_range(sb, &tl, val);
2110			break;
2111		case EXT4_FC_TAG_INODE:
2112			ret = ext4_fc_replay_inode(sb, &tl, val);
2113			break;
2114		case EXT4_FC_TAG_PAD:
2115			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2116					     le16_to_cpu(tl.fc_len), 0);
2117			break;
2118		case EXT4_FC_TAG_TAIL:
2119			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2120					     le16_to_cpu(tl.fc_len), 0);
2121			memcpy(&tail, val, sizeof(tail));
2122			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2123			break;
2124		case EXT4_FC_TAG_HEAD:
2125			break;
2126		default:
2127			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2128					     le16_to_cpu(tl.fc_len), 0);
2129			ret = -ECANCELED;
2130			break;
2131		}
2132		if (ret < 0)
2133			break;
2134		ret = JBD2_FC_REPLAY_CONTINUE;
2135	}
2136	return ret;
2137}
2138
2139void ext4_fc_init(struct super_block *sb, journal_t *journal)
2140{
2141	/*
2142	 * We set replay callback even if fast commit disabled because we may
2143	 * could still have fast commit blocks that need to be replayed even if
2144	 * fast commit has now been turned off.
2145	 */
2146	journal->j_fc_replay_callback = ext4_fc_replay;
2147	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2148		return;
2149	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2150}
2151
2152static const char *fc_ineligible_reasons[] = {
2153	"Extended attributes changed",
2154	"Cross rename",
2155	"Journal flag changed",
2156	"Insufficient memory",
2157	"Swap boot",
2158	"Resize",
2159	"Dir renamed",
2160	"Falloc range op",
2161	"Data journalling",
2162	"FC Commit Failed"
2163};
2164
2165int ext4_fc_info_show(struct seq_file *seq, void *v)
2166{
2167	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2168	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2169	int i;
2170
2171	if (v != SEQ_START_TOKEN)
2172		return 0;
2173
2174	seq_printf(seq,
2175		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2176		   stats->fc_num_commits, stats->fc_ineligible_commits,
2177		   stats->fc_numblks,
2178		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2179	seq_puts(seq, "Ineligible reasons:\n");
2180	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2181		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2182			stats->fc_ineligible_reason_count[i]);
2183
2184	return 0;
2185}
2186
2187int __init ext4_fc_init_dentry_cache(void)
2188{
2189	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2190					   SLAB_RECLAIM_ACCOUNT);
2191
2192	if (ext4_fc_dentry_cachep == NULL)
2193		return -ENOMEM;
2194
2195	return 0;
2196}