// Linux v5.14.15
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For scenarios where we currently don't
  24 * have replay code, fast commit falls back to full commits.
  25 * Fast commits record deltas in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
  41 *				  during recovery. Note that the iblocks field
  42 *				  is not replayed and is instead derived during
  43 *				  replay.
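 *
 * Each tag is laid out on disk as a tag-length header immediately followed
 * by its value. A minimal sketch of the header (the authoritative definition
 * and the per-tag value layouts live in fs/ext4/fast_commit.h):
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// EXT4_FC_TAG_* value
 *		__le16 fc_len;	// length of the value that follows, in bytes
 *	};
 *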
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
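 *
 * In the code below, steps [1]-[3] are implemented by
 * ext4_fc_submit_inode_data_all() and ext4_fc_wait_inode_data_all(), steps
 * [4]-[6] by ext4_fc_perform_commit() and its helpers, and step [7] by
 * jbd2_fc_wait_bufs() in ext4_fc_commit().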
  60 *
  61 * All inode updates must call ext4_fc_start_update() before starting an
  62 * update and ext4_fc_stop_update() once the update completes. If an update
  63 * is still ongoing when a fast commit starts, the fast commit waits for it
  64 * to complete.
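 *
 * A minimal usage sketch (hypothetical caller, for illustration only):
 *
 *	ext4_fc_start_update(inode);	// blocks while the inode is being committed
 *	// ... perform the inode update ...
 *	ext4_fc_stop_update(inode);	// wakes up a waiting fast commit, if any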
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 * Not all operations are supported by fast commits today (e.g. extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * following two functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
  73 *   fall back to a full commit. This is useful for transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible(): These make all
  76 *   fast commits that happen between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible(), plus one fast commit after the call to
  78 *   ext4_fc_stop_ineligible(), fall back to full commits. Forcing one more
  79 *   fast commit to fall back after the stop call guarantees that the
  80 *   ineligible operation bracketed by ext4_fc_start_ineligible() and
  81 *   ext4_fc_stop_ineligible() is followed by at least one full commit.
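 *
 * A minimal usage sketch (hypothetical call site, for illustration only):
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_XATTR);
 *	// ... perform the fast-commit-ineligible operation ...
 *	ext4_fc_stop_ineligible(sb);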
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses the EXT4_FC_TAG_TAIL tag, which marks a fast commit as complete. The
  88 * tail tag contains the CRC of the contents and the TID of the transaction
  89 * after which this fast commit should be applied. Recovery code replays fast
  90 * commit logs only if there is at least one valid tail present. Every fast
  91 * commit operation has exactly one tail, so we may end up with multiple
  92 * tails in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
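 *
 * The tail's value is, in essence (the authoritative definition lives in
 * fs/ext4/fast_commit.h):
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID of the transaction this commit belongs to
 *		__le32 fc_crc;	// CRC of the fast commit contents
 *	};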
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commit tags are idempotent in nature, provided the recovery code
 110 * follows certain rules. The guiding principle of the commit path is that it
 111 * stores the result of a particular operation instead of storing the
 112 * procedure itself.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when the recovery code runs, it needs to "enforce" this state on the
 124 * file system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider the following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of
 138 * storing the procedure, fast commits store the outcome of each step. Thus
 139 * the fast commit log for the above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 152 * handled similarly. Thus, by converting a non-idempotent procedure into a
 153 * series of idempotent outcomes, fast commits ensure idempotence of replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that, if we crash during fast commit replay, the
 161 *    next recovery attempt will find a file system whose fast commit area is
 162 *    invalid (because a new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called from much higher-level
 172 *    functions. This can be made more fine grained by combining them with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
  243/*
  244 * Inform Ext4's fast commit subsystem about the start of an inode update
  245 *
  246 * This function is called by high-level VFS callbacks before performing any
  247 * inode update. This function blocks if there's an ongoing fast commit on
  248 * the inode in question.
  249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
  314/*
  315 * Mark the file system as fast commit ineligible. This means that the next
  316 * commit operation will result in a full jbd2 commit.
  317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 324		return;
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
  370 * Generic fast commit tracking function. If this is the first time we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
  377 * If enqueue is set, this function enqueues the inode in the fast commit list.
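 *
 * For example, inode tracking later in this file is wired up as (see
 * ext4_fc_track_inode()):
 *
 *	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);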
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423	struct dentry *dentry;
 424	int op;
 425};
 426
  427/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 436
 437	mutex_unlock(&ei->i_fc_lock);
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517			  struct dentry *dentry)
 518{
 519	struct __track_dentry_update_args args;
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538	if (update)
 539		return -EEXIST;
 540
 541	EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543	return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548	int ret;
 549
 550	if (S_ISDIR(inode->i_mode))
 551		return;
 552
 553	if (ext4_should_journal_data(inode)) {
 554		ext4_fc_mark_ineligible(inode->i_sb,
 555					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556		return;
 557	}
 558
 559	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560	trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564	ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570	struct ext4_inode_info *ei = EXT4_I(inode);
 571	ext4_lblk_t oldstart;
 572	struct __track_range_args *__arg =
 573		(struct __track_range_args *)arg;
 574
 575	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577		return -ECANCELED;
 578	}
 579
 580	oldstart = ei->i_fc_lblk_start;
 581
 582	if (update && ei->i_fc_lblk_len > 0) {
 583		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584		ei->i_fc_lblk_len =
 585			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586				ei->i_fc_lblk_start + 1;
 587	} else {
 588		ei->i_fc_lblk_start = __arg->start;
 589		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590	}
 591
 592	return 0;
 593}
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596			 ext4_lblk_t end)
 597{
 598	struct __track_range_args args;
 599	int ret;
 600
 601	if (S_ISDIR(inode->i_mode))
 602		return;
 603
 604	args.start = start;
 605	args.end = end;
 606
 607	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609	trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614	int write_flags = REQ_SYNC;
 615	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
  617	/* Add REQ_FUA | REQ_PREFLUSH only if it's the tail */
 618	if (test_opt(sb, BARRIER) && is_tail)
 619		write_flags |= REQ_FUA | REQ_PREFLUSH;
 620	lock_buffer(bh);
 621	set_buffer_dirty(bh);
 622	set_buffer_uptodate(bh);
 623	bh->b_end_io = ext4_end_buffer_io_sync;
 624	submit_bh(REQ_OP_WRITE, write_flags, bh);
 625	EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632				u32 *crc)
 633{
 634	void *ret;
 635
 636	ret = memset(dst, 0, len);
 637	if (crc)
 638		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639	return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
  645 * During commit time this function is used to manage fast commit
  646 * block space. We don't split a fast commit log entry across
  647 * blocks. So this function makes sure that if there's not enough space
  648 * on the current block, the remaining space in the current block is
  649 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
  650 * new block is requested from jbd2 and the CRC is updated to reflect
  651 * the padding we added.
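 *
 * As a worked example (assuming a 4096-byte journal block): with off = 4090
 * and len = 16, only bsize - off - 1 = 5 bytes are usable, less than the
 * len + sizeof(struct ext4_fc_tl) = 20 bytes needed, so a PAD tag with
 * fc_len = 4096 - 4090 - 1 - sizeof(struct ext4_fc_tl) = 1 is written at the
 * current offset, the block is submitted, and the request is served from a
 * fresh jbd2 block.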
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655	struct ext4_fc_tl *tl;
 656	struct ext4_sb_info *sbi = EXT4_SB(sb);
 657	struct buffer_head *bh;
 658	int bsize = sbi->s_journal->j_blocksize;
 659	int ret, off = sbi->s_fc_bytes % bsize;
 660	int pad_len;
 661
 662	/*
 663	 * After allocating len, we should have space at least for a 0 byte
 664	 * padding.
 665	 */
 666	if (len + sizeof(struct ext4_fc_tl) > bsize)
 667		return NULL;
 668
 669	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670		/*
 671		 * Only allocate from current buffer if we have enough space for
 672		 * this request AND we have space to add a zero byte padding.
 673		 */
 674		if (!sbi->s_fc_bh) {
 675			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676			if (ret)
 677				return NULL;
 678			sbi->s_fc_bh = bh;
 679		}
 680		sbi->s_fc_bytes += len;
 681		return sbi->s_fc_bh->b_data + off;
 682	}
 683	/* Need to add PAD tag */
 684	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687	tl->fc_len = cpu_to_le16(pad_len);
 688	if (crc)
 689		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690	if (pad_len > 0)
 691		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 692	ext4_fc_submit_bh(sb, false);
 693
 694	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695	if (ret)
 696		return NULL;
 697	sbi->s_fc_bh = bh;
 698	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699	return sbi->s_fc_bh->b_data;
 700}
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704				int len, u32 *crc)
 705{
 706	if (crc)
 707		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708	return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing tail tag marks the end of a fast commit. In order to guarantee
 715 * atomicity, after writing tail tag, even if there's space remaining
 716 * in the block, next commit shouldn't use it. That's why tail tag
 717 * has the length as that of the remaining space on the block.
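 *
 * For example (assuming a 4096-byte block): if, after reserving space for
 * the tail, the offset within the block is off = 4000, then
 * fc_len = 4096 - 4000 - 1 + sizeof(struct ext4_fc_tail) = 103, i.e. the
 * tail's declared value runs to the last usable byte of the block, so a
 * later commit cannot append anything after it.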
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721	struct ext4_sb_info *sbi = EXT4_SB(sb);
 722	struct ext4_fc_tl tl;
 723	struct ext4_fc_tail tail;
 724	int off, bsize = sbi->s_journal->j_blocksize;
 725	u8 *dst;
 726
 727	/*
 728	 * ext4_fc_reserve_space takes care of allocating an extra block if
  729	 * there's not enough space on this block to accommodate this tail.
 730	 */
 731	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732	if (!dst)
 733		return -ENOSPC;
 734
 735	off = sbi->s_fc_bytes % bsize;
 736
 737	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742	dst += sizeof(tl);
 743	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745	dst += sizeof(tail.fc_tid);
 746	tail.fc_crc = cpu_to_le32(crc);
 747	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 748
 749	ext4_fc_submit_bh(sb, true);
 750
 751	return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759			   u32 *crc)
 760{
 761	struct ext4_fc_tl tl;
 762	u8 *dst;
 763
 764	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765	if (!dst)
 766		return false;
 767
 768	tl.fc_tag = cpu_to_le16(tag);
 769	tl.fc_len = cpu_to_le16(len);
 770
 771	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774	return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779					int parent_ino, int ino, int dlen,
 780					const unsigned char *dname,
 781					u32 *crc)
 782{
 783	struct ext4_fc_dentry_info fcd;
 784	struct ext4_fc_tl tl;
 785	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786					crc);
 787
 788	if (!dst)
 789		return false;
 790
 791	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792	fcd.fc_ino = cpu_to_le32(ino);
 793	tl.fc_tag = cpu_to_le16(tag);
 794	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796	dst += sizeof(tl);
 797	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798	dst += sizeof(fcd);
 799	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800	dst += dlen;
 801
 802	return true;
 803}
 804
 805/*
 806 * Writes inode in the fast commit space under TLV with tag @tag.
 807 * Returns 0 on success, error on failure.
 808 */
 809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810{
 811	struct ext4_inode_info *ei = EXT4_I(inode);
 812	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813	int ret;
 814	struct ext4_iloc iloc;
 815	struct ext4_fc_inode fc_inode;
 816	struct ext4_fc_tl tl;
 817	u8 *dst;
 818
 819	ret = ext4_get_inode_loc(inode, &iloc);
 820	if (ret)
 821		return ret;
 822
 823	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 824		inode_len += ei->i_extra_isize;
 825
 826	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 830	dst = ext4_fc_reserve_space(inode->i_sb,
 831			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832	if (!dst)
 833		return -ECANCELED;
 834
 835	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836		return -ECANCELED;
 837	dst += sizeof(tl);
 838	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839		return -ECANCELED;
 840	dst += sizeof(fc_inode);
 841	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842					inode_len, crc))
 843		return -ECANCELED;
 844
 845	return 0;
 846}
 847
 848/*
 849 * Writes updated data ranges for the inode in question. Updates CRC.
 850 * Returns 0 on success, error otherwise.
 851 */
 852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853{
 854	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855	struct ext4_inode_info *ei = EXT4_I(inode);
 856	struct ext4_map_blocks map;
 857	struct ext4_fc_add_range fc_ext;
 858	struct ext4_fc_del_range lrange;
 859	struct ext4_extent *ex;
 860	int ret;
 861
 862	mutex_lock(&ei->i_fc_lock);
 863	if (ei->i_fc_lblk_len == 0) {
 864		mutex_unlock(&ei->i_fc_lock);
 865		return 0;
 866	}
 867	old_blk_size = ei->i_fc_lblk_start;
 868	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869	ei->i_fc_lblk_len = 0;
 870	mutex_unlock(&ei->i_fc_lock);
 871
 872	cur_lblk_off = old_blk_size;
 873	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876	while (cur_lblk_off <= new_blk_size) {
 877		map.m_lblk = cur_lblk_off;
 878		map.m_len = new_blk_size - cur_lblk_off + 1;
 879		ret = ext4_map_blocks(NULL, inode, &map, 0);
 880		if (ret < 0)
 881			return -ECANCELED;
 882
 883		if (map.m_len == 0) {
 884			cur_lblk_off++;
 885			continue;
 886		}
 887
 888		if (ret == 0) {
 889			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891			lrange.fc_len = cpu_to_le32(map.m_len);
 892			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893					    sizeof(lrange), (u8 *)&lrange, crc))
 894				return -ENOSPC;
 895		} else {
 896			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 897				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 898
 899			/* Limit the number of blocks in one extent */
 900			map.m_len = min(max, map.m_len);
 901
 902			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 903			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 904			ex->ee_block = cpu_to_le32(map.m_lblk);
 905			ex->ee_len = cpu_to_le16(map.m_len);
 906			ext4_ext_store_pblock(ex, map.m_pblk);
 907			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 908				ext4_ext_mark_unwritten(ex);
 909			else
 910				ext4_ext_mark_initialized(ex);
 911			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 912					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 913				return -ENOSPC;
 914		}
 915
 916		cur_lblk_off += map.m_len;
 917	}
 918
 919	return 0;
 920}
 921
 922
 923/* Submit data for all the fast commit inodes */
 924static int ext4_fc_submit_inode_data_all(journal_t *journal)
 925{
 926	struct super_block *sb = (struct super_block *)(journal->j_private);
 927	struct ext4_sb_info *sbi = EXT4_SB(sb);
 928	struct ext4_inode_info *ei;
 929	int ret = 0;
 930
 931	spin_lock(&sbi->s_fc_lock);
 932	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 933	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 934		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 935		while (atomic_read(&ei->i_fc_updates)) {
 936			DEFINE_WAIT(wait);
 937
 938			prepare_to_wait(&ei->i_fc_wait, &wait,
 939						TASK_UNINTERRUPTIBLE);
 940			if (atomic_read(&ei->i_fc_updates)) {
 941				spin_unlock(&sbi->s_fc_lock);
 942				schedule();
 943				spin_lock(&sbi->s_fc_lock);
 944			}
 945			finish_wait(&ei->i_fc_wait, &wait);
 946		}
 947		spin_unlock(&sbi->s_fc_lock);
 948		ret = jbd2_submit_inode_data(ei->jinode);
 949		if (ret)
 950			return ret;
 951		spin_lock(&sbi->s_fc_lock);
 952	}
 953	spin_unlock(&sbi->s_fc_lock);
 954
 955	return ret;
 956}
 957
 958/* Wait for completion of data for all the fast commit inodes */
 959static int ext4_fc_wait_inode_data_all(journal_t *journal)
 960{
 961	struct super_block *sb = (struct super_block *)(journal->j_private);
 962	struct ext4_sb_info *sbi = EXT4_SB(sb);
 963	struct ext4_inode_info *pos, *n;
 964	int ret = 0;
 965
 966	spin_lock(&sbi->s_fc_lock);
 967	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 968		if (!ext4_test_inode_state(&pos->vfs_inode,
 969					   EXT4_STATE_FC_COMMITTING))
 970			continue;
 971		spin_unlock(&sbi->s_fc_lock);
 972
 973		ret = jbd2_wait_inode_data(journal, pos->jinode);
 974		if (ret)
 975			return ret;
 976		spin_lock(&sbi->s_fc_lock);
 977	}
 978	spin_unlock(&sbi->s_fc_lock);
 979
 980	return 0;
 981}
 982
 983/* Commit all the directory entry updates */
 984static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 985__acquires(&sbi->s_fc_lock)
 986__releases(&sbi->s_fc_lock)
 987{
 988	struct super_block *sb = (struct super_block *)(journal->j_private);
 989	struct ext4_sb_info *sbi = EXT4_SB(sb);
 990	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 991	struct inode *inode;
 992	struct ext4_inode_info *ei, *ei_n;
 993	int ret;
 994
 995	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 996		return 0;
 997	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 998				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 999		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1000			spin_unlock(&sbi->s_fc_lock);
1001			if (!ext4_fc_add_dentry_tlv(
1002				sb, fc_dentry->fcd_op,
1003				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1004				fc_dentry->fcd_name.len,
1005				fc_dentry->fcd_name.name, crc)) {
1006				ret = -ENOSPC;
1007				goto lock_and_exit;
1008			}
1009			spin_lock(&sbi->s_fc_lock);
1010			continue;
1011		}
1012
1013		inode = NULL;
1014		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1015					 i_fc_list) {
1016			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1017				inode = &ei->vfs_inode;
1018				break;
1019			}
1020		}
1021		/*
1022		 * If we don't find the inode in our list, then it was deleted,
1023		 * in which case we don't need to record its create tag.
1024		 */
1025		if (!inode)
1026			continue;
1027		spin_unlock(&sbi->s_fc_lock);
1028
1029		/*
1030		 * We first write the inode and then the create dirent. This
1031		 * allows the recovery code to create an unnamed inode first
1032		 * and then link it to a directory entry. This allows us
1033		 * to use namei.c routines almost as is and simplifies
1034		 * the recovery code.
1035		 */
1036		ret = ext4_fc_write_inode(inode, crc);
1037		if (ret)
1038			goto lock_and_exit;
1039
1040		ret = ext4_fc_write_inode_data(inode, crc);
1041		if (ret)
1042			goto lock_and_exit;
1043
1044		if (!ext4_fc_add_dentry_tlv(
1045			sb, fc_dentry->fcd_op,
1046			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1047			fc_dentry->fcd_name.len,
1048			fc_dentry->fcd_name.name, crc)) {
1049			ret = -ENOSPC;
1050			goto lock_and_exit;
1051		}
1052
1053		spin_lock(&sbi->s_fc_lock);
1054	}
1055	return 0;
1056lock_and_exit:
1057	spin_lock(&sbi->s_fc_lock);
1058	return ret;
1059}
1060
1061static int ext4_fc_perform_commit(journal_t *journal)
1062{
1063	struct super_block *sb = (struct super_block *)(journal->j_private);
1064	struct ext4_sb_info *sbi = EXT4_SB(sb);
1065	struct ext4_inode_info *iter;
1066	struct ext4_fc_head head;
1067	struct inode *inode;
1068	struct blk_plug plug;
1069	int ret = 0;
1070	u32 crc = 0;
1071
1072	ret = ext4_fc_submit_inode_data_all(journal);
1073	if (ret)
1074		return ret;
1075
1076	ret = ext4_fc_wait_inode_data_all(journal);
1077	if (ret)
1078		return ret;
1079
1080	/*
1081	 * If file system device is different from journal device, issue a cache
1082	 * flush before we start writing fast commit blocks.
1083	 */
1084	if (journal->j_fs_dev != journal->j_dev)
1085		blkdev_issue_flush(journal->j_fs_dev);
1086
1087	blk_start_plug(&plug);
1088	if (sbi->s_fc_bytes == 0) {
1089		/*
1090		 * Add a head tag only if this is the first fast commit
1091		 * in this TID.
1092		 */
1093		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1094		head.fc_tid = cpu_to_le32(
1095			sbi->s_journal->j_running_transaction->t_tid);
1096		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1097			(u8 *)&head, &crc)) {
1098			ret = -ENOSPC;
1099			goto out;
1100		}
1101	}
1102
1103	spin_lock(&sbi->s_fc_lock);
1104	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1105	if (ret) {
1106		spin_unlock(&sbi->s_fc_lock);
1107		goto out;
1108	}
1109
1110	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1111		inode = &iter->vfs_inode;
1112		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1113			continue;
1114
1115		spin_unlock(&sbi->s_fc_lock);
1116		ret = ext4_fc_write_inode_data(inode, &crc);
1117		if (ret)
1118			goto out;
1119		ret = ext4_fc_write_inode(inode, &crc);
1120		if (ret)
1121			goto out;
1122		spin_lock(&sbi->s_fc_lock);
1123	}
1124	spin_unlock(&sbi->s_fc_lock);
1125
1126	ret = ext4_fc_write_tail(sb, crc);
1127
1128out:
1129	blk_finish_plug(&plug);
1130	return ret;
1131}
1132
1133/*
1134 * The main commit entry point. Performs a fast commit for transaction
1135 * commit_tid if needed. If it's not possible to perform a fast commit
1136 * due to various reasons, we fall back to full commit. Returns 0
1137 * on success, error otherwise.
1138 */
1139int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1140{
1141	struct super_block *sb = (struct super_block *)(journal->j_private);
1142	struct ext4_sb_info *sbi = EXT4_SB(sb);
1143	int nblks = 0, ret, bsize = journal->j_blocksize;
1144	int subtid = atomic_read(&sbi->s_fc_subtid);
1145	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1146	ktime_t start_time, commit_time;
1147
1148	trace_ext4_fc_commit_start(sb);
1149
1150	start_time = ktime_get();
1151
1152	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1153		(ext4_fc_is_ineligible(sb))) {
1154		reason = EXT4_FC_REASON_INELIGIBLE;
1155		goto out;
1156	}
1157
1158restart_fc:
1159	ret = jbd2_fc_begin_commit(journal, commit_tid);
1160	if (ret == -EALREADY) {
1161		/* There was an ongoing commit, check if we need to restart */
1162		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1163			commit_tid > journal->j_commit_sequence)
1164			goto restart_fc;
1165		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1166		goto out;
1167	} else if (ret) {
1168		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169		reason = EXT4_FC_REASON_FC_START_FAILED;
1170		goto out;
1171	}
1172
1173	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1174	ret = ext4_fc_perform_commit(journal);
1175	if (ret < 0) {
1176		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177		reason = EXT4_FC_REASON_FC_FAILED;
1178		goto out;
1179	}
1180	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181	ret = jbd2_fc_wait_bufs(journal, nblks);
1182	if (ret < 0) {
1183		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184		reason = EXT4_FC_REASON_FC_FAILED;
1185		goto out;
1186	}
1187	atomic_inc(&sbi->s_fc_subtid);
1188	jbd2_fc_end_commit(journal);
1189out:
1190	/* Has any ineligible update happened since we started? */
1191	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1192		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1193		reason = EXT4_FC_REASON_INELIGIBLE;
1194	}
1195
1196	spin_lock(&sbi->s_fc_lock);
1197	if (reason != EXT4_FC_REASON_OK &&
1198		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1199		sbi->s_fc_stats.fc_ineligible_commits++;
1200	} else {
1201		sbi->s_fc_stats.fc_num_commits++;
1202		sbi->s_fc_stats.fc_numblks += nblks;
1203	}
1204	spin_unlock(&sbi->s_fc_lock);
1205	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1206	trace_ext4_fc_commit_stop(sb, nblks, reason);
1207	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1208	/*
1209	 * weight the commit time higher than the average time so we don't
1210	 * react too strongly to vast changes in the commit time
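	 * (this is an exponentially weighted moving average: e.g. with an
	 * average of 100us and a commit_time of 200us, the new average is
	 * (200 + 3 * 100) / 4 = 125us)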
1211	 */
1212	if (likely(sbi->s_fc_avg_commit_time))
1213		sbi->s_fc_avg_commit_time = (commit_time +
1214				sbi->s_fc_avg_commit_time * 3) / 4;
1215	else
1216		sbi->s_fc_avg_commit_time = commit_time;
1217	jbd_debug(1,
1218		"Fast commit ended with blks = %d, reason = %d, subtid - %d",
1219		nblks, reason, subtid);
1220	if (reason == EXT4_FC_REASON_FC_FAILED)
1221		return jbd2_fc_end_commit_fallback(journal);
1222	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1223		reason == EXT4_FC_REASON_INELIGIBLE)
1224		return jbd2_complete_transaction(journal, commit_tid);
1225	return 0;
1226}
1227
1228/*
1229 * Fast commit cleanup routine. This is called after every fast commit and
1230 * full commit. full is true if we are called after a full commit.
1231 */
1232static void ext4_fc_cleanup(journal_t *journal, int full)
1233{
1234	struct super_block *sb = journal->j_private;
1235	struct ext4_sb_info *sbi = EXT4_SB(sb);
1236	struct ext4_inode_info *iter, *iter_n;
1237	struct ext4_fc_dentry_update *fc_dentry;
1238
1239	if (full && sbi->s_fc_bh)
1240		sbi->s_fc_bh = NULL;
1241
1242	jbd2_fc_release_bufs(journal);
1243
1244	spin_lock(&sbi->s_fc_lock);
1245	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1246				 i_fc_list) {
1247		list_del_init(&iter->i_fc_list);
1248		ext4_clear_inode_state(&iter->vfs_inode,
1249				       EXT4_STATE_FC_COMMITTING);
1250		ext4_fc_reset_inode(&iter->vfs_inode);
1251		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1252		smp_mb();
1253#if (BITS_PER_LONG < 64)
1254		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1255#else
1256		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1257#endif
1258	}
1259
1260	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1261		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1262					     struct ext4_fc_dentry_update,
1263					     fcd_list);
1264		list_del_init(&fc_dentry->fcd_list);
1265		spin_unlock(&sbi->s_fc_lock);
1266
1267		if (fc_dentry->fcd_name.name &&
1268			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1269			kfree(fc_dentry->fcd_name.name);
1270		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1271		spin_lock(&sbi->s_fc_lock);
1272	}
1273
1274	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1275				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1276	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1277				&sbi->s_fc_q[FC_Q_MAIN]);
1278
1279	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1280	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1281
1282	if (full)
1283		sbi->s_fc_bytes = 0;
1284	spin_unlock(&sbi->s_fc_lock);
1285	trace_ext4_fc_stats(sb);
1286}
1287
1288/* Ext4 Replay Path Routines */
1289
1290/* Helper struct for dentry replay routines */
1291struct dentry_info_args {
1292	int parent_ino, dname_len, ino, inode_len;
1293	char *dname;
1294};
1295
1296static inline void tl_to_darg(struct dentry_info_args *darg,
1297			      struct  ext4_fc_tl *tl, u8 *val)
1298{
1299	struct ext4_fc_dentry_info fcd;
1300
1301	memcpy(&fcd, val, sizeof(fcd));
1302
1303	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1304	darg->ino = le32_to_cpu(fcd.fc_ino);
1305	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1306	darg->dname_len = le16_to_cpu(tl->fc_len) -
1307		sizeof(struct ext4_fc_dentry_info);
1308}
1309
1310/* Unlink replay function */
1311static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1312				 u8 *val)
1313{
1314	struct inode *inode, *old_parent;
1315	struct qstr entry;
1316	struct dentry_info_args darg;
1317	int ret = 0;
1318
1319	tl_to_darg(&darg, tl, val);
1320
1321	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1322			darg.parent_ino, darg.dname_len);
1323
1324	entry.name = darg.dname;
1325	entry.len = darg.dname_len;
1326	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1327
1328	if (IS_ERR(inode)) {
1329		jbd_debug(1, "Inode %d not found", darg.ino);
1330		return 0;
1331	}
1332
1333	old_parent = ext4_iget(sb, darg.parent_ino,
1334				EXT4_IGET_NORMAL);
1335	if (IS_ERR(old_parent)) {
1336		jbd_debug(1, "Dir with inode  %d not found", darg.parent_ino);
1337		iput(inode);
1338		return 0;
1339	}
1340
1341	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
 1342	/* -ENOENT is ok because it might not exist anymore. */
1343	if (ret == -ENOENT)
1344		ret = 0;
1345	iput(old_parent);
1346	iput(inode);
1347	return ret;
1348}
1349
1350static int ext4_fc_replay_link_internal(struct super_block *sb,
1351				struct dentry_info_args *darg,
1352				struct inode *inode)
1353{
1354	struct inode *dir = NULL;
1355	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1356	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1357	int ret = 0;
1358
1359	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1360	if (IS_ERR(dir)) {
1361		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1362		dir = NULL;
1363		goto out;
1364	}
1365
1366	dentry_dir = d_obtain_alias(dir);
1367	if (IS_ERR(dentry_dir)) {
1368		jbd_debug(1, "Failed to obtain dentry");
1369		dentry_dir = NULL;
1370		goto out;
1371	}
1372
1373	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1374	if (!dentry_inode) {
1375		jbd_debug(1, "Inode dentry not created.");
1376		ret = -ENOMEM;
1377		goto out;
1378	}
1379
1380	ret = __ext4_link(dir, inode, dentry_inode);
1381	/*
1382	 * It's possible that link already existed since data blocks
1383	 * for the dir in question got persisted before we crashed OR
1384	 * we replayed this tag and crashed before the entire replay
1385	 * could complete.
1386	 */
1387	if (ret && ret != -EEXIST) {
1388		jbd_debug(1, "Failed to link\n");
1389		goto out;
1390	}
1391
1392	ret = 0;
1393out:
1394	if (dentry_dir) {
1395		d_drop(dentry_dir);
1396		dput(dentry_dir);
1397	} else if (dir) {
1398		iput(dir);
1399	}
1400	if (dentry_inode) {
1401		d_drop(dentry_inode);
1402		dput(dentry_inode);
1403	}
1404
1405	return ret;
1406}
1407
1408/* Link replay function */
1409static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1410			       u8 *val)
1411{
1412	struct inode *inode;
1413	struct dentry_info_args darg;
1414	int ret = 0;
1415
1416	tl_to_darg(&darg, tl, val);
1417	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1418			darg.parent_ino, darg.dname_len);
1419
1420	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421	if (IS_ERR(inode)) {
1422		jbd_debug(1, "Inode not found.");
1423		return 0;
1424	}
1425
1426	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1427	iput(inode);
1428	return ret;
1429}
1430
1431/*
 1432 * Record all the modified inodes during replay. We use this later to set up
 1433 * the block bitmaps correctly.
1434 */
1435static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1436{
1437	struct ext4_fc_replay_state *state;
1438	int i;
1439
1440	state = &EXT4_SB(sb)->s_fc_replay_state;
1441	for (i = 0; i < state->fc_modified_inodes_used; i++)
1442		if (state->fc_modified_inodes[i] == ino)
1443			return 0;
1444	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1445		state->fc_modified_inodes_size +=
1446			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1447		state->fc_modified_inodes = krealloc(
1448					state->fc_modified_inodes, sizeof(int) *
1449					state->fc_modified_inodes_size,
1450					GFP_KERNEL);
1451		if (!state->fc_modified_inodes)
1452			return -ENOMEM;
1453	}
1454	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1455	return 0;
1456}
1457
1458/*
1459 * Inode replay function
1460 */
1461static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1462				u8 *val)
1463{
1464	struct ext4_fc_inode fc_inode;
1465	struct ext4_inode *raw_inode;
1466	struct ext4_inode *raw_fc_inode;
1467	struct inode *inode = NULL;
1468	struct ext4_iloc iloc;
1469	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1470	struct ext4_extent_header *eh;
1471
1472	memcpy(&fc_inode, val, sizeof(fc_inode));
1473
1474	ino = le32_to_cpu(fc_inode.fc_ino);
1475	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1476
1477	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1478	if (!IS_ERR(inode)) {
1479		ext4_ext_clear_bb(inode);
1480		iput(inode);
1481	}
1482	inode = NULL;
1483
1484	ext4_fc_record_modified_inode(sb, ino);
1485
1486	raw_fc_inode = (struct ext4_inode *)
1487		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1488	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1489	if (ret)
1490		goto out;
1491
1492	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1493	raw_inode = ext4_raw_inode(&iloc);
1494
1495	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1496	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1497		inode_len - offsetof(struct ext4_inode, i_generation));
1498	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1499		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1500		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1501			memset(eh, 0, sizeof(*eh));
1502			eh->eh_magic = EXT4_EXT_MAGIC;
1503			eh->eh_max = cpu_to_le16(
1504				(sizeof(raw_inode->i_block) -
1505				 sizeof(struct ext4_extent_header))
1506				 / sizeof(struct ext4_extent));
1507		}
1508	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1509		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1510			sizeof(raw_inode->i_block));
1511	}
1512
1513	/* Immediately update the inode on disk. */
1514	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515	if (ret)
1516		goto out;
1517	ret = sync_dirty_buffer(iloc.bh);
1518	if (ret)
1519		goto out;
1520	ret = ext4_mark_inode_used(sb, ino);
1521	if (ret)
1522		goto out;
1523
1524	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1525	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1526	if (IS_ERR(inode)) {
1527		jbd_debug(1, "Inode not found.");
1528		return -EFSCORRUPTED;
1529	}
1530
1531	/*
1532	 * Our allocator could have made different decisions than before
 1533	 * crashing. This should be fixed but until then, we recalculate
 1534	 * the number of blocks for the inode.
1535	 */
1536	ext4_ext_replay_set_iblocks(inode);
1537
1538	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1539	ext4_reset_inode_seed(inode);
1540
1541	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1542	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1543	sync_dirty_buffer(iloc.bh);
1544	brelse(iloc.bh);
1545out:
1546	iput(inode);
1547	if (!ret)
1548		blkdev_issue_flush(sb->s_bdev);
1549
1550	return 0;
1551}
1552
1553/*
1554 * Dentry create replay function.
1555 *
 1556 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 1557 * inode for which we are trying to create a dentry here should already have
 1558 * been replayed before we get here.
1559 */
1560static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1561				 u8 *val)
1562{
1563	int ret = 0;
1564	struct inode *inode = NULL;
1565	struct inode *dir = NULL;
1566	struct dentry_info_args darg;
1567
1568	tl_to_darg(&darg, tl, val);
1569
1570	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571			darg.parent_ino, darg.dname_len);
1572
 1573	/* This takes care of updating the group descriptor and other metadata */
1574	ret = ext4_mark_inode_used(sb, darg.ino);
1575	if (ret)
1576		goto out;
1577
1578	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579	if (IS_ERR(inode)) {
1580		jbd_debug(1, "inode %d not found.", darg.ino);
1581		inode = NULL;
1582		ret = -EINVAL;
1583		goto out;
1584	}
1585
1586	if (S_ISDIR(inode->i_mode)) {
1587		/*
1588		 * If we are creating a directory, we need to make sure that the
 1589		 * dot and dot-dot dirents are set up properly.
1590		 */
1591		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592		if (IS_ERR(dir)) {
 1593			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1594			goto out;
1595		}
1596		ret = ext4_init_new_dir(NULL, dir, inode);
1597		iput(dir);
1598		if (ret) {
1599			ret = 0;
1600			goto out;
1601		}
1602	}
1603	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604	if (ret)
1605		goto out;
1606	set_nlink(inode, 1);
1607	ext4_mark_inode_dirty(NULL, inode);
1608out:
1609	if (inode)
1610		iput(inode);
1611	return ret;
1612}
1613
1614/*
 1615 * Record the physical disk regions that the fast commit area marks as in
 1616 * use. Our simple replay-phase allocator excludes these regions from allocation.
1617 */
1618static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620{
1621	struct ext4_fc_replay_state *state;
1622	struct ext4_fc_alloc_region *region;
1623
1624	state = &EXT4_SB(sb)->s_fc_replay_state;
1625	if (state->fc_regions_used == state->fc_regions_size) {
1626		state->fc_regions_size +=
1627			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1628		state->fc_regions = krealloc(
1629					state->fc_regions,
1630					state->fc_regions_size *
1631					sizeof(struct ext4_fc_alloc_region),
1632					GFP_KERNEL);
1633		if (!state->fc_regions)
1634			return -ENOMEM;
1635	}
1636	region = &state->fc_regions[state->fc_regions_used++];
1637	region->ino = ino;
1638	region->lblk = lblk;
1639	region->pblk = pblk;
1640	region->len = len;
1641
1642	return 0;
1643}
1644
1645/* Replay add range tag */
1646static int ext4_fc_replay_add_range(struct super_block *sb,
1647				    struct ext4_fc_tl *tl, u8 *val)
1648{
1649	struct ext4_fc_add_range fc_add_ex;
1650	struct ext4_extent newex, *ex;
1651	struct inode *inode;
1652	ext4_lblk_t start, cur;
1653	int remaining, len;
1654	ext4_fsblk_t start_pblk;
1655	struct ext4_map_blocks map;
1656	struct ext4_ext_path *path = NULL;
1657	int ret;
1658
1659	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1660	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1661
1662	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1664		ext4_ext_get_actual_len(ex));
1665
1666	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1667	if (IS_ERR(inode)) {
1668		jbd_debug(1, "Inode not found.");
1669		return 0;
1670	}
1671
1672	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1673
1674	start = le32_to_cpu(ex->ee_block);
1675	start_pblk = ext4_ext_pblock(ex);
1676	len = ext4_ext_get_actual_len(ex);
1677
1678	cur = start;
1679	remaining = len;
1680	jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1681		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1682		  inode->i_ino);
1683
1684	while (remaining > 0) {
1685		map.m_lblk = cur;
1686		map.m_len = remaining;
1687		map.m_pblk = 0;
1688		ret = ext4_map_blocks(NULL, inode, &map, 0);
1689
1690		if (ret < 0) {
1691			iput(inode);
1692			return 0;
1693		}
1694
1695		if (ret == 0) {
1696			/* Range is not mapped */
1697			path = ext4_find_extent(inode, cur, NULL, 0);
1698			if (IS_ERR(path)) {
1699				iput(inode);
1700				return 0;
1701			}
1702			memset(&newex, 0, sizeof(newex));
1703			newex.ee_block = cpu_to_le32(cur);
1704			ext4_ext_store_pblock(
1705				&newex, start_pblk + cur - start);
1706			newex.ee_len = cpu_to_le16(map.m_len);
1707			if (ext4_ext_is_unwritten(ex))
1708				ext4_ext_mark_unwritten(&newex);
1709			down_write(&EXT4_I(inode)->i_data_sem);
1710			ret = ext4_ext_insert_extent(
1711				NULL, inode, &path, &newex, 0);
1712			up_write((&EXT4_I(inode)->i_data_sem));
1713			ext4_ext_drop_refs(path);
1714			kfree(path);
1715			if (ret) {
1716				iput(inode);
1717				return 0;
1718			}
1719			goto next;
1720		}
1721
1722		if (start_pblk + cur - start != map.m_pblk) {
1723			/*
1724			 * Logical to physical mapping changed. This can happen
1725			 * if this range was removed and then reallocated to
1726			 * map to new physical blocks during a fast commit.
1727			 */
1728			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729					ext4_ext_is_unwritten(ex),
1730					start_pblk + cur - start);
1731			if (ret) {
1732				iput(inode);
1733				return 0;
1734			}
1735			/*
1736			 * Mark the old blocks as free since they aren't used
1737			 * anymore. We maintain an array of all the modified
1738			 * inodes. In case these blocks are still used at either
1739			 * a different logical range in the same inode or in
1740			 * some different inode, we will mark them as allocated
1741			 * at the end of the FC replay using our array of
1742			 * modified inodes.
1743			 */
1744			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1745			goto next;
1746		}
1747
1748		/* Range is mapped and needs a state change */
1749		jbd_debug(1, "Converting from %ld to %d %lld",
1750				map.m_flags & EXT4_MAP_UNWRITTEN,
1751			ext4_ext_is_unwritten(ex), map.m_pblk);
1752		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1753					ext4_ext_is_unwritten(ex), map.m_pblk);
1754		if (ret) {
1755			iput(inode);
1756			return 0;
1757		}
1758		/*
1759		 * We may have split the extent tree while toggling the state.
1760		 * Try to shrink the extent tree now.
1761		 */
1762		ext4_ext_replay_shrink_inode(inode, start + len);
1763next:
1764		cur += map.m_len;
1765		remaining -= map.m_len;
1766	}
1767	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1768					sb->s_blocksize_bits);
 
 
1769	iput(inode);
1770	return 0;
1771}
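
[Editorial note] The loop above distinguishes three cases for each chunk that ext4_map_blocks() reports. A hedged, self-contained sketch of just that classification; the enum and function are illustrative, not kernel API:

	enum replay_action {
		INSERT_EXTENT,	/* range not mapped: insert a fresh extent */
		MOVE_EXTENT,	/* mapped elsewhere: update mapping, free old blocks */
		TOGGLE_STATE,	/* mapped in place: only the unwritten state differs */
	};

	static enum replay_action classify(int mapped_len,
					   unsigned long long expected_pblk,
					   unsigned long long actual_pblk)
	{
		if (mapped_len == 0)
			return INSERT_EXTENT;
		if (expected_pblk != actual_pblk)
			return MOVE_EXTENT;
		return TOGGLE_STATE;
	}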
1772
1773/* Replay DEL_RANGE tag */
1774static int
1775ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1776			 u8 *val)
1777{
1778	struct inode *inode;
1779	struct ext4_fc_del_range lrange;
1780	struct ext4_map_blocks map;
1781	ext4_lblk_t cur, remaining;
1782	int ret;
1783
1784	memcpy(&lrange, val, sizeof(lrange));
1785	cur = le32_to_cpu(lrange.fc_lblk);
1786	remaining = le32_to_cpu(lrange.fc_len);
1787
1788	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789		le32_to_cpu(lrange.fc_ino), cur, remaining);
1790
1791	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1792	if (IS_ERR(inode)) {
1793		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1794		return 0;
1795	}
1796
1797	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1798
1799	jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1800			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1801			le32_to_cpu(lrange.fc_len));
1802	while (remaining > 0) {
1803		map.m_lblk = cur;
1804		map.m_len = remaining;
1805
1806		ret = ext4_map_blocks(NULL, inode, &map, 0);
1807		if (ret < 0) {
1808			iput(inode);
1809			return 0;
1810		}
1811		if (ret > 0) {
1812			remaining -= ret;
1813			cur += ret;
1814			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815		} else {
1816			remaining -= map.m_len;
1817			cur += map.m_len;
1818		}
1819	}
1820
1821	ret = ext4_punch_hole(inode,
1822		le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1823		le32_to_cpu(lrange.fc_len) <<  sb->s_blocksize_bits);
1824	if (ret)
1825		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826	ext4_ext_replay_shrink_inode(inode,
1827		i_size_read(inode) >> sb->s_blocksize_bits);
1828	ext4_mark_inode_dirty(NULL, inode);
1829	iput(inode);
1830
1831	return 0;
1832}
1833
1834static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1835{
1836	struct ext4_fc_replay_state *state;
1837	struct inode *inode;
1838	struct ext4_ext_path *path = NULL;
1839	struct ext4_map_blocks map;
1840	int i, ret, j;
1841	ext4_lblk_t cur, end;
1842
1843	state = &EXT4_SB(sb)->s_fc_replay_state;
1844	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1845		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1846			EXT4_IGET_NORMAL);
1847		if (IS_ERR(inode)) {
1848			jbd_debug(1, "Inode %d not found.",
1849				state->fc_modified_inodes[i]);
1850			continue;
1851		}
1852		cur = 0;
1853		end = EXT_MAX_BLOCKS;
1854		while (cur < end) {
1855			map.m_lblk = cur;
1856			map.m_len = end - cur;
1857
1858			ret = ext4_map_blocks(NULL, inode, &map, 0);
1859			if (ret < 0)
1860				break;
1861
1862			if (ret > 0) {
1863				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1864				if (!IS_ERR(path)) {
1865					for (j = 0; j < path->p_depth; j++)
1866						ext4_mb_mark_bb(inode->i_sb,
1867							path[j].p_block, 1, 1);
1868					ext4_ext_drop_refs(path);
1869					kfree(path);
1870				}
1871				cur += ret;
1872				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1873							map.m_len, 1);
1874			} else {
1875				cur = cur + (map.m_len ? map.m_len : 1);
1876			}
1877		}
1878		iput(inode);
1879	}
1880}
1881
1882/*
1883 * Check if a block is in the excluded regions for block allocation. The
1884 * simple allocator that runs during the replay phase calls this function to
1885 * see if it is okay to use a block.
1886 */
1887bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1888{
1889	int i;
1890	struct ext4_fc_replay_state *state;
1891
1892	state = &EXT4_SB(sb)->s_fc_replay_state;
1893	for (i = 0; i < state->fc_regions_valid; i++) {
1894		if (state->fc_regions[i].ino == 0 ||
1895			state->fc_regions[i].len == 0)
1896			continue;
1897		if (blk >= state->fc_regions[i].pblk &&
1898		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1899			return true;
1900	}
1901	return false;
1902}
1903
1904/* Cleanup function called after replay */
1905void ext4_fc_replay_cleanup(struct super_block *sb)
1906{
1907	struct ext4_sb_info *sbi = EXT4_SB(sb);
1908
1909	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1910	kfree(sbi->s_fc_replay_state.fc_regions);
1911	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1912}
1913
1914/*
1915 * Recovery Scan phase handler
1916 *
1917 * This function is called during the scan phase and is responsible
1918 * for doing the following things:
1919 * - Make sure the fast commit area has valid tags for replay
1920 * - Count number of tags that need to be replayed by the replay handler
1921 * - Verify CRC
1922 * - Create a list of excluded blocks for allocation during replay phase
1923 *
1924 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1925 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1926 * to indicate that scan has finished and JBD2 can now start replay phase.
1927 * It returns a negative error to indicate that there was an error. At the end
1928 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1929 * to indicate the number of tags that need to be replayed during the replay phase.
1930 */
1931static int ext4_fc_replay_scan(journal_t *journal,
1932				struct buffer_head *bh, int off,
1933				tid_t expected_tid)
1934{
1935	struct super_block *sb = journal->j_private;
1936	struct ext4_sb_info *sbi = EXT4_SB(sb);
1937	struct ext4_fc_replay_state *state;
1938	int ret = JBD2_FC_REPLAY_CONTINUE;
1939	struct ext4_fc_add_range ext;
1940	struct ext4_fc_tl tl;
1941	struct ext4_fc_tail tail;
1942	__u8 *start, *end, *cur, *val;
1943	struct ext4_fc_head head;
1944	struct ext4_extent *ex;
1945
1946	state = &sbi->s_fc_replay_state;
1947
1948	start = (u8 *)bh->b_data;
1949	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1950
1951	if (state->fc_replay_expected_off == 0) {
1952		state->fc_cur_tag = 0;
1953		state->fc_replay_num_tags = 0;
1954		state->fc_crc = 0;
1955		state->fc_regions = NULL;
1956		state->fc_regions_valid = state->fc_regions_used =
1957			state->fc_regions_size = 0;
1958		/* Check if we can stop early */
1959		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1960			!= EXT4_FC_TAG_HEAD)
1961			return 0;
1962	}
1963
1964	if (off != state->fc_replay_expected_off) {
1965		ret = -EFSCORRUPTED;
1966		goto out_err;
1967	}
1968
1969	state->fc_replay_expected_off++;
1970	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1971		memcpy(&tl, cur, sizeof(tl));
1972		val = cur + sizeof(tl);
1973		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1974			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1975		switch (le16_to_cpu(tl.fc_tag)) {
1976		case EXT4_FC_TAG_ADD_RANGE:
1977			memcpy(&ext, val, sizeof(ext));
1978			ex = (struct ext4_extent *)&ext.fc_ex;
1979			ret = ext4_fc_record_regions(sb,
1980				le32_to_cpu(ext.fc_ino),
1981				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1982				ext4_ext_get_actual_len(ex));
1983			if (ret < 0)
1984				break;
1985			ret = JBD2_FC_REPLAY_CONTINUE;
1986			fallthrough;
1987		case EXT4_FC_TAG_DEL_RANGE:
1988		case EXT4_FC_TAG_LINK:
1989		case EXT4_FC_TAG_UNLINK:
1990		case EXT4_FC_TAG_CREAT:
1991		case EXT4_FC_TAG_INODE:
1992		case EXT4_FC_TAG_PAD:
1993			state->fc_cur_tag++;
1994			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995					sizeof(tl) + le16_to_cpu(tl.fc_len));
1996			break;
1997		case EXT4_FC_TAG_TAIL:
1998			state->fc_cur_tag++;
1999			memcpy(&tail, val, sizeof(tail));
2000			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2001						sizeof(tl) +
2002						offsetof(struct ext4_fc_tail,
2003						fc_crc));
2004			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2005				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2006				state->fc_replay_num_tags = state->fc_cur_tag;
2007				state->fc_regions_valid =
2008					state->fc_regions_used;
2009			} else {
2010				ret = state->fc_replay_num_tags ?
2011					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2012			}
2013			state->fc_crc = 0;
2014			break;
2015		case EXT4_FC_TAG_HEAD:
2016			memcpy(&head, val, sizeof(head));
2017			if (le32_to_cpu(head.fc_features) &
2018				~EXT4_FC_SUPPORTED_FEATURES) {
2019				ret = -EOPNOTSUPP;
2020				break;
2021			}
2022			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2023				ret = JBD2_FC_REPLAY_STOP;
2024				break;
2025			}
2026			state->fc_cur_tag++;
2027			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2028					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2029			break;
2030		default:
2031			ret = state->fc_replay_num_tags ?
2032				JBD2_FC_REPLAY_STOP : -ECANCELED;
2033		}
2034		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2035			break;
2036	}
2037
2038out_err:
2039	trace_ext4_fc_replay_scan(sb, ret, off);
2040	return ret;
2041}
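
[Editorial note] Both the scan and replay loops walk a journal block as a flat tag-length-value stream: copy a fixed-size header, then skip over the value. A minimal user-space model of that walk; the struct layout is illustrative (the kernel's struct ext4_fc_tl stores both fields little-endian), and the bounds check is an assumption of this sketch, not a claim about this version's behaviour:

	#include <stdint.h>
	#include <string.h>

	struct tl { uint16_t tag; uint16_t len; };	/* models struct ext4_fc_tl */

	/* Visit every TLV in buf[0..size); returns the number of tags seen. */
	static int walk_tlvs(const uint8_t *buf, size_t size)
	{
		const uint8_t *cur = buf, *end = buf + size;
		struct tl tl;
		int ntags = 0;

		while (cur + sizeof(tl) <= end) {
			memcpy(&tl, cur, sizeof(tl));	/* unaligned-safe read */
			/* value bytes are at cur + sizeof(tl), length tl.len */
			ntags++;
			cur += sizeof(tl) + tl.len;
		}
		return ntags;
	}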
2042
2043/*
2044 * Main recovery path entry point.
2045 * The meaning of the return codes is the same as above.
2046 */
2047static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2048				enum passtype pass, int off, tid_t expected_tid)
2049{
2050	struct super_block *sb = journal->j_private;
2051	struct ext4_sb_info *sbi = EXT4_SB(sb);
2052	struct ext4_fc_tl tl;
2053	__u8 *start, *end, *cur, *val;
2054	int ret = JBD2_FC_REPLAY_CONTINUE;
2055	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2056	struct ext4_fc_tail tail;
2057
2058	if (pass == PASS_SCAN) {
2059		state->fc_current_pass = PASS_SCAN;
2060		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2061	}
2062
2063	if (state->fc_current_pass != pass) {
2064		state->fc_current_pass = pass;
2065		sbi->s_mount_state |= EXT4_FC_REPLAY;
2066	}
2067	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2068		jbd_debug(1, "Replay stops\n");
2069		ext4_fc_set_bitmaps_and_counters(sb);
2070		return 0;
2071	}
2072
2073#ifdef CONFIG_EXT4_DEBUG
2074	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2075		pr_warn("Dropping fc block %d because max_replay set\n", off);
2076		return JBD2_FC_REPLAY_STOP;
2077	}
2078#endif
2079
2080	start = (u8 *)bh->b_data;
2081	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2082
2083	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2084		memcpy(&tl, cur, sizeof(tl));
2085		val = cur + sizeof(tl);
2086
2087		if (state->fc_replay_num_tags == 0) {
2088			ret = JBD2_FC_REPLAY_STOP;
2089			ext4_fc_set_bitmaps_and_counters(sb);
2090			break;
2091		}
2092		jbd_debug(3, "Replay phase, tag:%s\n",
2093				tag2str(le16_to_cpu(tl.fc_tag)));
2094		state->fc_replay_num_tags--;
2095		switch (le16_to_cpu(tl.fc_tag)) {
2096		case EXT4_FC_TAG_LINK:
2097			ret = ext4_fc_replay_link(sb, &tl, val);
2098			break;
2099		case EXT4_FC_TAG_UNLINK:
2100			ret = ext4_fc_replay_unlink(sb, &tl, val);
2101			break;
2102		case EXT4_FC_TAG_ADD_RANGE:
2103			ret = ext4_fc_replay_add_range(sb, &tl, val);
2104			break;
2105		case EXT4_FC_TAG_CREAT:
2106			ret = ext4_fc_replay_create(sb, &tl, val);
2107			break;
2108		case EXT4_FC_TAG_DEL_RANGE:
2109			ret = ext4_fc_replay_del_range(sb, &tl, val);
2110			break;
2111		case EXT4_FC_TAG_INODE:
2112			ret = ext4_fc_replay_inode(sb, &tl, val);
2113			break;
2114		case EXT4_FC_TAG_PAD:
2115			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2116					     le16_to_cpu(tl.fc_len), 0);
2117			break;
2118		case EXT4_FC_TAG_TAIL:
2119			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2120					     le16_to_cpu(tl.fc_len), 0);
2121			memcpy(&tail, val, sizeof(tail));
2122			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2123			break;
2124		case EXT4_FC_TAG_HEAD:
2125			break;
2126		default:
2127			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2128					     le16_to_cpu(tl.fc_len), 0);
2129			ret = -ECANCELED;
2130			break;
2131		}
2132		if (ret < 0)
2133			break;
2134		ret = JBD2_FC_REPLAY_CONTINUE;
2135	}
2136	return ret;
2137}
2138
2139void ext4_fc_init(struct super_block *sb, journal_t *journal)
2140{
2141	/*
2142	 * We set the replay callback even if fast commit is disabled because we
2143	 * could still have fast commit blocks that need to be replayed even if
2144	 * fast commit has now been turned off.
2145	 */
2146	journal->j_fc_replay_callback = ext4_fc_replay;
2147	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2148		return;
2149	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2150}
2151
2152static const char *fc_ineligible_reasons[] = {
2153	"Extended attributes changed",
2154	"Cross rename",
2155	"Journal flag changed",
2156	"Insufficient memory",
2157	"Swap boot",
2158	"Resize",
2159	"Dir renamed",
2160	"Falloc range op",
2161	"Data journalling",
2162	"FC Commit Failed"
2163};
2164
2165int ext4_fc_info_show(struct seq_file *seq, void *v)
2166{
2167	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2168	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2169	int i;
2170
2171	if (v != SEQ_START_TOKEN)
2172		return 0;
2173
2174	seq_printf(seq,
2175		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2176		   stats->fc_num_commits, stats->fc_ineligible_commits,
2177		   stats->fc_numblks,
2178		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2179	seq_puts(seq, "Ineligible reasons:\n");
2180	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2181		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2182			stats->fc_ineligible_reason_count[i]);
2183
2184	return 0;
2185}
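
[Editorial note] Reading the format strings above, the resulting seq file output looks roughly like this (all numbers invented for illustration; the reasons list continues through all EXT4_FC_REASON_MAX entries):

	fc stats:
	128 commits
	7 ineligible
	412 numblks
	1530us avg_commit_time
	Ineligible reasons:
	"Extended attributes changed":	3
	"Cross rename":	0
	...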
2186
2187int __init ext4_fc_init_dentry_cache(void)
2188{
2189	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2190					   SLAB_RECLAIM_ACCOUNT);
2191
2192	if (ext4_fc_dentry_cachep == NULL)
2193		return -ENOMEM;
2194
2195	return 0;
2196}
v6.13.7
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
  68 *
  69 * Not all operations are supported by fast commits today (e.g. extended
  70 * attributes). Fast commit ineligibility is marked by calling
  71 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
  72 * back to a full commit.
  73 *
  74 * Atomicity of commits
  75 * --------------------
  76 * In order to guarantee atomicity during the commit operation, fast commit
  77 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  78 * tag contains CRC of the contents and TID of the transaction after which
  79 * this fast commit should be applied. Recovery code replays fast commit
  80 * logs only if there's at least 1 valid tail present. For every fast commit
  81 * operation, there is 1 tail. This means, we may end up with multiple tails
  82 * in the fast commit space. Here's an example:
  83 *
  84 * - Create a new file A and remove existing file B
  85 * - fsync()
  86 * - Append contents to file A
  87 * - Truncate file A
  88 * - fsync()
  89 *
  90 * The fast commit space at the end of above operations would look like this:
  91 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
  92 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
  93 *
  94 * Replay code should thus check for all the valid tails in the FC area.
  95 *
  96 * Fast Commit Replay Idempotence
  97 * ------------------------------
  98 *
  99 * Fast commit tags are idempotent in nature, provided the recovery code follows
 100 * certain rules. The guiding principle that the commit path follows while
 101 * committing is that it stores the result of a particular operation instead of
 102 * storing the procedure.
 103 *
 104 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 105 * was associated with inode 10. During fast commit, instead of storing this
 106 * operation as a procedure "rename a to b", we store the resulting file system
 107 * state as a "series" of outcomes:
 108 *
 109 * - Link dirent b to inode 10
 110 * - Unlink dirent a
 111 * - Inode <10> with valid refcount
 112 *
 113 * Now when the recovery code runs, it needs to "enforce" this state on the file
 114 * system. This is what guarantees idempotence of fast commit replay.
 115 *
 116 * Let's take an example of a procedure that is not idempotent and see how fast
 117 * commits make it idempotent. Consider following sequence of operations:
 118 *
 119 *     rm A;    mv B A;    read A
 120 *  (x)     (y)        (z)
 121 *
 122 * (x), (y) and (z) are the points at which we can crash. If we store this
 123 * sequence of operations as is then the replay is not idempotent. Let's say
 124 * while in replay, we crash at (z). During the second replay, file A (which was
 125 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 126 * file named A would be absent when we try to read A. So, this sequence of
 127 * operations is not idempotent. However, as mentioned above, instead of storing
 128 * the procedure fast commits store the outcome of each procedure. Thus the fast
 129 * commit log for above procedure would be as follows:
 130 *
 131 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 132 * inode 11 before the replay)
 133 *
 134 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 135 * (w)          (x)                    (y)          (z)
 136 *
 137 * If we crash at (z), we will have file A linked to inode 11. During the second
 138 * replay, we will remove file A (inode 11). But we will create it back and make
 139 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 140 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 141 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 142 * similarly. Thus, by converting a non-idempotent procedure into a series of
 143 * idempotent outcomes, fast commits ensure idempotence during the replay.
 144 *
 145 * TODOs
 146 * -----
 147 *
 148 * 0) Fast commit replay path hardening: Fast commit replay code should use
 149 *    journal handles to make sure all the updates it does during the replay
 150 *    path are atomic. With that if we crash during fast commit replay, after
 151 *    trying to do recovery again, we will find a file system where fast commit
 152 *    area is invalid (because new full commit would be found). In order to deal
 153 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 154 *    superblock state is persisted before starting the replay, so that after
 155 *    the crash, fast commit recovery code can look at that flag and perform
 156 *    fast commit recovery even if that area is invalidated by later full
 157 *    commits.
 158 *
 159 * 1) Fast commit's commit path locks the entire file system during fast
 160 *    commit. This has significant performance penalty. Instead of that, we
 161 *    should use ext4_fc_start/stop_update functions to start inode level
 162 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 163 *    system locking during commit path.
 164 *
 165 * 2) Handle more ineligible cases.
 166 */
 167
 168#include <trace/events/ext4.h>
 169static struct kmem_cache *ext4_fc_dentry_cachep;
 170
 171static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 172{
 173	BUFFER_TRACE(bh, "");
 174	if (uptodate) {
 175		ext4_debug("%s: Block %lld up-to-date",
 176			   __func__, bh->b_blocknr);
 177		set_buffer_uptodate(bh);
 178	} else {
 179		ext4_debug("%s: Block %lld not up-to-date",
 180			   __func__, bh->b_blocknr);
 181		clear_buffer_uptodate(bh);
 182	}
 183
 184	unlock_buffer(bh);
 185}
 186
 187static inline void ext4_fc_reset_inode(struct inode *inode)
 188{
 189	struct ext4_inode_info *ei = EXT4_I(inode);
 190
 191	ei->i_fc_lblk_start = 0;
 192	ei->i_fc_lblk_len = 0;
 193}
 194
 195void ext4_fc_init_inode(struct inode *inode)
 196{
 197	struct ext4_inode_info *ei = EXT4_I(inode);
 198
 199	ext4_fc_reset_inode(inode);
 200	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 201	INIT_LIST_HEAD(&ei->i_fc_list);
 202	INIT_LIST_HEAD(&ei->i_fc_dilist);
 203	init_waitqueue_head(&ei->i_fc_wait);
 204	atomic_set(&ei->i_fc_updates, 0);
 205}
 206
 207/* This function must be called with sbi->s_fc_lock held. */
 208static void ext4_fc_wait_committing_inode(struct inode *inode)
 209__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 210{
 211	wait_queue_head_t *wq;
 212	struct ext4_inode_info *ei = EXT4_I(inode);
 213
 214#if (BITS_PER_LONG < 64)
 215	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 216			EXT4_STATE_FC_COMMITTING);
 217	wq = bit_waitqueue(&ei->i_state_flags,
 218				EXT4_STATE_FC_COMMITTING);
 219#else
 220	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 221			EXT4_STATE_FC_COMMITTING);
 222	wq = bit_waitqueue(&ei->i_flags,
 223				EXT4_STATE_FC_COMMITTING);
 224#endif
 225	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 226	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 227	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 228	schedule();
 229	finish_wait(wq, &wait.wq_entry);
 230}
 231
 232static bool ext4_fc_disabled(struct super_block *sb)
 233{
 234	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 235		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
 236}
 237
 238/*
 239 * Inform Ext4's fast commit subsystem about the start of an inode update
 240 *
 241 * This function is called by the high level call VFS callbacks before
 242 * performing any inode update. This function blocks if there's an ongoing
 243 * fast commit on the inode in question.
 244 */
 245void ext4_fc_start_update(struct inode *inode)
 246{
 247	struct ext4_inode_info *ei = EXT4_I(inode);
 248
 249	if (ext4_fc_disabled(inode->i_sb))
 250		return;
 251
 252restart:
 253	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 254	if (list_empty(&ei->i_fc_list))
 255		goto out;
 256
 257	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 258		ext4_fc_wait_committing_inode(inode);
 259		goto restart;
 260	}
 261out:
 262	atomic_inc(&ei->i_fc_updates);
 263	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 264}
 265
 266/*
 267 * Stop inode update and wake up waiting fast commits if any.
 268 */
 269void ext4_fc_stop_update(struct inode *inode)
 270{
 271	struct ext4_inode_info *ei = EXT4_I(inode);
 272
 273	if (ext4_fc_disabled(inode->i_sb))
 274		return;
 275
 276	if (atomic_dec_and_test(&ei->i_fc_updates))
 277		wake_up_all(&ei->i_fc_wait);
 278}
 279
 280/*
 281 * Remove inode from fast commit list. If the inode is being committed
 282 * we wait until inode commit is done.
 283 */
 284void ext4_fc_del(struct inode *inode)
 285{
 286	struct ext4_inode_info *ei = EXT4_I(inode);
 287	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 288	struct ext4_fc_dentry_update *fc_dentry;
 289
 290	if (ext4_fc_disabled(inode->i_sb))
 291		return;
 292
 293restart:
 294	spin_lock(&sbi->s_fc_lock);
 295	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
 296		spin_unlock(&sbi->s_fc_lock);
 297		return;
 298	}
 299
 300	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 301		ext4_fc_wait_committing_inode(inode);
 302		goto restart;
 303	}
 304
 305	if (!list_empty(&ei->i_fc_list))
 306		list_del_init(&ei->i_fc_list);
 307
 308	/*
 309	 * Since this inode is getting removed, let's also remove all FC
 310	 * dentry create references, since there is no need to log them anyway.
 311	 */
 312	if (list_empty(&ei->i_fc_dilist)) {
 313		spin_unlock(&sbi->s_fc_lock);
 314		return;
 315	}
 316
 317	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
 318	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 319	list_del_init(&fc_dentry->fcd_list);
 320	list_del_init(&fc_dentry->fcd_dilist);
 321
 322	WARN_ON(!list_empty(&ei->i_fc_dilist));
 323	spin_unlock(&sbi->s_fc_lock);
 324
 325	if (fc_dentry->fcd_name.name &&
 326		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
 327		kfree(fc_dentry->fcd_name.name);
 328	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
 329
 330	return;
 331}
 332
 333/*
 334 * Mark file system as fast commit ineligible, and record latest
 335 * ineligible transaction tid. This means that, up to the recorded
 336 * transaction, any commit operation will result in a full jbd2 commit.
 337 */
 338void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
 339{
 340	struct ext4_sb_info *sbi = EXT4_SB(sb);
 341	tid_t tid;
 342	bool has_transaction = true;
 343	bool is_ineligible;
 344
 345	if (ext4_fc_disabled(sb))
 346		return;
 347
 348	if (handle && !IS_ERR(handle))
 349		tid = handle->h_transaction->t_tid;
 350	else {
 351		read_lock(&sbi->s_journal->j_state_lock);
 352		if (sbi->s_journal->j_running_transaction)
 353			tid = sbi->s_journal->j_running_transaction->t_tid;
 354		else
 355			has_transaction = false;
 356		read_unlock(&sbi->s_journal->j_state_lock);
 357	}
 358	spin_lock(&sbi->s_fc_lock);
 359	is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
 361		sbi->s_fc_ineligible_tid = tid;
 362	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 363	spin_unlock(&sbi->s_fc_lock);
 364	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 365	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 366}
 367
 368/*
 369 * Generic fast commit tracking function. If this is the first time we are
 370 * called after a full commit, we initialize fast commit fields and then call
 371 * __fc_track_fn() with update = 0. If we have already been called after a full
 372 * commit, we pass update = 1. Based on that, the track function can determine
 373 * if it needs to track a field for the first time or if it needs to just
 374 * update the previously tracked value.
 375 *
 376 * If enqueue is set, this function enqueues the inode in fast commit list.
 377 */
 378static int ext4_fc_track_template(
 379	handle_t *handle, struct inode *inode,
 380	int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
 381	void *args, int enqueue)
 382{
 383	bool update = false;
 384	struct ext4_inode_info *ei = EXT4_I(inode);
 385	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 386	tid_t tid = 0;
 387	int ret;
 388
 389	tid = handle->h_transaction->t_tid;
 390	mutex_lock(&ei->i_fc_lock);
 391	if (tid == ei->i_sync_tid) {
 392		update = true;
 393	} else {
 394		ext4_fc_reset_inode(inode);
 395		ei->i_sync_tid = tid;
 396	}
 397	ret = __fc_track_fn(handle, inode, args, update);
 398	mutex_unlock(&ei->i_fc_lock);
 399
 400	if (!enqueue)
 401		return ret;
 402
 403	spin_lock(&sbi->s_fc_lock);
 404	if (list_empty(&EXT4_I(inode)->i_fc_list))
 405		list_add_tail(&EXT4_I(inode)->i_fc_list,
 406				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 407				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
 408				&sbi->s_fc_q[FC_Q_STAGING] :
 409				&sbi->s_fc_q[FC_Q_MAIN]);
 410	spin_unlock(&sbi->s_fc_lock);
 411
 412	return ret;
 413}
 414
 415struct __track_dentry_update_args {
 416	struct dentry *dentry;
 417	int op;
 418};
 419
 420/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 421static int __track_dentry_update(handle_t *handle, struct inode *inode,
 422				 void *arg, bool update)
 423{
 424	struct ext4_fc_dentry_update *node;
 425	struct ext4_inode_info *ei = EXT4_I(inode);
 426	struct __track_dentry_update_args *dentry_update =
 427		(struct __track_dentry_update_args *)arg;
 428	struct dentry *dentry = dentry_update->dentry;
 429	struct inode *dir = dentry->d_parent->d_inode;
 430	struct super_block *sb = inode->i_sb;
 431	struct ext4_sb_info *sbi = EXT4_SB(sb);
 432
 433	mutex_unlock(&ei->i_fc_lock);
 434
 435	if (IS_ENCRYPTED(dir)) {
 436		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
 437					handle);
 438		mutex_lock(&ei->i_fc_lock);
 439		return -EOPNOTSUPP;
 440	}
 441
 442	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 443	if (!node) {
 444		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
 445		mutex_lock(&ei->i_fc_lock);
 446		return -ENOMEM;
 447	}
 448
 449	node->fcd_op = dentry_update->op;
 450	node->fcd_parent = dir->i_ino;
 451	node->fcd_ino = inode->i_ino;
 452	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 453		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 454		if (!node->fcd_name.name) {
 455			kmem_cache_free(ext4_fc_dentry_cachep, node);
 456			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
 457			mutex_lock(&ei->i_fc_lock);
 458			return -ENOMEM;
 459		}
 460		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 461			dentry->d_name.len);
 462	} else {
 463		memcpy(node->fcd_iname, dentry->d_name.name,
 464			dentry->d_name.len);
 465		node->fcd_name.name = node->fcd_iname;
 466	}
 467	node->fcd_name.len = dentry->d_name.len;
 468	INIT_LIST_HEAD(&node->fcd_dilist);
 469	spin_lock(&sbi->s_fc_lock);
 470	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
 471		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
 472		list_add_tail(&node->fcd_list,
 473				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 474	else
 475		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 476
 477	/*
 478	 * This helps us keep track of all the fc_dentry updates that are part of
 479	 * this ext4 inode. So in case the inode is getting unlinked, before we
 480	 * even get a chance to fsync, we can remove all fc_dentry
 481	 * references while evicting the inode in ext4_fc_del().
 482	 * Also with this, we don't need to loop over all the inodes in
 483	 * sbi->s_fc_q to get the corresponding inode in
 484	 * ext4_fc_commit_dentry_updates().
 485	 */
 486	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
 487		WARN_ON(!list_empty(&ei->i_fc_dilist));
 488		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
 489	}
 490	spin_unlock(&sbi->s_fc_lock);
 491	mutex_lock(&ei->i_fc_lock);
 492
 493	return 0;
 494}
 495
 496void __ext4_fc_track_unlink(handle_t *handle,
 497		struct inode *inode, struct dentry *dentry)
 498{
 499	struct __track_dentry_update_args args;
 500	int ret;
 501
 502	args.dentry = dentry;
 503	args.op = EXT4_FC_TAG_UNLINK;
 504
 505	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 506					(void *)&args, 0);
 507	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
 508}
 509
 510void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 511{
 512	struct inode *inode = d_inode(dentry);
 513
 514	if (ext4_fc_disabled(inode->i_sb))
 515		return;
 516
 517	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 518		return;
 519
 520	__ext4_fc_track_unlink(handle, inode, dentry);
 521}
 522
 523void __ext4_fc_track_link(handle_t *handle,
 524	struct inode *inode, struct dentry *dentry)
 525{
 526	struct __track_dentry_update_args args;
 527	int ret;
 528
 529	args.dentry = dentry;
 530	args.op = EXT4_FC_TAG_LINK;
 531
 532	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 533					(void *)&args, 0);
 534	trace_ext4_fc_track_link(handle, inode, dentry, ret);
 535}
 536
 537void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 538{
 539	struct inode *inode = d_inode(dentry);
 540
 541	if (ext4_fc_disabled(inode->i_sb))
 542		return;
 543
 544	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 545		return;
 546
 547	__ext4_fc_track_link(handle, inode, dentry);
 548}
 549
 550void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 551			  struct dentry *dentry)
 552{
 553	struct __track_dentry_update_args args;
 554	int ret;
 555
 556	args.dentry = dentry;
 557	args.op = EXT4_FC_TAG_CREAT;
 558
 559	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 560					(void *)&args, 0);
 561	trace_ext4_fc_track_create(handle, inode, dentry, ret);
 562}
 563
 564void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 565{
 566	struct inode *inode = d_inode(dentry);
 567
 568	if (ext4_fc_disabled(inode->i_sb))
 569		return;
 570
 571	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 572		return;
 573
 574	__ext4_fc_track_create(handle, inode, dentry);
 575}
 576
 577/* __track_fn for inode tracking */
 578static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 579			 bool update)
 580{
 581	if (update)
 582		return -EEXIST;
 583
 584	EXT4_I(inode)->i_fc_lblk_len = 0;
 585
 586	return 0;
 587}
 588
 589void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 590{
 591	int ret;
 592
 593	if (S_ISDIR(inode->i_mode))
 594		return;
 595
 596	if (ext4_fc_disabled(inode->i_sb))
 597		return;
 598
 599	if (ext4_should_journal_data(inode)) {
 600		ext4_fc_mark_ineligible(inode->i_sb,
 601					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
 602		return;
 603	}
 604
 605	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 606		return;
 607
 608	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 609	trace_ext4_fc_track_inode(handle, inode, ret);
 610}
 611
 612struct __track_range_args {
 613	ext4_lblk_t start, end;
 614};
 615
 616/* __track_fn for tracking data updates */
 617static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 618			 bool update)
 619{
 620	struct ext4_inode_info *ei = EXT4_I(inode);
 621	ext4_lblk_t oldstart;
 622	struct __track_range_args *__arg =
 623		(struct __track_range_args *)arg;
 624
 625	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 626		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 627		return -ECANCELED;
 628	}
 629
 630	oldstart = ei->i_fc_lblk_start;
 631
 632	if (update && ei->i_fc_lblk_len > 0) {
 633		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 634		ei->i_fc_lblk_len =
 635			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 636				ei->i_fc_lblk_start + 1;
 637	} else {
 638		ei->i_fc_lblk_start = __arg->start;
 639		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 640	}
 641
 642	return 0;
 643}
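
[Editorial note] The merge above keeps a single [start, len] window that covers both the previously tracked range and the new one. A self-contained worked example using the same arithmetic as __track_range (names illustrative):

	#include <assert.h>

	static void merge(unsigned *start, unsigned *len, unsigned s, unsigned e)
	{
		unsigned oldstart = *start;

		if (*len > 0) {
			*start = oldstart < s ? oldstart : s;
			*len = (oldstart + *len - 1 > e ? oldstart + *len - 1 : e)
				- *start + 1;
		} else {
			*start = s;
			*len = e - s + 1;
		}
	}

	int main(void)
	{
		unsigned start = 10, len = 5;	/* tracked blocks 10..14 */

		merge(&start, &len, 3, 12);	/* new update touches 3..12 */
		assert(start == 3 && len == 12);	/* now covers 3..14 */
		return 0;
	}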
 644
 645void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 646			 ext4_lblk_t end)
 647{
 648	struct __track_range_args args;
 649	int ret;
 650
 651	if (S_ISDIR(inode->i_mode))
 652		return;
 653
 654	if (ext4_fc_disabled(inode->i_sb))
 655		return;
 656
 657	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
 658		return;
 659
 660	if (ext4_has_inline_data(inode)) {
 661		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
 662					handle);
 663		return;
 664	}
 665
 666	args.start = start;
 667	args.end = end;
 668
 669	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 670
 671	trace_ext4_fc_track_range(handle, inode, start, end, ret);
 672}
 673
 674static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 675{
 676	blk_opf_t write_flags = REQ_SYNC;
 677	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 678
 679	/* Add REQ_FUA | REQ_PREFLUSH only for the tail */
 680	if (test_opt(sb, BARRIER) && is_tail)
 681		write_flags |= REQ_FUA | REQ_PREFLUSH;
 682	lock_buffer(bh);
 683	set_buffer_dirty(bh);
 684	set_buffer_uptodate(bh);
 685	bh->b_end_io = ext4_end_buffer_io_sync;
 686	submit_bh(REQ_OP_WRITE | write_flags, bh);
 687	EXT4_SB(sb)->s_fc_bh = NULL;
 688}
 689
 690/* Ext4 commit path routines */
 691
 692/*
 693 * Allocate len bytes on a fast commit buffer.
 694 *
 695 * At commit time this function is used to manage fast commit
 696 * block space. We never split a single TLV across
 697 * blocks. So this function makes sure that if there's not enough space
 698 * in the current block, the remaining space in the current block is
 699 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 700 * new block is requested from jbd2 and the CRC is updated to reflect
 701 * the padding we added.
 702 */
 703static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 704{
 705	struct ext4_fc_tl tl;
 706	struct ext4_sb_info *sbi = EXT4_SB(sb);
 707	struct buffer_head *bh;
 708	int bsize = sbi->s_journal->j_blocksize;
 709	int ret, off = sbi->s_fc_bytes % bsize;
 710	int remaining;
 711	u8 *dst;
 712
 713	/*
 714	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
 715	 * cannot fulfill the request.
 716	 */
 717	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
 718		return NULL;
 719
 720	if (!sbi->s_fc_bh) {
 721		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 722		if (ret)
 723			return NULL;
 724		sbi->s_fc_bh = bh;
 725	}
 726	dst = sbi->s_fc_bh->b_data + off;
 727
 728	/*
 729	 * Allocate the bytes in the current block if we can do so while still
 730	 * leaving enough space for a PAD tlv.
 731	 */
 732	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
 733	if (len <= remaining) {
 734		sbi->s_fc_bytes += len;
 735		return dst;
 736	}
 737
 738	/*
 739	 * Else, terminate the current block with a PAD tlv, then allocate a new
 740	 * block and allocate the bytes at the start of that new block.
 741	 */
 742
 743	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 744	tl.fc_len = cpu_to_le16(remaining);
 745	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 746	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
 747	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
 748
 749	ext4_fc_submit_bh(sb, false);
 750
 751	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 752	if (ret)
 753		return NULL;
 754	sbi->s_fc_bh = bh;
 755	sbi->s_fc_bytes += bsize - off + len;
 756	return sbi->s_fc_bh->b_data;
 757}
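
[Editorial note] The space math above guarantees a TLV never straddles a block: a request only stays in the current block if a PAD header could still follow it. A small model of that check, assuming EXT4_FC_TAG_BASE_LEN is the 4-byte tag+length header:

	#include <stdbool.h>

	#define TAG_BASE_LEN 4	/* assumed: 16-bit tag + 16-bit length */

	/* Can `len` bytes be placed at offset `off` of a `bsize` block while
	 * still leaving room for a trailing PAD header? */
	static bool fits_in_current_block(int bsize, int off, int len)
	{
		return len <= bsize - TAG_BASE_LEN - off;
	}

With bsize = 4096 and off = 4000, only 92 payload bytes remain, so a 100-byte reservation pads out the current block and starts a new one.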
 758
 759/*
 760 * Complete a fast commit by writing tail tag.
 761 *
 762 * Writing tail tag marks the end of a fast commit. In order to guarantee
 763 * atomicity, after writing the tail tag, even if there's space remaining
 764 * in the block, the next commit shouldn't use it. That's why the tail tag's
 765 * length covers all the remaining space in the block.
 766 */
 767static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 768{
 769	struct ext4_sb_info *sbi = EXT4_SB(sb);
 770	struct ext4_fc_tl tl;
 771	struct ext4_fc_tail tail;
 772	int off, bsize = sbi->s_journal->j_blocksize;
 773	u8 *dst;
 774
 775	/*
 776	 * ext4_fc_reserve_space takes care of allocating an extra block if
 777	 * there's not enough space in this block to accommodate the tail.
 778	 */
 779	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
 780	if (!dst)
 781		return -ENOSPC;
 782
 783	off = sbi->s_fc_bytes % bsize;
 784
 785	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 786	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
 787	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 788
 789	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 790	dst += EXT4_FC_TAG_BASE_LEN;
 791	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 792	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
 793	dst += sizeof(tail.fc_tid);
 794	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
 795			  dst - (u8 *)sbi->s_fc_bh->b_data);
 796	tail.fc_crc = cpu_to_le32(crc);
 797	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
 798	dst += sizeof(tail.fc_crc);
 799	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
 800
 801	ext4_fc_submit_bh(sb, true);
 802
 803	return 0;
 804}
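
[Editorial note] A worked example of the tail length above: after the reservation, off = sbi->s_fc_bytes % bsize points just past the tail TLV, and fc_len = bsize - off + sizeof(struct ext4_fc_tail). The tail's value starts at off - sizeof(tail), and (off - sizeof(tail)) + fc_len = bsize, so a scanner that skips fc_len value bytes lands exactly on the next block boundary; this is what forces the following commit onto a fresh block.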
 805
 806/*
 807 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 808 * Returns false if there's not enough space.
 809 */
 810static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 811			   u32 *crc)
 812{
 813	struct ext4_fc_tl tl;
 814	u8 *dst;
 815
 816	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
 817	if (!dst)
 818		return false;
 819
 820	tl.fc_tag = cpu_to_le16(tag);
 821	tl.fc_len = cpu_to_le16(len);
 822
 823	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 824	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
 825
 826	return true;
 827}
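
[Editorial note] Complementing the scan-side walk shown earlier, a minimal user-space model of the encode side (types illustrative; the kernel stores both header fields little-endian):

	#include <stdint.h>
	#include <string.h>

	struct tl { uint16_t tag; uint16_t len; };	/* models struct ext4_fc_tl */

	/* Append one TLV at dst; caller guarantees sizeof(tl) + len bytes fit. */
	static uint8_t *put_tlv(uint8_t *dst, uint16_t tag, uint16_t len,
				const uint8_t *val)
	{
		struct tl tl = { .tag = tag, .len = len };

		memcpy(dst, &tl, sizeof(tl));
		memcpy(dst + sizeof(tl), val, len);
		return dst + sizeof(tl) + len;
	}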
 828
 829/* Same as above, but adds dentry tlv. */
 830static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 831				   struct ext4_fc_dentry_update *fc_dentry)
 832{
 833	struct ext4_fc_dentry_info fcd;
 834	struct ext4_fc_tl tl;
 835	int dlen = fc_dentry->fcd_name.len;
 836	u8 *dst = ext4_fc_reserve_space(sb,
 837			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
 838
 839	if (!dst)
 840		return false;
 841
 842	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
 843	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
 844	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
 845	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 846	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 847	dst += EXT4_FC_TAG_BASE_LEN;
 848	memcpy(dst, &fcd, sizeof(fcd));
 849	dst += sizeof(fcd);
 850	memcpy(dst, fc_dentry->fcd_name.name, dlen);
 851
 852	return true;
 853}
 854
 855/*
 856 * Writes inode in the fast commit space under TLV with tag @tag.
 857 * Returns 0 on success, error on failure.
 858 */
 859static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 860{
 861	struct ext4_inode_info *ei = EXT4_I(inode);
 862	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 863	int ret;
 864	struct ext4_iloc iloc;
 865	struct ext4_fc_inode fc_inode;
 866	struct ext4_fc_tl tl;
 867	u8 *dst;
 868
 869	ret = ext4_get_inode_loc(inode, &iloc);
 870	if (ret)
 871		return ret;
 872
 873	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 874		inode_len = EXT4_INODE_SIZE(inode->i_sb);
 875	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 876		inode_len += ei->i_extra_isize;
 877
 878	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 879	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 880	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 881
 882	ret = -ECANCELED;
 883	dst = ext4_fc_reserve_space(inode->i_sb,
 884		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
 885	if (!dst)
 886		goto err;
 887
 888	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
 889	dst += EXT4_FC_TAG_BASE_LEN;
 890	memcpy(dst, &fc_inode, sizeof(fc_inode));
 891	dst += sizeof(fc_inode);
 892	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
 893	ret = 0;
 894err:
 895	brelse(iloc.bh);
 896	return ret;
 897}
 898
 899/*
 900 * Writes updated data ranges for the inode in question. Updates CRC.
 901 * Returns 0 on success, error otherwise.
 902 */
 903static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 904{
 905	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 906	struct ext4_inode_info *ei = EXT4_I(inode);
 907	struct ext4_map_blocks map;
 908	struct ext4_fc_add_range fc_ext;
 909	struct ext4_fc_del_range lrange;
 910	struct ext4_extent *ex;
 911	int ret;
 912
 913	mutex_lock(&ei->i_fc_lock);
 914	if (ei->i_fc_lblk_len == 0) {
 915		mutex_unlock(&ei->i_fc_lock);
 916		return 0;
 917	}
 918	old_blk_size = ei->i_fc_lblk_start;
 919	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 920	ei->i_fc_lblk_len = 0;
 921	mutex_unlock(&ei->i_fc_lock);
 922
 923	cur_lblk_off = old_blk_size;
 924	ext4_debug("will try writing %d to %d for inode %ld\n",
 925		   cur_lblk_off, new_blk_size, inode->i_ino);
 926
 927	while (cur_lblk_off <= new_blk_size) {
 928		map.m_lblk = cur_lblk_off;
 929		map.m_len = new_blk_size - cur_lblk_off + 1;
 930		ret = ext4_map_blocks(NULL, inode, &map, 0);
 931		if (ret < 0)
 932			return -ECANCELED;
 933
 934		if (map.m_len == 0) {
 935			cur_lblk_off++;
 936			continue;
 937		}
 938
 939		if (ret == 0) {
 940			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 941			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 942			lrange.fc_len = cpu_to_le32(map.m_len);
 943			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 944					    sizeof(lrange), (u8 *)&lrange, crc))
 945				return -ENOSPC;
 946		} else {
 947			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 948				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 949
 950			/* Limit the number of blocks in one extent */
 951			map.m_len = min(max, map.m_len);
 952
 953			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 954			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 955			ex->ee_block = cpu_to_le32(map.m_lblk);
 956			ex->ee_len = cpu_to_le16(map.m_len);
 957			ext4_ext_store_pblock(ex, map.m_pblk);
 958			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 959				ext4_ext_mark_unwritten(ex);
 960			else
 961				ext4_ext_mark_initialized(ex);
 962			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 963					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 964				return -ENOSPC;
 965		}
 966
 967		cur_lblk_off += map.m_len;
 968	}
 969
 970	return 0;
 971}
 972
 973
 974/* Submit data for all the fast commit inodes */
 975static int ext4_fc_submit_inode_data_all(journal_t *journal)
 976{
 977	struct super_block *sb = journal->j_private;
 978	struct ext4_sb_info *sbi = EXT4_SB(sb);
 979	struct ext4_inode_info *ei;
 980	int ret = 0;
 981
 982	spin_lock(&sbi->s_fc_lock);
 983	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 984		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 985		while (atomic_read(&ei->i_fc_updates)) {
 986			DEFINE_WAIT(wait);
 987
 988			prepare_to_wait(&ei->i_fc_wait, &wait,
 989						TASK_UNINTERRUPTIBLE);
 990			if (atomic_read(&ei->i_fc_updates)) {
 991				spin_unlock(&sbi->s_fc_lock);
 992				schedule();
 993				spin_lock(&sbi->s_fc_lock);
 994			}
 995			finish_wait(&ei->i_fc_wait, &wait);
 996		}
 997		spin_unlock(&sbi->s_fc_lock);
 998		ret = jbd2_submit_inode_data(journal, ei->jinode);
 999		if (ret)
1000			return ret;
1001		spin_lock(&sbi->s_fc_lock);
1002	}
1003	spin_unlock(&sbi->s_fc_lock);
1004
1005	return ret;
1006}
1007
1008/* Wait for completion of data for all the fast commit inodes */
1009static int ext4_fc_wait_inode_data_all(journal_t *journal)
1010{
1011	struct super_block *sb = journal->j_private;
1012	struct ext4_sb_info *sbi = EXT4_SB(sb);
1013	struct ext4_inode_info *pos, *n;
1014	int ret = 0;
1015
1016	spin_lock(&sbi->s_fc_lock);
1017	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1018		if (!ext4_test_inode_state(&pos->vfs_inode,
1019					   EXT4_STATE_FC_COMMITTING))
1020			continue;
1021		spin_unlock(&sbi->s_fc_lock);
1022
1023		ret = jbd2_wait_inode_data(journal, pos->jinode);
1024		if (ret)
1025			return ret;
1026		spin_lock(&sbi->s_fc_lock);
1027	}
1028	spin_unlock(&sbi->s_fc_lock);
1029
1030	return 0;
1031}
1032
1033/* Commit all the directory entry updates */
1034static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1035__acquires(&sbi->s_fc_lock)
1036__releases(&sbi->s_fc_lock)
1037{
1038	struct super_block *sb = journal->j_private;
1039	struct ext4_sb_info *sbi = EXT4_SB(sb);
1040	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1041	struct inode *inode;
1042	struct ext4_inode_info *ei;
1043	int ret;
1044
1045	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1046		return 0;
1047	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1048				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1049		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1050			spin_unlock(&sbi->s_fc_lock);
1051			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1052				ret = -ENOSPC;
1053				goto lock_and_exit;
1054			}
1055			spin_lock(&sbi->s_fc_lock);
1056			continue;
1057		}
1058		/*
1059		 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1060		 * corresponding inode pointer
1061		 */
1062		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1063		ei = list_first_entry(&fc_dentry->fcd_dilist,
1064				struct ext4_inode_info, i_fc_dilist);
1065		inode = &ei->vfs_inode;
1066		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1067
1068		spin_unlock(&sbi->s_fc_lock);
1069
1070		/*
1071		 * We first write the inode and then the create dirent. This
1072		 * allows the recovery code to create an unnamed inode first
1073		 * and then link it to a directory entry. This allows us
1074		 * to use namei.c routines almost as is and simplifies
1075		 * the recovery code.
1076		 */
1077		ret = ext4_fc_write_inode(inode, crc);
1078		if (ret)
1079			goto lock_and_exit;
1080
1081		ret = ext4_fc_write_inode_data(inode, crc);
1082		if (ret)
1083			goto lock_and_exit;
1084
1085		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1086			ret = -ENOSPC;
1087			goto lock_and_exit;
1088		}
1089
1090		spin_lock(&sbi->s_fc_lock);
1091	}
1092	return 0;
1093lock_and_exit:
1094	spin_lock(&sbi->s_fc_lock);
1095	return ret;
1096}
1097
1098static int ext4_fc_perform_commit(journal_t *journal)
1099{
1100	struct super_block *sb = journal->j_private;
1101	struct ext4_sb_info *sbi = EXT4_SB(sb);
1102	struct ext4_inode_info *iter;
1103	struct ext4_fc_head head;
1104	struct inode *inode;
1105	struct blk_plug plug;
1106	int ret = 0;
1107	u32 crc = 0;
1108
1109	ret = ext4_fc_submit_inode_data_all(journal);
1110	if (ret)
1111		return ret;
1112
1113	ret = ext4_fc_wait_inode_data_all(journal);
1114	if (ret)
1115		return ret;
1116
1117	/*
1118	 * If file system device is different from journal device, issue a cache
1119	 * flush before we start writing fast commit blocks.
1120	 */
1121	if (journal->j_fs_dev != journal->j_dev)
1122		blkdev_issue_flush(journal->j_fs_dev);
1123
1124	blk_start_plug(&plug);
1125	if (sbi->s_fc_bytes == 0) {
1126		/*
1127		 * Add a head tag only if this is the first fast commit
1128		 * in this TID.
1129		 */
1130		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1131		head.fc_tid = cpu_to_le32(
1132			sbi->s_journal->j_running_transaction->t_tid);
1133		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1134			(u8 *)&head, &crc)) {
1135			ret = -ENOSPC;
1136			goto out;
1137		}
1138	}
1139
1140	spin_lock(&sbi->s_fc_lock);
1141	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1142	if (ret) {
1143		spin_unlock(&sbi->s_fc_lock);
1144		goto out;
1145	}
1146
1147	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1148		inode = &iter->vfs_inode;
1149		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1150			continue;
1151
1152		spin_unlock(&sbi->s_fc_lock);
1153		ret = ext4_fc_write_inode_data(inode, &crc);
1154		if (ret)
1155			goto out;
1156		ret = ext4_fc_write_inode(inode, &crc);
1157		if (ret)
1158			goto out;
1159		spin_lock(&sbi->s_fc_lock);
1160	}
1161	spin_unlock(&sbi->s_fc_lock);
1162
1163	ret = ext4_fc_write_tail(sb, crc);
1164
1165out:
1166	blk_finish_plug(&plug);
1167	return ret;
1168}
1169
1170static void ext4_fc_update_stats(struct super_block *sb, int status,
1171				 u64 commit_time, int nblks, tid_t commit_tid)
1172{
1173	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1174
1175	ext4_debug("Fast commit ended with status = %d for tid %u",
1176			status, commit_tid);
1177	if (status == EXT4_FC_STATUS_OK) {
1178		stats->fc_num_commits++;
1179		stats->fc_numblks += nblks;
1180		if (likely(stats->s_fc_avg_commit_time))
1181			stats->s_fc_avg_commit_time =
1182				(commit_time +
1183				 stats->s_fc_avg_commit_time * 3) / 4;
1184		else
1185			stats->s_fc_avg_commit_time = commit_time;
1186	} else if (status == EXT4_FC_STATUS_FAILED ||
1187		   status == EXT4_FC_STATUS_INELIGIBLE) {
1188		if (status == EXT4_FC_STATUS_FAILED)
1189			stats->fc_failed_commits++;
1190		stats->fc_ineligible_commits++;
1191	} else {
1192		stats->fc_skipped_commits++;
1193	}
1194	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1195}
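
[Editorial note] The update avg' = (commit_time + 3 * avg) / 4 above is an exponentially weighted moving average with weight 1/4 on the new sample. Worked example: with avg = 1000us and a new commit_time of 5000us, avg' = (5000 + 3000) / 4 = 2000us, so a single slow commit moves the average only a quarter of the way toward the outlier.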
1196
1197/*
1198 * The main commit entry point. Performs a fast commit for transaction
1199 * commit_tid if needed. If it's not possible to perform a fast commit
1200 * due to various reasons, we fall back to full commit. Returns 0
1201 * on success, error otherwise.
1202 */
1203int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1204{
1205	struct super_block *sb = journal->j_private;
1206	struct ext4_sb_info *sbi = EXT4_SB(sb);
1207	int nblks = 0, ret, bsize = journal->j_blocksize;
1208	int subtid = atomic_read(&sbi->s_fc_subtid);
1209	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1210	ktime_t start_time, commit_time;
1211
1212	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1213		return jbd2_complete_transaction(journal, commit_tid);
1214
1215	trace_ext4_fc_commit_start(sb, commit_tid);
1216
1217	start_time = ktime_get();
1218
1219restart_fc:
1220	ret = jbd2_fc_begin_commit(journal, commit_tid);
1221	if (ret == -EALREADY) {
1222		/* There was an ongoing commit, check if we need to restart */
1223		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1224		    tid_gt(commit_tid, journal->j_commit_sequence))
1225			goto restart_fc;
1226		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1227				commit_tid);
1228		return 0;
1229	} else if (ret) {
1230		/*
1231		 * Commit couldn't start. Just update stats and perform a
1232		 * full commit.
1233		 */
1234		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1235				commit_tid);
1236		return jbd2_complete_transaction(journal, commit_tid);
1237	}
1238
1239	/*
1240	 * After establishing the journal barrier via jbd2_fc_begin_commit(),
1241	 * check whether we are fast commit ineligible.
1242	 */
1243	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1244		status = EXT4_FC_STATUS_INELIGIBLE;
1245		goto fallback;
1246	}
1247
1248	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1249	ret = ext4_fc_perform_commit(journal);
1250	if (ret < 0) {
1251		status = EXT4_FC_STATUS_FAILED;
1252		goto fallback;
1253	}
1254	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1255	ret = jbd2_fc_wait_bufs(journal, nblks);
1256	if (ret < 0) {
1257		status = EXT4_FC_STATUS_FAILED;
1258		goto fallback;
1259	}
1260	atomic_inc(&sbi->s_fc_subtid);
1261	ret = jbd2_fc_end_commit(journal);
1262	/*
1263	 * Weight the running average higher than the new commit time so we
1264	 * don't react too strongly to vast changes in the commit time.
1265	 */
1266	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1267	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1268	return ret;
1269
1270fallback:
1271	ret = jbd2_fc_end_commit_fallback(journal);
1272	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1273	return ret;
1274}
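
/*
 * Editor's note: tid_gt()/tid_geq() used above are jbd2's wraparound-
 * safe transaction ID comparisons. A standalone sketch of the idiom;
 * seq_gt() is a hypothetical name:
 */
#include <stdint.h>

static int seq_gt(uint32_t x, uint32_t y)
{
	/*
	 * Interpreting the unsigned difference as signed is correct as
	 * long as the two sequence numbers are less than 2^31 apart.
	 */
	return (int32_t)(x - y) > 0;
}
/* seq_gt(1, 0xffffffffu) == 1: tid 1 is "after" a tid that wrapped. */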
1275
1276/*
1277 * Fast commit cleanup routine. This is called after every fast commit and
1278 * full commit. full is true if we are called after a full commit.
1279 */
1280static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1281{
1282	struct super_block *sb = journal->j_private;
1283	struct ext4_sb_info *sbi = EXT4_SB(sb);
1284	struct ext4_inode_info *iter, *iter_n;
1285	struct ext4_fc_dentry_update *fc_dentry;
1286
1287	if (full && sbi->s_fc_bh)
1288		sbi->s_fc_bh = NULL;
1289
1290	trace_ext4_fc_cleanup(journal, full, tid);
1291	jbd2_fc_release_bufs(journal);
1292
1293	spin_lock(&sbi->s_fc_lock);
1294	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1295				 i_fc_list) {
1296		list_del_init(&iter->i_fc_list);
1297		ext4_clear_inode_state(&iter->vfs_inode,
1298				       EXT4_STATE_FC_COMMITTING);
1299		if (tid_geq(tid, iter->i_sync_tid)) {
1300			ext4_fc_reset_inode(&iter->vfs_inode);
1301		} else if (full) {
1302			/*
1303			 * We are called after a full commit, and the inode has
1304			 * been modified while the commit was running. Re-enqueue
1305			 * the inode into STAGING, which will then be spliced
1306			 * back into MAIN. This cannot happen during a fast
1307			 * commit because the journal is locked the whole time
1308			 * in that case (and the tid doesn't increase, so the
1309			 * tid check above isn't reliable).
1310			 */
1311			list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
1312				      &sbi->s_fc_q[FC_Q_STAGING]);
1313		}
1314		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1315		smp_mb();
1316#if (BITS_PER_LONG < 64)
1317		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1318#else
1319		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1320#endif
1321	}
1322
1323	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1324		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1325					     struct ext4_fc_dentry_update,
1326					     fcd_list);
1327		list_del_init(&fc_dentry->fcd_list);
1328		list_del_init(&fc_dentry->fcd_dilist);
1329		spin_unlock(&sbi->s_fc_lock);
1330
1331		if (fc_dentry->fcd_name.name &&
1332			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1333			kfree(fc_dentry->fcd_name.name);
1334		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1335		spin_lock(&sbi->s_fc_lock);
1336	}
1337
1338	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1339				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1340	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1341				&sbi->s_fc_q[FC_Q_MAIN]);
1342
1343	if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
1344		sbi->s_fc_ineligible_tid = 0;
1345		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1346	}
1347
1348	if (full)
1349		sbi->s_fc_bytes = 0;
1350	spin_unlock(&sbi->s_fc_lock);
1351	trace_ext4_fc_stats(sb);
1352}
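
/*
 * Editor's note: the STAGING -> MAIN handoff above relies on
 * list_splice_init() moving every entry in O(1) and leaving the source
 * list empty for the next transaction. A standalone sketch of that
 * operation on a minimal circular doubly-linked list; struct node and
 * splice_init() are hypothetical stand-ins for struct list_head:
 */
struct node {
	struct node *prev, *next;
};

static void list_init(struct node *h)
{
	h->prev = h->next = h;
}

/* Move all of src's entries to the head of dst; src ends up empty. */
static void splice_init(struct node *src, struct node *dst)
{
	if (src->next == src)
		return;			/* source is already empty */
	src->next->prev = dst;		/* stitch source run after dst */
	src->prev->next = dst->next;
	dst->next->prev = src->prev;
	dst->next = src->next;
	list_init(src);			/* source is empty again */
}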
1353
1354/* Ext4 Replay Path Routines */
1355
1356/* Helper struct for dentry replay routines */
1357struct dentry_info_args {
1358	int parent_ino, dname_len, ino, inode_len;
1359	char *dname;
1360};
1361
1362/* Same as struct ext4_fc_tl, but uses native endianness fields */
1363struct ext4_fc_tl_mem {
1364	u16 fc_tag;
1365	u16 fc_len;
1366};
1367
1368static inline void tl_to_darg(struct dentry_info_args *darg,
1369			      struct ext4_fc_tl_mem *tl, u8 *val)
1370{
1371	struct ext4_fc_dentry_info fcd;
1372
1373	memcpy(&fcd, val, sizeof(fcd));
1374
1375	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1376	darg->ino = le32_to_cpu(fcd.fc_ino);
1377	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1378	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1379}
1380
1381static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1382{
1383	struct ext4_fc_tl tl_disk;
1384
1385	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1386	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1387	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1388}
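
/*
 * Editor's note: the on-disk TLV header is little-endian and may be
 * unaligned, which is why ext4_fc_get_tl() memcpy()s it out before
 * byte-swapping. A standalone sketch of an endian- and alignment-safe
 * 16-bit decode; fc_get16_le() is a hypothetical helper:
 */
#include <stdint.h>

static uint16_t fc_get16_le(const uint8_t *p)
{
	/* Assemble from individual bytes: safe on any host endianness. */
	return (uint16_t)((uint16_t)p[0] | ((uint16_t)p[1] << 8));
}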
1389
1390/* Unlink replay function */
1391static int ext4_fc_replay_unlink(struct super_block *sb,
1392				 struct ext4_fc_tl_mem *tl, u8 *val)
1393{
1394	struct inode *inode, *old_parent;
1395	struct qstr entry;
1396	struct dentry_info_args darg;
1397	int ret = 0;
1398
1399	tl_to_darg(&darg, tl, val);
1400
1401	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1402			darg.parent_ino, darg.dname_len);
1403
1404	entry.name = darg.dname;
1405	entry.len = darg.dname_len;
1406	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1407
1408	if (IS_ERR(inode)) {
1409		ext4_debug("Inode %d not found", darg.ino);
1410		return 0;
1411	}
1412
1413	old_parent = ext4_iget(sb, darg.parent_ino,
1414				EXT4_IGET_NORMAL);
1415	if (IS_ERR(old_parent)) {
1416		ext4_debug("Dir with inode %d not found", darg.parent_ino);
1417		iput(inode);
1418		return 0;
1419	}
1420
1421	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1422	/* -ENOENT is ok because the entry might not exist anymore. */
1423	if (ret == -ENOENT)
1424		ret = 0;
1425	iput(old_parent);
1426	iput(inode);
1427	return ret;
1428}
1429
1430static int ext4_fc_replay_link_internal(struct super_block *sb,
1431				struct dentry_info_args *darg,
1432				struct inode *inode)
1433{
1434	struct inode *dir = NULL;
1435	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1436	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1437	int ret = 0;
1438
1439	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1440	if (IS_ERR(dir)) {
1441		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1442		dir = NULL;
1443		goto out;
1444	}
1445
1446	dentry_dir = d_obtain_alias(dir);
1447	if (IS_ERR(dentry_dir)) {
1448		ext4_debug("Failed to obtain dentry");
1449		dentry_dir = NULL;
1450		goto out;
1451	}
1452
1453	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1454	if (!dentry_inode) {
1455		ext4_debug("Inode dentry not created.");
1456		ret = -ENOMEM;
1457		goto out;
1458	}
1459
1460	ret = __ext4_link(dir, inode, dentry_inode);
1461	/*
1462	 * It's possible that the link already existed since the data blocks
1463	 * for the dir in question were persisted before we crashed, OR we
1464	 * replayed this tag and crashed before the entire replay could
1465	 * complete.
1466	 */
1467	if (ret && ret != -EEXIST) {
1468		ext4_debug("Failed to link\n");
1469		goto out;
1470	}
1471
1472	ret = 0;
1473out:
1474	if (dentry_dir) {
1475		d_drop(dentry_dir);
1476		dput(dentry_dir);
1477	} else if (dir) {
1478		iput(dir);
1479	}
1480	if (dentry_inode) {
1481		d_drop(dentry_inode);
1482		dput(dentry_inode);
1483	}
1484
1485	return ret;
1486}
1487
1488/* Link replay function */
1489static int ext4_fc_replay_link(struct super_block *sb,
1490			       struct ext4_fc_tl_mem *tl, u8 *val)
1491{
1492	struct inode *inode;
1493	struct dentry_info_args darg;
1494	int ret = 0;
1495
1496	tl_to_darg(&darg, tl, val);
1497	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1498			darg.parent_ino, darg.dname_len);
1499
1500	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1501	if (IS_ERR(inode)) {
1502		ext4_debug("Inode not found.");
1503		return 0;
1504	}
1505
1506	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1507	iput(inode);
1508	return ret;
1509}
1510
1511/*
1512 * Record all the modified inodes during replay. We use this later to set
1513 * up the block bitmaps correctly.
1514 */
1515static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1516{
1517	struct ext4_fc_replay_state *state;
1518	int i;
1519
1520	state = &EXT4_SB(sb)->s_fc_replay_state;
1521	for (i = 0; i < state->fc_modified_inodes_used; i++)
1522		if (state->fc_modified_inodes[i] == ino)
1523			return 0;
1524	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1525		int *fc_modified_inodes;
1526
1527		fc_modified_inodes = krealloc(state->fc_modified_inodes,
1528				sizeof(int) * (state->fc_modified_inodes_size +
1529				EXT4_FC_REPLAY_REALLOC_INCREMENT),
1530				GFP_KERNEL);
1531		if (!fc_modified_inodes)
1532			return -ENOMEM;
1533		state->fc_modified_inodes = fc_modified_inodes;
1534		state->fc_modified_inodes_size +=
1535			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1536	}
1537	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1538	return 0;
1539}
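
/*
 * Editor's note: ext4_fc_record_modified_inode() above is a
 * "dedup, then append, growing by a fixed increment" pattern. A
 * standalone userspace sketch; record_once() and GROW are hypothetical
 * (GROW stands in for EXT4_FC_REPLAY_REALLOC_INCREMENT):
 */
#include <stdlib.h>

#define GROW 16

static int record_once(int **arr, int *used, int *size, int val)
{
	for (int i = 0; i < *used; i++)
		if ((*arr)[i] == val)
			return 0;	/* already recorded */
	if (*used == *size) {
		int *p = realloc(*arr, sizeof(int) * (*size + GROW));

		if (!p)
			return -1;	/* old array stays valid, as with krealloc */
		*arr = p;
		*size += GROW;
	}
	(*arr)[(*used)++] = val;
	return 0;
}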
1540
1541/*
1542 * Inode replay function
1543 */
1544static int ext4_fc_replay_inode(struct super_block *sb,
1545				struct ext4_fc_tl_mem *tl, u8 *val)
1546{
1547	struct ext4_fc_inode fc_inode;
1548	struct ext4_inode *raw_inode;
1549	struct ext4_inode *raw_fc_inode;
1550	struct inode *inode = NULL;
1551	struct ext4_iloc iloc;
1552	int inode_len, ino, ret, tag = tl->fc_tag;
1553	struct ext4_extent_header *eh;
1554	size_t off_gen = offsetof(struct ext4_inode, i_generation);
1555
1556	memcpy(&fc_inode, val, sizeof(fc_inode));
1557
1558	ino = le32_to_cpu(fc_inode.fc_ino);
1559	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1560
1561	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1562	if (!IS_ERR(inode)) {
1563		ext4_ext_clear_bb(inode);
1564		iput(inode);
1565	}
1566	inode = NULL;
1567
1568	ret = ext4_fc_record_modified_inode(sb, ino);
1569	if (ret)
1570		goto out;
1571
1572	raw_fc_inode = (struct ext4_inode *)
1573		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1574	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1575	if (ret)
1576		goto out;
1577
1578	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1579	raw_inode = ext4_raw_inode(&iloc);
1580
1581	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1582	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1583	       inode_len - off_gen);
1584	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1585		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1586		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1587			memset(eh, 0, sizeof(*eh));
1588			eh->eh_magic = EXT4_EXT_MAGIC;
1589			eh->eh_max = cpu_to_le16(
1590				(sizeof(raw_inode->i_block) -
1591				 sizeof(struct ext4_extent_header))
1592				 / sizeof(struct ext4_extent));
1593		}
1594	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1595		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1596			sizeof(raw_inode->i_block));
1597	}
1598
1599	/* Immediately update the inode on disk. */
1600	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1601	if (ret)
1602		goto out;
1603	ret = sync_dirty_buffer(iloc.bh);
1604	if (ret)
1605		goto out;
1606	ret = ext4_mark_inode_used(sb, ino);
1607	if (ret)
1608		goto out;
1609
1610	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1611	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1612	if (IS_ERR(inode)) {
1613		ext4_debug("Inode not found.");
1614		return -EFSCORRUPTED;
1615	}
1616
1617	/*
1618	 * Our allocator could have made different decisions than before
1619	 * crashing. This should be fixed but until then, we recalculate
1620	 * the number of blocks the inode occupies.
1621	 */
1622	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1623		ext4_ext_replay_set_iblocks(inode);
1624
1625	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1626	ext4_reset_inode_seed(inode);
1627
1628	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1629	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1630	sync_dirty_buffer(iloc.bh);
1631	brelse(iloc.bh);
1632out:
1633	iput(inode);
1634	if (!ret)
1635		blkdev_issue_flush(sb->s_bdev);
1636
1637	return 0;
1638}
1639
1640/*
1641 * Dentry create replay function.
1642 *
1643 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
1644 * inode for which we are trying to create a dentry here should already have
1645 * been replayed before we get here.
1646 */
1647static int ext4_fc_replay_create(struct super_block *sb,
1648				 struct ext4_fc_tl_mem *tl, u8 *val)
1649{
1650	int ret = 0;
1651	struct inode *inode = NULL;
1652	struct inode *dir = NULL;
1653	struct dentry_info_args darg;
1654
1655	tl_to_darg(&darg, tl, val);
1656
1657	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1658			darg.parent_ino, darg.dname_len);
1659
1660	/* This takes care of updating the group descriptor and other metadata */
1661	ret = ext4_mark_inode_used(sb, darg.ino);
1662	if (ret)
1663		goto out;
1664
1665	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1666	if (IS_ERR(inode)) {
1667		ext4_debug("inode %d not found.", darg.ino);
1668		inode = NULL;
1669		ret = -EINVAL;
1670		goto out;
1671	}
1672
1673	if (S_ISDIR(inode->i_mode)) {
1674		/*
1675		 * If we are creating a directory, we need to make sure that the
1676		 * dot and dot dot dirents are set up properly.
1677		 */
1678		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1679		if (IS_ERR(dir)) {
1680			ext4_debug("Dir %d not found.", darg.parent_ino);
1681			goto out;
1682		}
1683		ret = ext4_init_new_dir(NULL, dir, inode);
1684		iput(dir);
1685		if (ret) {
1686			ret = 0;
1687			goto out;
1688		}
1689	}
1690	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1691	if (ret)
1692		goto out;
1693	set_nlink(inode, 1);
1694	ext4_mark_inode_dirty(NULL, inode);
1695out:
1696	iput(inode);
1697	return ret;
1698}
1699
1700/*
1701 * Record the physical disk regions that are in use per the fast commit
1702 * area and that are used by inodes during the replay phase. Our simple
1703 * replay-phase allocator excludes these regions from allocation.
1704 */
1705int ext4_fc_record_regions(struct super_block *sb, int ino,
1706		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1707{
1708	struct ext4_fc_replay_state *state;
1709	struct ext4_fc_alloc_region *region;
1710
1711	state = &EXT4_SB(sb)->s_fc_replay_state;
1712	/*
1713	 * During the replay phase, fc_regions_valid may not be the same as
1714	 * fc_regions_used; update it when making new additions.
1715	 */
1716	if (replay && state->fc_regions_used != state->fc_regions_valid)
1717		state->fc_regions_used = state->fc_regions_valid;
1718	if (state->fc_regions_used == state->fc_regions_size) {
1719		struct ext4_fc_alloc_region *fc_regions;
1720
1721		fc_regions = krealloc(state->fc_regions,
1722				      sizeof(struct ext4_fc_alloc_region) *
1723				      (state->fc_regions_size +
1724				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
1725				      GFP_KERNEL);
1726		if (!fc_regions)
1727			return -ENOMEM;
1728		state->fc_regions_size +=
1729			EXT4_FC_REPLAY_REALLOC_INCREMENT;
1730		state->fc_regions = fc_regions;
1731	}
1732	region = &state->fc_regions[state->fc_regions_used++];
1733	region->ino = ino;
1734	region->lblk = lblk;
1735	region->pblk = pblk;
1736	region->len = len;
1737
1738	if (replay)
1739		state->fc_regions_valid++;
1740
1741	return 0;
1742}
1743
1744/* Replay add range tag */
1745static int ext4_fc_replay_add_range(struct super_block *sb,
1746				    struct ext4_fc_tl_mem *tl, u8 *val)
1747{
1748	struct ext4_fc_add_range fc_add_ex;
1749	struct ext4_extent newex, *ex;
1750	struct inode *inode;
1751	ext4_lblk_t start, cur;
1752	int remaining, len;
1753	ext4_fsblk_t start_pblk;
1754	struct ext4_map_blocks map;
1755	struct ext4_ext_path *path = NULL;
1756	int ret;
1757
1758	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1759	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1760
1761	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1762		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1763		ext4_ext_get_actual_len(ex));
1764
1765	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1766	if (IS_ERR(inode)) {
1767		ext4_debug("Inode not found.");
1768		return 0;
1769	}
1770
1771	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1772	if (ret)
1773		goto out;
1774
1775	start = le32_to_cpu(ex->ee_block);
1776	start_pblk = ext4_ext_pblock(ex);
1777	len = ext4_ext_get_actual_len(ex);
1778
1779	cur = start;
1780	remaining = len;
1781	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1782		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1783		  inode->i_ino);
1784
1785	while (remaining > 0) {
1786		map.m_lblk = cur;
1787		map.m_len = remaining;
1788		map.m_pblk = 0;
1789		ret = ext4_map_blocks(NULL, inode, &map, 0);
1790
1791		if (ret < 0)
1792			goto out;
1793
1794		if (ret == 0) {
1795			/* Range is not mapped */
1796			path = ext4_find_extent(inode, cur, path, 0);
1797			if (IS_ERR(path))
1798				goto out;
1799			memset(&newex, 0, sizeof(newex));
1800			newex.ee_block = cpu_to_le32(cur);
1801			ext4_ext_store_pblock(
1802				&newex, start_pblk + cur - start);
1803			newex.ee_len = cpu_to_le16(map.m_len);
1804			if (ext4_ext_is_unwritten(ex))
1805				ext4_ext_mark_unwritten(&newex);
1806			down_write(&EXT4_I(inode)->i_data_sem);
1807			path = ext4_ext_insert_extent(NULL, inode,
1808						      path, &newex, 0);
1809			up_write((&EXT4_I(inode)->i_data_sem));
1810			if (IS_ERR(path))
1811				goto out;
1812			goto next;
1813		}
1814
1815		if (start_pblk + cur - start != map.m_pblk) {
1816			/*
1817			 * Logical to physical mapping changed. This can happen
1818			 * if this range was removed and then reallocated to
1819			 * map to new physical blocks during a fast commit.
1820			 */
1821			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1822					ext4_ext_is_unwritten(ex),
1823					start_pblk + cur - start);
1824			if (ret)
1825				goto out;
1826			/*
1827			 * Mark the old blocks as free since they aren't used
1828			 * anymore. We maintain an array of all the modified
1829			 * inodes. In case these blocks are still used at either
1830			 * a different logical range in the same inode or in
1831			 * some different inode, we will mark them as allocated
1832			 * at the end of the FC replay using our array of
1833			 * modified inodes.
1834			 */
1835			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1836			goto next;
1837		}
1838
1839		/* Range is mapped and needs a state change */
1840		ext4_debug("Converting from %ld to %d %lld",
1841				map.m_flags & EXT4_MAP_UNWRITTEN,
1842			ext4_ext_is_unwritten(ex), map.m_pblk);
1843		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1844					ext4_ext_is_unwritten(ex), map.m_pblk);
1845		if (ret)
1846			goto out;
1847		/*
1848		 * We may have split the extent tree while toggling the state.
1849		 * Try to shrink the extent tree now.
1850		 */
1851		ext4_ext_replay_shrink_inode(inode, start + len);
1852next:
1853		cur += map.m_len;
1854		remaining -= map.m_len;
1855	}
1856	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1857					sb->s_blocksize_bits);
1858out:
1859	ext4_free_ext_path(path);
1860	iput(inode);
1861	return 0;
1862}
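
/*
 * Editor's note: the replay above walks a logical range in whatever
 * chunk sizes the block mapper resolves, advancing cur/remaining by
 * map.m_len each round. A standalone skeleton of that iteration shape;
 * map_chunk() is a hypothetical toy stand-in for ext4_map_blocks():
 */
#include <stdint.h>

/* Toy mapper: pretend at most 8 blocks resolve per call. */
static uint32_t map_chunk(uint32_t lblk, uint32_t max_len)
{
	(void)lblk;
	return max_len < 8 ? max_len : 8;
}

static void walk_range(uint32_t start, uint32_t len)
{
	uint32_t cur = start, remaining = len;

	while (remaining > 0) {
		uint32_t n = map_chunk(cur, remaining);

		/* ... act on blocks [cur, cur + n) here ... */
		cur += n;
		remaining -= n;
	}
}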
1863
1864/* Replay DEL_RANGE tag */
1865static int
1866ext4_fc_replay_del_range(struct super_block *sb,
1867			 struct ext4_fc_tl_mem *tl, u8 *val)
1868{
1869	struct inode *inode;
1870	struct ext4_fc_del_range lrange;
1871	struct ext4_map_blocks map;
1872	ext4_lblk_t cur, remaining;
1873	int ret;
1874
1875	memcpy(&lrange, val, sizeof(lrange));
1876	cur = le32_to_cpu(lrange.fc_lblk);
1877	remaining = le32_to_cpu(lrange.fc_len);
1878
1879	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1880		le32_to_cpu(lrange.fc_ino), cur, remaining);
1881
1882	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1883	if (IS_ERR(inode)) {
1884		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1885		return 0;
1886	}
1887
1888	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1889	if (ret)
1890		goto out;
1891
1892	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1893			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1894			le32_to_cpu(lrange.fc_len));
1895	while (remaining > 0) {
1896		map.m_lblk = cur;
1897		map.m_len = remaining;
1898
1899		ret = ext4_map_blocks(NULL, inode, &map, 0);
1900		if (ret < 0)
1901			goto out;
1902		if (ret > 0) {
1903			remaining -= ret;
1904			cur += ret;
1905			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1906		} else {
1907			remaining -= map.m_len;
1908			cur += map.m_len;
1909		}
1910	}
1911
1912	down_write(&EXT4_I(inode)->i_data_sem);
1913	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1914				le32_to_cpu(lrange.fc_lblk) +
1915				le32_to_cpu(lrange.fc_len) - 1);
1916	up_write(&EXT4_I(inode)->i_data_sem);
1917	if (ret)
1918		goto out;
1919	ext4_ext_replay_shrink_inode(inode,
1920		i_size_read(inode) >> sb->s_blocksize_bits);
1921	ext4_mark_inode_dirty(NULL, inode);
1922out:
1923	iput(inode);
1924	return 0;
1925}
1926
1927static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1928{
1929	struct ext4_fc_replay_state *state;
1930	struct inode *inode;
1931	struct ext4_ext_path *path = NULL;
1932	struct ext4_map_blocks map;
1933	int i, ret, j;
1934	ext4_lblk_t cur, end;
1935
1936	state = &EXT4_SB(sb)->s_fc_replay_state;
1937	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1938		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1939			EXT4_IGET_NORMAL);
1940		if (IS_ERR(inode)) {
1941			ext4_debug("Inode %d not found.",
1942				state->fc_modified_inodes[i]);
1943			continue;
1944		}
1945		cur = 0;
1946		end = EXT_MAX_BLOCKS;
1947		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1948			iput(inode);
1949			continue;
1950		}
1951		while (cur < end) {
1952			map.m_lblk = cur;
1953			map.m_len = end - cur;
1954
1955			ret = ext4_map_blocks(NULL, inode, &map, 0);
1956			if (ret < 0)
1957				break;
1958
1959			if (ret > 0) {
1960				path = ext4_find_extent(inode, map.m_lblk, path, 0);
1961				if (!IS_ERR(path)) {
1962					for (j = 0; j < path->p_depth; j++)
1963						ext4_mb_mark_bb(inode->i_sb,
1964							path[j].p_block, 1, true);
1965				} else {
1966					path = NULL;
1967				}
1968				cur += ret;
1969				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1970							map.m_len, true);
1971			} else {
1972				cur = cur + (map.m_len ? map.m_len : 1);
1973			}
1974		}
1975		iput(inode);
1976	}
1977
1978	ext4_free_ext_path(path);
1979}
1980
1981/*
1982 * Check if a block is in the excluded regions for block allocation. The
1983 * simple allocator that runs during the replay phase calls this function
1984 * to see if it is okay to use a block.
1985 */
1986bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1987{
1988	int i;
1989	struct ext4_fc_replay_state *state;
1990
1991	state = &EXT4_SB(sb)->s_fc_replay_state;
1992	for (i = 0; i < state->fc_regions_valid; i++) {
1993		if (state->fc_regions[i].ino == 0 ||
1994			state->fc_regions[i].len == 0)
1995			continue;
1996		if (in_range(blk, state->fc_regions[i].pblk,
1997					state->fc_regions[i].len))
1998			return true;
1999	}
2000	return false;
2001}
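
/*
 * Editor's note: in_range() above is an inclusive-start, length-bounded
 * test. A standalone sketch of the same exclusion check; struct region
 * and blk_is_excluded() are hypothetical stand-ins for
 * struct ext4_fc_alloc_region and the loop above:
 */
#include <stdbool.h>
#include <stdint.h>

struct region {
	uint64_t pblk;
	uint32_t len;
};

static bool blk_is_excluded(const struct region *r, int nr, uint64_t blk)
{
	for (int i = 0; i < nr; i++)
		if (r[i].len && blk >= r[i].pblk &&
		    blk - r[i].pblk < r[i].len)
			return true;
	return false;
}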
2002
2003/* Cleanup function called after replay */
2004void ext4_fc_replay_cleanup(struct super_block *sb)
2005{
2006	struct ext4_sb_info *sbi = EXT4_SB(sb);
2007
2008	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2009	kfree(sbi->s_fc_replay_state.fc_regions);
2010	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2011}
2012
2013static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2014				      int tag, int len)
2015{
2016	switch (tag) {
2017	case EXT4_FC_TAG_ADD_RANGE:
2018		return len == sizeof(struct ext4_fc_add_range);
2019	case EXT4_FC_TAG_DEL_RANGE:
2020		return len == sizeof(struct ext4_fc_del_range);
2021	case EXT4_FC_TAG_CREAT:
2022	case EXT4_FC_TAG_LINK:
2023	case EXT4_FC_TAG_UNLINK:
2024		len -= sizeof(struct ext4_fc_dentry_info);
2025		return len >= 1 && len <= EXT4_NAME_LEN;
2026	case EXT4_FC_TAG_INODE:
2027		len -= sizeof(struct ext4_fc_inode);
2028		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2029			len <= sbi->s_inode_size;
2030	case EXT4_FC_TAG_PAD:
2031		return true; /* padding can have any length */
2032	case EXT4_FC_TAG_TAIL:
2033		return len >= sizeof(struct ext4_fc_tail);
2034	case EXT4_FC_TAG_HEAD:
2035		return len == sizeof(struct ext4_fc_head);
2036	}
2037	return false;
2038}
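
/*
 * Editor's note: the switch above enforces one length rule per tag
 * (exact size, bounded dentry name, bounded inode, and so on). A
 * standalone sketch of the same idea as a bounds table; struct tag_rule
 * and len_ok() are hypothetical, and the real limits come from the
 * ext4_fc_* struct sizes:
 */
#include <stdbool.h>

struct tag_rule {
	int min_len, max_len;	/* inclusive bounds */
};

static bool len_ok(const struct tag_rule *rules, int nr_tags,
		   int tag, int len)
{
	if (tag < 0 || tag >= nr_tags)
		return false;	/* unknown tag */
	return len >= rules[tag].min_len && len <= rules[tag].max_len;
}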
2039
2040/*
2041 * Recovery Scan phase handler
2042 *
2043 * This function is called during the scan phase and is responsible
2044 * for doing the following things:
2045 * - Make sure the fast commit area has valid tags for replay
2046 * - Count number of tags that need to be replayed by the replay handler
2047 * - Verify CRC
2048 * - Create a list of excluded blocks for allocation during replay phase
2049 *
2050 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
2051 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2052 * to indicate that the scan has finished and JBD2 can now start the replay
2053 * phase. It returns a negative error code to indicate an error. At the end of
2054 * a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set to
2055 * the number of tags that need to be replayed during the replay phase.
2056 */
2057static int ext4_fc_replay_scan(journal_t *journal,
2058				struct buffer_head *bh, int off,
2059				tid_t expected_tid)
2060{
2061	struct super_block *sb = journal->j_private;
2062	struct ext4_sb_info *sbi = EXT4_SB(sb);
2063	struct ext4_fc_replay_state *state;
2064	int ret = JBD2_FC_REPLAY_CONTINUE;
2065	struct ext4_fc_add_range ext;
2066	struct ext4_fc_tl_mem tl;
2067	struct ext4_fc_tail tail;
2068	__u8 *start, *end, *cur, *val;
2069	struct ext4_fc_head head;
2070	struct ext4_extent *ex;
2071
2072	state = &sbi->s_fc_replay_state;
2073
2074	start = (u8 *)bh->b_data;
2075	end = start + journal->j_blocksize;
2076
2077	if (state->fc_replay_expected_off == 0) {
2078		state->fc_cur_tag = 0;
2079		state->fc_replay_num_tags = 0;
2080		state->fc_crc = 0;
2081		state->fc_regions = NULL;
2082		state->fc_regions_valid = state->fc_regions_used =
2083			state->fc_regions_size = 0;
2084		/* Check if we can stop early */
2085		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2086			!= EXT4_FC_TAG_HEAD)
2087			return 0;
2088	}
2089
2090	if (off != state->fc_replay_expected_off) {
2091		ret = -EFSCORRUPTED;
2092		goto out_err;
2093	}
2094
2095	state->fc_replay_expected_off++;
2096	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2097	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2098		ext4_fc_get_tl(&tl, cur);
2099		val = cur + EXT4_FC_TAG_BASE_LEN;
2100		if (tl.fc_len > end - val ||
2101		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2102			ret = state->fc_replay_num_tags ?
2103				JBD2_FC_REPLAY_STOP : -ECANCELED;
2104			goto out_err;
2105		}
2106		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2107			   tag2str(tl.fc_tag), bh->b_blocknr);
2108		switch (tl.fc_tag) {
2109		case EXT4_FC_TAG_ADD_RANGE:
2110			memcpy(&ext, val, sizeof(ext));
2111			ex = (struct ext4_extent *)&ext.fc_ex;
2112			ret = ext4_fc_record_regions(sb,
2113				le32_to_cpu(ext.fc_ino),
2114				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2115				ext4_ext_get_actual_len(ex), 0);
2116			if (ret < 0)
2117				break;
2118			ret = JBD2_FC_REPLAY_CONTINUE;
2119			fallthrough;
2120		case EXT4_FC_TAG_DEL_RANGE:
2121		case EXT4_FC_TAG_LINK:
2122		case EXT4_FC_TAG_UNLINK:
2123		case EXT4_FC_TAG_CREAT:
2124		case EXT4_FC_TAG_INODE:
2125		case EXT4_FC_TAG_PAD:
2126			state->fc_cur_tag++;
2127			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2128				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2129			break;
2130		case EXT4_FC_TAG_TAIL:
2131			state->fc_cur_tag++;
2132			memcpy(&tail, val, sizeof(tail));
2133			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2134						EXT4_FC_TAG_BASE_LEN +
2135						offsetof(struct ext4_fc_tail,
2136						fc_crc));
2137			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2138				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2139				state->fc_replay_num_tags = state->fc_cur_tag;
2140				state->fc_regions_valid =
2141					state->fc_regions_used;
2142			} else {
2143				ret = state->fc_replay_num_tags ?
2144					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2145			}
2146			state->fc_crc = 0;
2147			break;
2148		case EXT4_FC_TAG_HEAD:
2149			memcpy(&head, val, sizeof(head));
2150			if (le32_to_cpu(head.fc_features) &
2151				~EXT4_FC_SUPPORTED_FEATURES) {
2152				ret = -EOPNOTSUPP;
2153				break;
2154			}
2155			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2156				ret = JBD2_FC_REPLAY_STOP;
2157				break;
2158			}
2159			state->fc_cur_tag++;
2160			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2161				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2162			break;
2163		default:
2164			ret = state->fc_replay_num_tags ?
2165				JBD2_FC_REPLAY_STOP : -ECANCELED;
2166		}
2167		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2168			break;
2169	}
2170
2171out_err:
2172	trace_ext4_fc_replay_scan(sb, ret, off);
2173	return ret;
2174}
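
/*
 * Editor's note: during the scan above, the running checksum covers
 * each full TLV, except that a tail tag is checksummed only up to (not
 * including) its fc_crc field, so the final value can be stored inside
 * the very record it protects. A standalone sketch of that shape;
 * csum_update() is a toy rotate-xor stand-in, not ext4's crc32c:
 */
#include <stddef.h>
#include <stdint.h>

static uint32_t csum_update(uint32_t c, const uint8_t *p, size_t n)
{
	while (n--)
		c = (c << 1 | c >> 31) ^ *p++;
	return c;
}

/*
 * Usage shape, with TAG_BASE_LEN and a tail layout assumed:
 *
 *	crc = csum_update(crc, cur, TAG_BASE_LEN + len);	// normal tag
 *	crc = csum_update(crc, cur, TAG_BASE_LEN +
 *			  offsetof(struct fc_tail, fc_crc));	// tail tag
 */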
2175
2176/*
2177 * Main recovery path entry point.
2178 * The meaning of the return codes is the same as above.
2179 */
2180static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2181				enum passtype pass, int off, tid_t expected_tid)
2182{
2183	struct super_block *sb = journal->j_private;
2184	struct ext4_sb_info *sbi = EXT4_SB(sb);
2185	struct ext4_fc_tl_mem tl;
2186	__u8 *start, *end, *cur, *val;
2187	int ret = JBD2_FC_REPLAY_CONTINUE;
2188	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2189	struct ext4_fc_tail tail;
2190
2191	if (pass == PASS_SCAN) {
2192		state->fc_current_pass = PASS_SCAN;
2193		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2194	}
2195
2196	if (state->fc_current_pass != pass) {
2197		state->fc_current_pass = pass;
2198		sbi->s_mount_state |= EXT4_FC_REPLAY;
2199	}
2200	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2201		ext4_debug("Replay stops\n");
2202		ext4_fc_set_bitmaps_and_counters(sb);
2203		return 0;
2204	}
2205
2206#ifdef CONFIG_EXT4_DEBUG
2207	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2208		pr_warn("Dropping fc block %d because max_replay set\n", off);
2209		return JBD2_FC_REPLAY_STOP;
2210	}
2211#endif
2212
2213	start = (u8 *)bh->b_data;
2214	end = start + journal->j_blocksize;
2215
2216	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2217	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2218		ext4_fc_get_tl(&tl, cur);
2219		val = cur + EXT4_FC_TAG_BASE_LEN;
2220
2221		if (state->fc_replay_num_tags == 0) {
2222			ret = JBD2_FC_REPLAY_STOP;
2223			ext4_fc_set_bitmaps_and_counters(sb);
2224			break;
2225		}
2226
2227		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2228		state->fc_replay_num_tags--;
2229		switch (tl.fc_tag) {
2230		case EXT4_FC_TAG_LINK:
2231			ret = ext4_fc_replay_link(sb, &tl, val);
2232			break;
2233		case EXT4_FC_TAG_UNLINK:
2234			ret = ext4_fc_replay_unlink(sb, &tl, val);
2235			break;
2236		case EXT4_FC_TAG_ADD_RANGE:
2237			ret = ext4_fc_replay_add_range(sb, &tl, val);
2238			break;
2239		case EXT4_FC_TAG_CREAT:
2240			ret = ext4_fc_replay_create(sb, &tl, val);
2241			break;
2242		case EXT4_FC_TAG_DEL_RANGE:
2243			ret = ext4_fc_replay_del_range(sb, &tl, val);
2244			break;
2245		case EXT4_FC_TAG_INODE:
2246			ret = ext4_fc_replay_inode(sb, &tl, val);
2247			break;
2248		case EXT4_FC_TAG_PAD:
2249			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2250					     tl.fc_len, 0);
2251			break;
2252		case EXT4_FC_TAG_TAIL:
2253			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2254					     0, tl.fc_len, 0);
2255			memcpy(&tail, val, sizeof(tail));
2256			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2257			break;
2258		case EXT4_FC_TAG_HEAD:
2259			break;
2260		default:
2261			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2262			ret = -ECANCELED;
2263			break;
2264		}
2265		if (ret < 0)
2266			break;
2267		ret = JBD2_FC_REPLAY_CONTINUE;
2268	}
2269	return ret;
2270}
2271
2272void ext4_fc_init(struct super_block *sb, journal_t *journal)
2273{
2274	/*
2275	 * We set the replay callback even if fast commit is disabled because we
2276	 * could still have fast commit blocks that need to be replayed, even if
2277	 * fast commit has now been turned off.
2278	 */
2279	journal->j_fc_replay_callback = ext4_fc_replay;
2280	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2281		return;
2282	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2283}
2284
2285static const char * const fc_ineligible_reasons[] = {
2286	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2287	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2288	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2289	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2290	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2291	[EXT4_FC_REASON_RESIZE] = "Resize",
2292	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2293	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2294	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2295	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2296};
2297
2298int ext4_fc_info_show(struct seq_file *seq, void *v)
2299{
2300	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2301	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2302	int i;
2303
2304	if (v != SEQ_START_TOKEN)
2305		return 0;
2306
2307	seq_printf(seq,
2308		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2309		   stats->fc_num_commits, stats->fc_ineligible_commits,
2310		   stats->fc_numblks,
2311		   div_u64(stats->s_fc_avg_commit_time, 1000));
2312	seq_puts(seq, "Ineligible reasons:\n");
2313	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2314		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2315			stats->fc_ineligible_reason_count[i]);
2316
2317	return 0;
2318}
2319
2320int __init ext4_fc_init_dentry_cache(void)
2321{
2322	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2323					   SLAB_RECLAIM_ACCOUNT);
2324
2325	if (ext4_fc_dentry_cachep == NULL)
2326		return -ENOMEM;
2327
2328	return 0;
2329}
2330
2331void ext4_fc_destroy_dentry_cache(void)
2332{
2333	kmem_cache_destroy(ext4_fc_dentry_cachep);
2334}