// SPDX-License-Identifier: GPL-2.0

/*
 * fs/ext4/fast_commit.c
 *
 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
 *
 * Ext4 fast commit routines.
 */
#include "ext4.h"
#include "ext4_jbd2.h"
#include "ext4_extents.h"
#include "mballoc.h"

/*
 * Ext4 Fast Commits
 * -----------------
 *
 * Ext4 fast commits implement fine-grained journalling for Ext4.
 *
 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
 * TLV during the recovery phase. For the scenarios for which we currently
 * don't have replay code, fast commit falls back to full commits.
 * Fast commits record deltas in one of the following three categories.
 *
 * (A) Directory entry updates:
 *
 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
 * - EXT4_FC_TAG_LINK		- records directory entry link
 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
 *
 * (B) File specific data range updates:
 *
 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
 *
 * (C) Inode metadata (mtime / ctime etc):
 *
 * - EXT4_FC_TAG_INODE		- records the inode that should be replayed
 *				  during recovery. Note that the iblocks field
 *				  is not replayed but instead derived during
 *				  replay.
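 *
 * As a rough sketch (struct ext4_fc_tl, referenced above, is the
 * authoritative definition), every log record starts with a little-endian
 * TLV header followed by a tag-specific value:
 *
 *	struct ext4_fc_tl {
 *		__le16 fc_tag;	// one of the EXT4_FC_TAG_* values above
 *		__le16 fc_len;	// length of the value that follows
 *	};
 *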
 * Commit Operation
 * ----------------
 * With fast commits, we maintain all the directory entry operations in the
 * order in which they are issued in an in-memory queue. This queue is flushed
 * to disk during the commit operation. We also maintain a list of inodes
 * that need to be committed during a fast commit in another in-memory queue
 * of inodes. During the commit operation, we commit in the following order:
 *
 * [1] Lock inodes for any further data updates by setting COMMITTING state
 * [2] Submit data buffers of all the inodes
 * [3] Wait for [2] to complete
 * [4] Commit all the directory entry updates in the fast commit space
 * [5] Commit all the changed inode structures
 * [6] Write tail tag (this tag ensures atomicity; please read the following
 *     section for more details)
 * [7] Wait for [4], [5] and [6] to complete.
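 *
 * In this file, that sequence corresponds roughly to what
 * ext4_fc_perform_commit() below does: ext4_fc_submit_inode_data_all() and
 * ext4_fc_wait_inode_data_all() handle [1]-[3],
 * ext4_fc_commit_dentry_updates() handles [4], ext4_fc_write_inode() /
 * ext4_fc_write_inode_data() handle [5], and ext4_fc_write_tail()
 * handles [6].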
 *
 * All the inode updates must call ext4_fc_start_update() before starting an
 * update. If such an ongoing update is present, fast commit waits for it to
 * complete. The completion of such an update is marked by
 * ext4_fc_stop_update().
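 *
 * A typical update path is therefore bracketed like this (a sketch, not a
 * real call site):
 *
 *	ext4_fc_start_update(inode);
 *	// ... modify the inode ...
 *	ext4_fc_stop_update(inode);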
 *
 * Fast Commit Ineligibility
 * -------------------------
 *
 * Not all operations are supported by fast commits today (e.g. extended
 * attributes). Fast commit ineligibility is marked by calling
 * ext4_fc_mark_ineligible(): this makes the next fast commit operation fall
 * back to a full commit.
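 *
 * For example, a code path that fast commit replay cannot handle flags it
 * roughly like this (a sketch; the reason codes are the EXT4_FC_REASON_*
 * values):
 *
 *	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_XATTR, handle);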
 *
 * Atomicity of commits
 * --------------------
 * In order to guarantee atomicity during the commit operation, fast commit
 * uses the "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. The
 * tail tag contains a CRC of the contents and the TID of the transaction
 * after which this fast commit should be applied. Recovery code replays fast
 * commit logs only if there's at least one valid tail present. For every fast
 * commit operation, there is one tail. This means we may end up with multiple
 * tails in the fast commit space. Here's an example:
 *
 * - Create a new file A and remove existing file B
 * - fsync()
 * - Append contents to file A
 * - Truncate file A
 * - fsync()
 *
 * The fast commit space at the end of above operations would look like this:
 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 *
 * Replay code should thus check for all the valid tails in the FC area.
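 *
 * As a sketch of what a tail carries (struct ext4_fc_tail is the
 * authoritative definition; see ext4_fc_write_tail() below):
 *
 *	struct ext4_fc_tail {
 *		__le32 fc_tid;	// TID this fast commit applies after
 *		__le32 fc_crc;	// CRC of the fast commit contents
 *	};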
 *
 * Fast Commit Replay Idempotence
 * ------------------------------
 *
 * Fast commit tags are idempotent in nature provided the recovery code follows
 * certain rules. The guiding principle that the commit path follows while
 * committing is that it stores the result of a particular operation instead of
 * storing the procedure.
 *
 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 * was associated with inode 10. During fast commit, instead of storing this
 * operation as a procedure "rename a to b", we store the resulting file system
 * state as a "series" of outcomes:
 *
 * - Link dirent b to inode 10
 * - Unlink dirent a
 * - Inode <10> with valid refcount
 *
 * Now when recovery code runs, it needs to "enforce" this state on the file
 * system. This is what guarantees idempotence of fast commit replay.
 *
 * Let's take an example of a procedure that is not idempotent and see how fast
 * commits make it idempotent. Consider the following sequence of operations:
 *
 *     rm A;    mv B A;    read A
 *  (x)     (y)        (z)
 *
 * (x), (y) and (z) are the points at which we can crash. If we store this
 * sequence of operations as-is, then the replay is not idempotent. Let's say
 * that while replaying, we crash at (z). During the second replay, file A
 * (which was actually created as a result of the "mv B A" operation) would get
 * deleted. Thus, a file named A would be absent when we try to read A. So,
 * this sequence of operations is not idempotent. However, as mentioned above,
 * instead of storing the procedure fast commits store the outcome of each
 * procedure. Thus the fast commit log for the above procedure would be as
 * follows:
 *
 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 * inode 11 before the replay)
 *
 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 * (w)          (x)                    (y)          (z)
 *
 * If we crash at (z), we will have file A linked to inode 11. During the second
 * replay, we will remove file A (inode 11). But we will create it back and make
 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 * replay of the last inode 11 tag. Crashes at points (w), (x) and (y) get
 * handled similarly. Thus, by converting a non-idempotent procedure into a
 * series of idempotent outcomes, fast commits ensure idempotence during
 * replay.
 144 *
 * TODOs
 * -----
 *
 * 0) Fast commit replay path hardening: Fast commit replay code should use
 *    journal handles to make sure all the updates it does during the replay
 *    path are atomic. With that, if we crash during fast commit replay, after
 *    trying to do recovery again, we will find a file system where the fast
 *    commit area is invalid (because a new full commit would be found). In
 *    order to deal with that, fast commit replay code should ensure that the
 *    "FC_REPLAY" superblock state is persisted before starting the replay, so
 *    that after the crash, fast commit recovery code can look at that flag
 *    and perform fast commit recovery even if that area is invalidated by
 *    later full commits.
 *
 * 1) Fast commit's commit path locks the entire file system during a fast
 *    commit. This has a significant performance penalty. Instead of that, we
 *    should use the ext4_fc_start/stop_update functions to start inode level
 *    updates from ext4_journal_start/stop. Once we do that we can drop file
 *    system locking during the commit path.
 *
 * 2) Handle more ineligible cases.
 */

#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;

static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate) {
		ext4_debug("%s: Block %lld up-to-date",
			   __func__, bh->b_blocknr);
		set_buffer_uptodate(bh);
	} else {
		ext4_debug("%s: Block %lld not up-to-date",
			   __func__, bh->b_blocknr);
		clear_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}

static inline void ext4_fc_reset_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ei->i_fc_lblk_start = 0;
	ei->i_fc_lblk_len = 0;
}

void ext4_fc_init_inode(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	ext4_fc_reset_inode(inode);
	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
	INIT_LIST_HEAD(&ei->i_fc_list);
	INIT_LIST_HEAD(&ei->i_fc_dilist);
	init_waitqueue_head(&ei->i_fc_wait);
	atomic_set(&ei->i_fc_updates, 0);
}

/* This function must be called with sbi->s_fc_lock held. */
static void ext4_fc_wait_committing_inode(struct inode *inode)
__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
{
	wait_queue_head_t *wq;
	struct ext4_inode_info *ei = EXT4_I(inode);

#if (BITS_PER_LONG < 64)
	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_state_flags,
				EXT4_STATE_FC_COMMITTING);
#else
	DEFINE_WAIT_BIT(wait, &ei->i_flags,
			EXT4_STATE_FC_COMMITTING);
	wq = bit_waitqueue(&ei->i_flags,
				EXT4_STATE_FC_COMMITTING);
#endif
	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	schedule();
	finish_wait(wq, &wait.wq_entry);
}

static bool ext4_fc_disabled(struct super_block *sb)
{
	return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
		(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
}

/*
 * Inform Ext4's fast commit subsystem about the start of an inode update.
 *
 * This function is called by high-level VFS callbacks before performing any
 * inode update. This function blocks if there's an ongoing fast commit on
 * the inode in question.
 */
void ext4_fc_start_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list))
		goto out;

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}
out:
	atomic_inc(&ei->i_fc_updates);
	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
}

/*
 * Stop inode update and wake up waiting fast commits if any.
 */
void ext4_fc_stop_update(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (atomic_dec_and_test(&ei->i_fc_updates))
		wake_up_all(&ei->i_fc_wait);
}

/*
 * Remove inode from fast commit list. If the inode is being committed
 * we wait until inode commit is done.
 */
void ext4_fc_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	struct ext4_fc_dentry_update *fc_dentry;

	if (ext4_fc_disabled(inode->i_sb))
		return;

restart:
	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
		return;
	}

	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
		ext4_fc_wait_committing_inode(inode);
		goto restart;
	}

	if (!list_empty(&ei->i_fc_list))
		list_del_init(&ei->i_fc_list);

	/*
	 * Since this inode is getting removed, let's also remove all FC
	 * dentry create references, since there is no need to log them
	 * anyway.
	 */
	if (list_empty(&ei->i_fc_dilist)) {
		spin_unlock(&sbi->s_fc_lock);
		return;
	}

	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
	list_del_init(&fc_dentry->fcd_list);
	list_del_init(&fc_dentry->fcd_dilist);

	WARN_ON(!list_empty(&ei->i_fc_dilist));
	spin_unlock(&sbi->s_fc_lock);

	if (fc_dentry->fcd_name.name &&
		fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
		kfree(fc_dentry->fcd_name.name);
	kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
}

/*
 * Mark the file system as fast commit ineligible, and record the latest
 * ineligible transaction tid. This means that, until the recorded
 * transaction, a commit operation will result in a full jbd2 commit.
 */
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	tid_t tid;

	if (ext4_fc_disabled(sb))
		return;

	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	if (handle && !IS_ERR(handle))
		tid = handle->h_transaction->t_tid;
	else {
		read_lock(&sbi->s_journal->j_state_lock);
		tid = sbi->s_journal->j_running_transaction ?
				sbi->s_journal->j_running_transaction->t_tid : 0;
		read_unlock(&sbi->s_journal->j_state_lock);
	}
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_fc_ineligible_tid < tid)
		sbi->s_fc_ineligible_tid = tid;
	spin_unlock(&sbi->s_fc_lock);
	WARN_ON(reason >= EXT4_FC_REASON_MAX);
	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}

/*
 * Generic fast commit tracking function. If this is the first time we are
 * called after a full commit, we initialize fast commit fields and then call
 * __fc_track_fn() with update = 0. If we have already been called after a full
 * commit, we pass update = 1. Based on that, the track function can determine
 * if it needs to track a field for the first time or if it needs to just
 * update the previously tracked value.
 *
 * If enqueue is set, this function enqueues the inode in the fast commit list.
 */
static int ext4_fc_track_template(
	handle_t *handle, struct inode *inode,
	int (*__fc_track_fn)(struct inode *, void *, bool),
	void *args, int enqueue)
{
	bool update = false;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
	tid_t tid = 0;
	int ret;

	tid = handle->h_transaction->t_tid;
	mutex_lock(&ei->i_fc_lock);
	if (tid == ei->i_sync_tid) {
		update = true;
	} else {
		ext4_fc_reset_inode(inode);
		ei->i_sync_tid = tid;
	}
	ret = __fc_track_fn(inode, args, update);
	mutex_unlock(&ei->i_fc_lock);

	if (!enqueue)
		return ret;

	spin_lock(&sbi->s_fc_lock);
	if (list_empty(&EXT4_I(inode)->i_fc_list))
		list_add_tail(&EXT4_I(inode)->i_fc_list,
				(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
				 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
				&sbi->s_fc_q[FC_Q_STAGING] :
				&sbi->s_fc_q[FC_Q_MAIN]);
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

struct __track_dentry_update_args {
	struct dentry *dentry;
	int op;
};

/* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
static int __track_dentry_update(struct inode *inode, void *arg, bool update)
{
	struct ext4_fc_dentry_update *node;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct __track_dentry_update_args *dentry_update =
		(struct __track_dentry_update_args *)arg;
	struct dentry *dentry = dentry_update->dentry;
	struct inode *dir = dentry->d_parent->d_inode;
	struct super_block *sb = inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	mutex_unlock(&ei->i_fc_lock);

	if (IS_ENCRYPTED(dir)) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
					NULL);
		mutex_lock(&ei->i_fc_lock);
		return -EOPNOTSUPP;
	}

	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
	if (!node) {
		ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
		mutex_lock(&ei->i_fc_lock);
		return -ENOMEM;
	}

	node->fcd_op = dentry_update->op;
	node->fcd_parent = dir->i_ino;
	node->fcd_ino = inode->i_ino;
	if (dentry->d_name.len > DNAME_INLINE_LEN) {
		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
		if (!node->fcd_name.name) {
			kmem_cache_free(ext4_fc_dentry_cachep, node);
			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL);
			mutex_lock(&ei->i_fc_lock);
			return -ENOMEM;
		}
		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
			dentry->d_name.len);
	} else {
		memcpy(node->fcd_iname, dentry->d_name.name,
			dentry->d_name.len);
		node->fcd_name.name = node->fcd_iname;
	}
	node->fcd_name.len = dentry->d_name.len;
	INIT_LIST_HEAD(&node->fcd_dilist);
	spin_lock(&sbi->s_fc_lock);
	if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
		sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
		list_add_tail(&node->fcd_list,
				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
	else
		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);

	/*
	 * This helps us keep track of all the fc_dentry updates that are part
	 * of this ext4 inode. So in case the inode is getting unlinked before
	 * we even get a chance to fsync, we can remove all fc_dentry
	 * references while evicting the inode in ext4_fc_del().
	 * Also with this, we don't need to loop over all the inodes in
	 * sbi->s_fc_q to get the corresponding inode in
	 * ext4_fc_commit_dentry_updates().
	 */
	if (dentry_update->op == EXT4_FC_TAG_CREAT) {
		WARN_ON(!list_empty(&ei->i_fc_dilist));
		list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
	}
	spin_unlock(&sbi->s_fc_lock);
	mutex_lock(&ei->i_fc_lock);

	return 0;
}

void __ext4_fc_track_unlink(handle_t *handle,
		struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_UNLINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
}

void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_unlink(handle, inode, dentry);
}

void __ext4_fc_track_link(handle_t *handle,
	struct inode *inode, struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_LINK;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_link(handle, inode, dentry, ret);
}

void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_link(handle, inode, dentry);
}

void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
			  struct dentry *dentry)
{
	struct __track_dentry_update_args args;
	int ret;

	args.dentry = dentry;
	args.op = EXT4_FC_TAG_CREAT;

	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
					(void *)&args, 0);
	trace_ext4_fc_track_create(handle, inode, dentry, ret);
}

void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	__ext4_fc_track_create(handle, inode, dentry);
}

/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{
	if (update)
		return -EEXIST;

	EXT4_I(inode)->i_fc_lblk_len = 0;

	return 0;
}

void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_should_journal_data(inode)) {
		ext4_fc_mark_ineligible(inode->i_sb,
					EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
		return;
	}

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
	trace_ext4_fc_track_inode(handle, inode, ret);
}

struct __track_range_args {
	ext4_lblk_t start, end;
};

/* __track_fn for tracking data updates */
static int __track_range(struct inode *inode, void *arg, bool update)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	ext4_lblk_t oldstart;
	struct __track_range_args *__arg =
		(struct __track_range_args *)arg;

	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
		return -ECANCELED;
	}

	oldstart = ei->i_fc_lblk_start;

	if (update && ei->i_fc_lblk_len > 0) {
		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
		ei->i_fc_lblk_len =
			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
				ei->i_fc_lblk_start + 1;
	} else {
		ei->i_fc_lblk_start = __arg->start;
		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
	}

	return 0;
}

void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
			 ext4_lblk_t end)
{
	struct __track_range_args args;
	int ret;

	if (S_ISDIR(inode->i_mode))
		return;

	if (ext4_fc_disabled(inode->i_sb))
		return;

	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
		return;

	args.start = start;
	args.end = end;

	ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);

	trace_ext4_fc_track_range(handle, inode, start, end, ret);
}

static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
{
	blk_opf_t write_flags = REQ_SYNC;
	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;

	/* Add REQ_FUA | REQ_PREFLUSH only if it is the tail */
	if (test_opt(sb, BARRIER) && is_tail)
		write_flags |= REQ_FUA | REQ_PREFLUSH;
	lock_buffer(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = ext4_end_buffer_io_sync;
	submit_bh(REQ_OP_WRITE | write_flags, bh);
	EXT4_SB(sb)->s_fc_bh = NULL;
}

/* Ext4 commit path routines */

/*
 * Allocate len bytes on a fast commit buffer.
 *
 * During commit time this function is used to manage fast commit
 * block space. We don't split a fast commit log onto different
 * blocks. So this function makes sure that if there's not enough space
 * on the current block, the remaining space in the current block is
 * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
 * new block is requested from jbd2 and the CRC is updated to reflect
 * the padding we added.
 */
static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
{
	struct ext4_fc_tl tl;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct buffer_head *bh;
	int bsize = sbi->s_journal->j_blocksize;
	int ret, off = sbi->s_fc_bytes % bsize;
	int remaining;
	u8 *dst;

	/*
	 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
	 * cannot fulfill the request.
	 */
	if (len > bsize - EXT4_FC_TAG_BASE_LEN)
		return NULL;

	if (!sbi->s_fc_bh) {
		ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
		if (ret)
			return NULL;
		sbi->s_fc_bh = bh;
	}
	dst = sbi->s_fc_bh->b_data + off;

	/*
	 * Allocate the bytes in the current block if we can do so while still
	 * leaving enough space for a PAD tlv.
	 */
	remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
	if (len <= remaining) {
		sbi->s_fc_bytes += len;
		return dst;
	}

	/*
	 * Else, terminate the current block with a PAD tlv, then allocate a new
	 * block and allocate the bytes at the start of that new block.
	 */

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
	tl.fc_len = cpu_to_le16(remaining);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
	*crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);

	ext4_fc_submit_bh(sb, false);

	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
	if (ret)
		return NULL;
	sbi->s_fc_bh = bh;
	sbi->s_fc_bytes += bsize - off + len;
	return sbi->s_fc_bh->b_data;
}

/*
 * Complete a fast commit by writing the tail tag.
 *
 * Writing the tail tag marks the end of a fast commit. In order to guarantee
 * atomicity, after writing the tail tag, even if there's space remaining
 * in the block, the next commit shouldn't use it. That's why the tail tag's
 * length is set to the size of the remaining space in the block.
 */
static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
{
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_tl tl;
	struct ext4_fc_tail tail;
	int off, bsize = sbi->s_journal->j_blocksize;
	u8 *dst;

	/*
	 * ext4_fc_reserve_space takes care of allocating an extra block if
	 * there's not enough space on this block to accommodate this tail.
	 */
	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
	if (!dst)
		return -ENOSPC;

	off = sbi->s_fc_bytes % bsize;

	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
	tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
	memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
	dst += sizeof(tail.fc_tid);
	crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
			  dst - (u8 *)sbi->s_fc_bh->b_data);
	tail.fc_crc = cpu_to_le32(crc);
	memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
	dst += sizeof(tail.fc_crc);
	memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */

	ext4_fc_submit_bh(sb, true);

	return 0;
}

/*
 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 * Returns false if there's not enough space.
 */
static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
			   u32 *crc)
{
	struct ext4_fc_tl tl;
	u8 *dst;

	dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
	if (!dst)
		return false;

	tl.fc_tag = cpu_to_le16(tag);
	tl.fc_len = cpu_to_le16(len);

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);

	return true;
}

/* Same as above, but adds dentry tlv. */
static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
				   struct ext4_fc_dentry_update *fc_dentry)
{
	struct ext4_fc_dentry_info fcd;
	struct ext4_fc_tl tl;
	int dlen = fc_dentry->fcd_name.len;
	u8 *dst = ext4_fc_reserve_space(sb,
			EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);

	if (!dst)
		return false;

	fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
	fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
	tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fcd, sizeof(fcd));
	dst += sizeof(fcd);
	memcpy(dst, fc_dentry->fcd_name.name, dlen);

	return true;
}

/*
 * Writes the inode in the fast commit space as an EXT4_FC_TAG_INODE TLV.
 * Returns 0 on success, error on failure.
 */
static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
	int ret;
	struct ext4_iloc iloc;
	struct ext4_fc_inode fc_inode;
	struct ext4_fc_tl tl;
	u8 *dst;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		inode_len = EXT4_INODE_SIZE(inode->i_sb);
	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
		inode_len += ei->i_extra_isize;

	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));

	ret = -ECANCELED;
	dst = ext4_fc_reserve_space(inode->i_sb,
		EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
	if (!dst)
		goto err;

	memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
	dst += EXT4_FC_TAG_BASE_LEN;
	memcpy(dst, &fc_inode, sizeof(fc_inode));
	dst += sizeof(fc_inode);
	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
	ret = 0;
err:
	brelse(iloc.bh);
	return ret;
}

/*
 * Writes updated data ranges for the inode in question. Updates CRC.
 * Returns 0 on success, error otherwise.
 */
static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
{
	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_map_blocks map;
	struct ext4_fc_add_range fc_ext;
	struct ext4_fc_del_range lrange;
	struct ext4_extent *ex;
	int ret;

	mutex_lock(&ei->i_fc_lock);
	if (ei->i_fc_lblk_len == 0) {
		mutex_unlock(&ei->i_fc_lock);
		return 0;
	}
	old_blk_size = ei->i_fc_lblk_start;
	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
	ei->i_fc_lblk_len = 0;
	mutex_unlock(&ei->i_fc_lock);

	cur_lblk_off = old_blk_size;
	ext4_debug("will try writing %d to %d for inode %ld\n",
		   cur_lblk_off, new_blk_size, inode->i_ino);

	while (cur_lblk_off <= new_blk_size) {
		map.m_lblk = cur_lblk_off;
		map.m_len = new_blk_size - cur_lblk_off + 1;
		ret = ext4_map_blocks(NULL, inode, &map, 0);
		if (ret < 0)
			return -ECANCELED;

		if (map.m_len == 0) {
			cur_lblk_off++;
			continue;
		}

		if (ret == 0) {
			lrange.fc_ino = cpu_to_le32(inode->i_ino);
			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
			lrange.fc_len = cpu_to_le32(map.m_len);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
					    sizeof(lrange), (u8 *)&lrange, crc))
				return -ENOSPC;
		} else {
			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;

			/* Limit the number of blocks in one extent */
			map.m_len = min(max, map.m_len);

			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
			ex = (struct ext4_extent *)&fc_ext.fc_ex;
			ex->ee_block = cpu_to_le32(map.m_lblk);
			ex->ee_len = cpu_to_le16(map.m_len);
			ext4_ext_store_pblock(ex, map.m_pblk);
			if (map.m_flags & EXT4_MAP_UNWRITTEN)
				ext4_ext_mark_unwritten(ex);
			else
				ext4_ext_mark_initialized(ex);
			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
				return -ENOSPC;
		}

		cur_lblk_off += map.m_len;
	}

	return 0;
}

/* Submit data for all the fast commit inodes */
static int ext4_fc_submit_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *ei;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
		while (atomic_read(&ei->i_fc_updates)) {
			DEFINE_WAIT(wait);

			prepare_to_wait(&ei->i_fc_wait, &wait,
						TASK_UNINTERRUPTIBLE);
			if (atomic_read(&ei->i_fc_updates)) {
				spin_unlock(&sbi->s_fc_lock);
				schedule();
				spin_lock(&sbi->s_fc_lock);
			}
			finish_wait(&ei->i_fc_wait, &wait);
		}
		spin_unlock(&sbi->s_fc_lock);
		ret = jbd2_submit_inode_data(journal, ei->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return ret;
}

/* Wait for completion of data for all the fast commit inodes */
static int ext4_fc_wait_inode_data_all(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *pos, *n;
	int ret = 0;

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		if (!ext4_test_inode_state(&pos->vfs_inode,
					   EXT4_STATE_FC_COMMITTING))
			continue;
		spin_unlock(&sbi->s_fc_lock);

		ret = jbd2_wait_inode_data(journal, pos->jinode);
		if (ret)
			return ret;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	return 0;
}

/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
__acquires(&sbi->s_fc_lock)
__releases(&sbi->s_fc_lock)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
	struct inode *inode;
	struct ext4_inode_info *ei;
	int ret;

	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
		return 0;
	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
			spin_unlock(&sbi->s_fc_lock);
			if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
				ret = -ENOSPC;
				goto lock_and_exit;
			}
			spin_lock(&sbi->s_fc_lock);
			continue;
		}
		/*
		 * With fcd_dilist we need not loop over sbi->s_fc_q to get the
		 * corresponding inode pointer
		 */
		WARN_ON(list_empty(&fc_dentry->fcd_dilist));
		ei = list_first_entry(&fc_dentry->fcd_dilist,
				struct ext4_inode_info, i_fc_dilist);
		inode = &ei->vfs_inode;
		WARN_ON(inode->i_ino != fc_dentry->fcd_ino);

		spin_unlock(&sbi->s_fc_lock);

		/*
		 * We first write the inode and then the create dirent. This
		 * allows the recovery code to create an unnamed inode first
		 * and then link it to a directory entry. This allows us
		 * to use namei.c routines almost as-is and simplifies
		 * the recovery code.
		 */
		ret = ext4_fc_write_inode(inode, crc);
		if (ret)
			goto lock_and_exit;

		ret = ext4_fc_write_inode_data(inode, crc);
		if (ret)
			goto lock_and_exit;

		if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
			ret = -ENOSPC;
			goto lock_and_exit;
		}

		spin_lock(&sbi->s_fc_lock);
	}
	return 0;
lock_and_exit:
	spin_lock(&sbi->s_fc_lock);
	return ret;
}

static int ext4_fc_perform_commit(journal_t *journal)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter;
	struct ext4_fc_head head;
	struct inode *inode;
	struct blk_plug plug;
	int ret = 0;
	u32 crc = 0;

	ret = ext4_fc_submit_inode_data_all(journal);
	if (ret)
		return ret;

	ret = ext4_fc_wait_inode_data_all(journal);
	if (ret)
		return ret;

	/*
	 * If the file system device is different from the journal device,
	 * issue a cache flush before we start writing fast commit blocks.
	 */
	if (journal->j_fs_dev != journal->j_dev)
		blkdev_issue_flush(journal->j_fs_dev);

	blk_start_plug(&plug);
	if (sbi->s_fc_bytes == 0) {
		/*
		 * Add a head tag only if this is the first fast commit
		 * in this TID.
		 */
		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
		head.fc_tid = cpu_to_le32(
			sbi->s_journal->j_running_transaction->t_tid);
		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
			(u8 *)&head, &crc)) {
			ret = -ENOSPC;
			goto out;
		}
	}

	spin_lock(&sbi->s_fc_lock);
	ret = ext4_fc_commit_dentry_updates(journal, &crc);
	if (ret) {
		spin_unlock(&sbi->s_fc_lock);
		goto out;
	}

	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
		inode = &iter->vfs_inode;
		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
			continue;

		spin_unlock(&sbi->s_fc_lock);
		ret = ext4_fc_write_inode_data(inode, &crc);
		if (ret)
			goto out;
		ret = ext4_fc_write_inode(inode, &crc);
		if (ret)
			goto out;
		spin_lock(&sbi->s_fc_lock);
	}
	spin_unlock(&sbi->s_fc_lock);

	ret = ext4_fc_write_tail(sb, crc);

out:
	blk_finish_plug(&plug);
	return ret;
}

static void ext4_fc_update_stats(struct super_block *sb, int status,
				 u64 commit_time, int nblks, tid_t commit_tid)
{
	struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;

	ext4_debug("Fast commit ended with status = %d for tid %u",
			status, commit_tid);
	if (status == EXT4_FC_STATUS_OK) {
		stats->fc_num_commits++;
		stats->fc_numblks += nblks;
		if (likely(stats->s_fc_avg_commit_time))
			stats->s_fc_avg_commit_time =
				(commit_time +
				 stats->s_fc_avg_commit_time * 3) / 4;
		else
			stats->s_fc_avg_commit_time = commit_time;
	} else if (status == EXT4_FC_STATUS_FAILED ||
		   status == EXT4_FC_STATUS_INELIGIBLE) {
		if (status == EXT4_FC_STATUS_FAILED)
			stats->fc_failed_commits++;
		stats->fc_ineligible_commits++;
	} else {
		stats->fc_skipped_commits++;
	}
	trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
}

/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If it's not possible to perform a fast commit
 * due to various reasons, we fall back to full commit. Returns 0
 * on success, error otherwise.
 */
int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	int nblks = 0, ret, bsize = journal->j_blocksize;
	int subtid = atomic_read(&sbi->s_fc_subtid);
	int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
	ktime_t start_time, commit_time;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return jbd2_complete_transaction(journal, commit_tid);

	trace_ext4_fc_commit_start(sb, commit_tid);

	start_time = ktime_get();

restart_fc:
	ret = jbd2_fc_begin_commit(journal, commit_tid);
	if (ret == -EALREADY) {
		/* There was an ongoing commit, check if we need to restart */
		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
			commit_tid > journal->j_commit_sequence)
			goto restart_fc;
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
				commit_tid);
		return 0;
	} else if (ret) {
		/*
		 * Commit couldn't start. Just update stats and perform a
		 * full commit.
		 */
		ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
				commit_tid);
		return jbd2_complete_transaction(journal, commit_tid);
	}

	/*
	 * After establishing the journal barrier via jbd2_fc_begin_commit(),
	 * check if we are fast commit ineligible.
	 */
	if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
		status = EXT4_FC_STATUS_INELIGIBLE;
		goto fallback;
	}

	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
	ret = ext4_fc_perform_commit(journal);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
	ret = jbd2_fc_wait_bufs(journal, nblks);
	if (ret < 0) {
		status = EXT4_FC_STATUS_FAILED;
		goto fallback;
	}
	atomic_inc(&sbi->s_fc_subtid);
	ret = jbd2_fc_end_commit(journal);
	/*
	 * Weight the average commit time higher than the current commit time
	 * so we don't react too strongly to vast changes in the commit time.
	 */
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
	return ret;

fallback:
	ret = jbd2_fc_end_commit_fallback(journal);
	ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
	return ret;
}

/*
 * Fast commit cleanup routine. This is called after every fast commit and
 * full commit. full is true if we are called after a full commit.
 */
static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
	struct super_block *sb = journal->j_private;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_inode_info *iter, *iter_n;
	struct ext4_fc_dentry_update *fc_dentry;

	if (full && sbi->s_fc_bh)
		sbi->s_fc_bh = NULL;

	trace_ext4_fc_cleanup(journal, full, tid);
	jbd2_fc_release_bufs(journal);

	spin_lock(&sbi->s_fc_lock);
	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
				 i_fc_list) {
		list_del_init(&iter->i_fc_list);
		ext4_clear_inode_state(&iter->vfs_inode,
				       EXT4_STATE_FC_COMMITTING);
		if (iter->i_sync_tid <= tid)
			ext4_fc_reset_inode(&iter->vfs_inode);
		/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
		smp_mb();
#if (BITS_PER_LONG < 64)
		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
	}

	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
					     struct ext4_fc_dentry_update,
					     fcd_list);
		list_del_init(&fc_dentry->fcd_list);
		list_del_init(&fc_dentry->fcd_dilist);
		spin_unlock(&sbi->s_fc_lock);

		if (fc_dentry->fcd_name.name &&
			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
			kfree(fc_dentry->fcd_name.name);
		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
		spin_lock(&sbi->s_fc_lock);
	}

	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
				&sbi->s_fc_q[FC_Q_MAIN]);

	if (tid >= sbi->s_fc_ineligible_tid) {
		sbi->s_fc_ineligible_tid = 0;
		ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
	}

	if (full)
		sbi->s_fc_bytes = 0;
	spin_unlock(&sbi->s_fc_lock);
	trace_ext4_fc_stats(sb);
}

/* Ext4 Replay Path Routines */

/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino, dname_len, ino, inode_len;
	char *dname;
};

/* Same as struct ext4_fc_tl, but uses native endianness fields */
struct ext4_fc_tl_mem {
	u16 fc_tag;
	u16 fc_len;
};

static inline void tl_to_darg(struct dentry_info_args *darg,
			      struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_dentry_info fcd;

	memcpy(&fcd, val, sizeof(fcd));

	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
	darg->ino = le32_to_cpu(fcd.fc_ino);
	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
	darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
}
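
/*
 * For reference, the value of a dentry TLV decoded above is laid out like
 * this, as a sketch (struct ext4_fc_dentry_info is the authoritative
 * definition):
 *
 *	struct ext4_fc_dentry_info {
 *		__le32 fc_parent_ino;	// parent directory inode
 *		__le32 fc_ino;		// inode the dentry points to
 *		__u8   fc_dname[];	// name; length = fc_len - 8
 *	};
 */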

static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_tl tl_disk;

	memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
	tl->fc_len = le16_to_cpu(tl_disk.fc_len);
	tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
}

/* Unlink replay function */
static int ext4_fc_replay_unlink(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode, *old_parent;
	struct qstr entry;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	entry.name = darg.dname;
	entry.len = darg.dname_len;
	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);

	if (IS_ERR(inode)) {
		ext4_debug("Inode %d not found", darg.ino);
		return 0;
	}

	old_parent = ext4_iget(sb, darg.parent_ino,
				EXT4_IGET_NORMAL);
	if (IS_ERR(old_parent)) {
		ext4_debug("Dir with inode %d not found", darg.parent_ino);
		iput(inode);
		return 0;
	}

	ret = __ext4_unlink(old_parent, &entry, inode, NULL);
	/* -ENOENT is OK because the entry might not exist anymore. */
	if (ret == -ENOENT)
		ret = 0;
	iput(old_parent);
	iput(inode);
	return ret;
}

static int ext4_fc_replay_link_internal(struct super_block *sb,
				struct dentry_info_args *darg,
				struct inode *inode)
{
	struct inode *dir = NULL;
	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
	int ret = 0;

	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
	if (IS_ERR(dir)) {
		ext4_debug("Dir with inode %d not found.", darg->parent_ino);
		dir = NULL;
		goto out;
	}

	dentry_dir = d_obtain_alias(dir);
	if (IS_ERR(dentry_dir)) {
		ext4_debug("Failed to obtain dentry");
		dentry_dir = NULL;
		goto out;
	}

	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
	if (!dentry_inode) {
		ext4_debug("Inode dentry not created.");
		ret = -ENOMEM;
		goto out;
	}

	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed since the data blocks
	 * for the dir in question got persisted before we crashed OR we
	 * replayed this tag and crashed before the entire replay could
	 * complete.
	 */
	if (ret && ret != -EEXIST) {
		ext4_debug("Failed to link\n");
		goto out;
	}

	ret = 0;
out:
	if (dentry_dir) {
		d_drop(dentry_dir);
		dput(dentry_dir);
	} else if (dir) {
		iput(dir);
	}
	if (dentry_inode) {
		d_drop(dentry_inode);
		dput(dentry_inode);
	}

	return ret;
}

/* Link replay function */
static int ext4_fc_replay_link(struct super_block *sb,
			       struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct inode *inode;
	struct dentry_info_args darg;
	int ret = 0;

	tl_to_darg(&darg, tl, val);
	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
			darg.parent_ino, darg.dname_len);

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return 0;
	}

	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	iput(inode);
	return ret;
}

/*
 * Record all the modified inodes during replay. We use this later to set up
 * block bitmaps correctly.
 */
static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
{
	struct ext4_fc_replay_state *state;
	int i;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	for (i = 0; i < state->fc_modified_inodes_used; i++)
		if (state->fc_modified_inodes[i] == ino)
			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
	return 0;
}

/*
 * Inode replay function
 */
static int ext4_fc_replay_inode(struct super_block *sb,
				struct ext4_fc_tl_mem *tl, u8 *val)
{
	struct ext4_fc_inode fc_inode;
	struct ext4_inode *raw_inode;
	struct ext4_inode *raw_fc_inode;
	struct inode *inode = NULL;
	struct ext4_iloc iloc;
	int inode_len, ino, ret, tag = tl->fc_tag;
	struct ext4_extent_header *eh;
	size_t off_gen = offsetof(struct ext4_inode, i_generation);

	memcpy(&fc_inode, val, sizeof(fc_inode));

	ino = le32_to_cpu(fc_inode.fc_ino);
	trace_ext4_fc_replay(sb, tag, ino, 0, 0);

	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (!IS_ERR(inode)) {
		ext4_ext_clear_bb(inode);
		iput(inode);
	}
	inode = NULL;

	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;

	raw_fc_inode = (struct ext4_inode *)
		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
	if (ret)
		goto out;

	inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
	raw_inode = ext4_raw_inode(&iloc);

	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
	memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
	       inode_len - off_gen);
	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
		if (eh->eh_magic != EXT4_EXT_MAGIC) {
			memset(eh, 0, sizeof(*eh));
			eh->eh_magic = EXT4_EXT_MAGIC;
			eh->eh_max = cpu_to_le16(
				(sizeof(raw_inode->i_block) -
				 sizeof(struct ext4_extent_header))
				 / sizeof(struct ext4_extent));
		}
	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
			sizeof(raw_inode->i_block));
	}

	/* Immediately update the inode on disk. */
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	if (ret)
		goto out;
	ret = sync_dirty_buffer(iloc.bh);
	if (ret)
		goto out;
	ret = ext4_mark_inode_used(sb, ino);
	if (ret)
		goto out;

	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("Inode not found.");
		return -EFSCORRUPTED;
	}

	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we recalculate
	 * the number of blocks used by the inode.
	 */
	if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
		ext4_ext_replay_set_iblocks(inode);

	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
	ext4_reset_inode_seed(inode);

	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
	sync_dirty_buffer(iloc.bh);
	brelse(iloc.bh);
out:
	iput(inode);
	if (!ret)
		blkdev_issue_flush(sb->s_bdev);

	return 0;
}

/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL, which means the
 * inode for which we are trying to create a dentry here should already have
 * been replayed before we get here.
 */
static int ext4_fc_replay_create(struct super_block *sb,
				 struct ext4_fc_tl_mem *tl, u8 *val)
{
	int ret = 0;
	struct inode *inode = NULL;
	struct inode *dir = NULL;
	struct dentry_info_args darg;

	tl_to_darg(&darg, tl, val);

	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
			darg.parent_ino, darg.dname_len);

	/* This takes care of updating the group descriptor and other metadata */
	ret = ext4_mark_inode_used(sb, darg.ino);
	if (ret)
		goto out;

	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
	if (IS_ERR(inode)) {
		ext4_debug("inode %d not found.", darg.ino);
		inode = NULL;
		ret = -EINVAL;
		goto out;
	}

	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that the
		 * dot and dot dot dirents are set up properly.
		 */
		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
		if (IS_ERR(dir)) {
			ext4_debug("Dir %d not found.", darg.ino);
			goto out;
		}
		ret = ext4_init_new_dir(NULL, dir, inode);
		iput(dir);
		if (ret) {
			ret = 0;
			goto out;
		}
	}
	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
	if (ret)
		goto out;
	set_nlink(inode, 1);
	ext4_mark_inode_dirty(NULL, inode);
out:
	iput(inode);
	return ret;
}

/*
 * Record physical disk regions which are in use as per the fast commit area,
 * and used by inodes during the replay phase. Our simple replay phase
 * allocator excludes these regions from allocation.
 */
int ext4_fc_record_regions(struct super_block *sb, int ino,
		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
{
	struct ext4_fc_replay_state *state;
	struct ext4_fc_alloc_region *region;

	state = &EXT4_SB(sb)->s_fc_replay_state;
	/*
	 * During the replay phase, fc_regions_valid may not be the same as
	 * fc_regions_used; update it when doing new additions.
	 */
	if (replay && state->fc_regions_used != state->fc_regions_valid)
		state->fc_regions_used = state->fc_regions_valid;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
		state->fc_regions = fc_regions;
	}
	region = &state->fc_regions[state->fc_regions_used++];
	region->ino = ino;
	region->lblk = lblk;
	region->pblk = pblk;
	region->len = len;

	if (replay)
		state->fc_regions_valid++;

	return 0;
}
1716
1717/* Replay add range tag */
1718static int ext4_fc_replay_add_range(struct super_block *sb,
1719				    struct ext4_fc_tl_mem *tl, u8 *val)
1720{
1721	struct ext4_fc_add_range fc_add_ex;
1722	struct ext4_extent newex, *ex;
1723	struct inode *inode;
1724	ext4_lblk_t start, cur;
1725	int remaining, len;
1726	ext4_fsblk_t start_pblk;
1727	struct ext4_map_blocks map;
1728	struct ext4_ext_path *path = NULL;
1729	int ret;
1730
1731	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1732	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1733
1734	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1735		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1736		ext4_ext_get_actual_len(ex));
1737
1738	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1739	if (IS_ERR(inode)) {
1740		ext4_debug("Inode not found.");
1741		return 0;
1742	}
1743
1744	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1745	if (ret)
1746		goto out;
1747
1748	start = le32_to_cpu(ex->ee_block);
1749	start_pblk = ext4_ext_pblock(ex);
1750	len = ext4_ext_get_actual_len(ex);
1751
1752	cur = start;
1753	remaining = len;
1754	ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1755		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
1756		  inode->i_ino);
1757
1758	while (remaining > 0) {
1759		map.m_lblk = cur;
1760		map.m_len = remaining;
1761		map.m_pblk = 0;
1762		ret = ext4_map_blocks(NULL, inode, &map, 0);
1763
1764		if (ret < 0)
1765			goto out;
1766
1767		if (ret == 0) {
1768			/* Range is not mapped */
1769			path = ext4_find_extent(inode, cur, NULL, 0);
1770			if (IS_ERR(path))
1771				goto out;
1772			memset(&newex, 0, sizeof(newex));
1773			newex.ee_block = cpu_to_le32(cur);
1774			ext4_ext_store_pblock(
1775				&newex, start_pblk + cur - start);
1776			newex.ee_len = cpu_to_le16(map.m_len);
1777			if (ext4_ext_is_unwritten(ex))
1778				ext4_ext_mark_unwritten(&newex);
1779			down_write(&EXT4_I(inode)->i_data_sem);
1780			ret = ext4_ext_insert_extent(
1781				NULL, inode, &path, &newex, 0);
1782			up_write((&EXT4_I(inode)->i_data_sem));
1783			ext4_free_ext_path(path);
1784			if (ret)
1785				goto out;
1786			goto next;
1787		}
1788
1789		if (start_pblk + cur - start != map.m_pblk) {
1790			/*
1791			 * Logical to physical mapping changed. This can happen
1792			 * if this range was removed and then reallocated to
1793			 * map to new physical blocks during a fast commit.
1794			 */
1795			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1796					ext4_ext_is_unwritten(ex),
1797					start_pblk + cur - start);
1798			if (ret)
1799				goto out;
1800			/*
1801			 * Mark the old blocks as free since they aren't used
1802			 * anymore. We maintain an array of all the modified
1803			 * inodes. In case these blocks are still used at either
1804			 * a different logical range in the same inode or in
1805			 * some different inode, we will mark them as allocated
1806			 * at the end of the FC replay using our array of
1807			 * modified inodes.
1808			 */
1809			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1810			goto next;
1811		}
1812
1813		/* Range is mapped and needs a state change */
1814		ext4_debug("Converting from %ld to %d %lld",
1815				map.m_flags & EXT4_MAP_UNWRITTEN,
1816			ext4_ext_is_unwritten(ex), map.m_pblk);
1817		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1818					ext4_ext_is_unwritten(ex), map.m_pblk);
1819		if (ret)
1820			goto out;
1821		/*
1822		 * We may have split the extent tree while toggling the state.
1823		 * Try to shrink the extent tree now.
1824		 */
1825		ext4_ext_replay_shrink_inode(inode, start + len);
1826next:
1827		cur += map.m_len;
1828		remaining -= map.m_len;
1829	}
1830	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1831					sb->s_blocksize_bits);
1832out:
1833	iput(inode);
1834	return 0;
1835}
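/*
 * Editor's note: a minimal userspace model of the chunked walk above.
 * ext4_map_blocks() reports how many consecutive blocks share a single
 * mapping state, and the loop advances by that many blocks per pass;
 * map_chunk() below is a hypothetical stand-in, not a kernel API.
 */

/* Returns nonzero if the chunk starting at lblk is mapped, 0 for a
 * hole; in both cases *chunk_len is set to the chunk's block count
 * (always >= 1, and at most want). */
extern int map_chunk(unsigned int lblk, unsigned int want,
		     unsigned int *chunk_len);

static void walk_range(unsigned int start, unsigned int len)
{
	unsigned int cur = start;
	unsigned int remaining = len;
	unsigned int chunk_len;

	while (remaining > 0) {
		int mapped = map_chunk(cur, remaining, &chunk_len);

		/*
		 * mapped == 0: insert a new extent for the hole;
		 * mapped != 0: fix up the existing mapping (move it or
		 * toggle its written/unwritten state), as above.
		 */
		(void)mapped;
		cur += chunk_len;
		remaining -= chunk_len;
	}
}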
1836
1837/* Replay DEL_RANGE tag */
1838static int
1839ext4_fc_replay_del_range(struct super_block *sb,
1840			 struct ext4_fc_tl_mem *tl, u8 *val)
1841{
1842	struct inode *inode;
1843	struct ext4_fc_del_range lrange;
1844	struct ext4_map_blocks map;
1845	ext4_lblk_t cur, remaining;
1846	int ret;
1847
1848	memcpy(&lrange, val, sizeof(lrange));
1849	cur = le32_to_cpu(lrange.fc_lblk);
1850	remaining = le32_to_cpu(lrange.fc_len);
1851
1852	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1853		le32_to_cpu(lrange.fc_ino), cur, remaining);
1854
1855	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1856	if (IS_ERR(inode)) {
1857		ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1858		return 0;
1859	}
1860
1861	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1862	if (ret)
1863		goto out;
1864
1865	ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1866			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1867			le32_to_cpu(lrange.fc_len));
1868	while (remaining > 0) {
1869		map.m_lblk = cur;
1870		map.m_len = remaining;
1871
1872		ret = ext4_map_blocks(NULL, inode, &map, 0);
1873		if (ret < 0)
1874			goto out;
1875		if (ret > 0) {
1876			remaining -= ret;
1877			cur += ret;
1878			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1879		} else {
1880			remaining -= map.m_len;
1881			cur += map.m_len;
1882		}
1883	}
1884
1885	down_write(&EXT4_I(inode)->i_data_sem);
1886	ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1887				le32_to_cpu(lrange.fc_lblk) +
1888				le32_to_cpu(lrange.fc_len) - 1);
1889	up_write(&EXT4_I(inode)->i_data_sem);
1890	if (ret)
1891		goto out;
1892	ext4_ext_replay_shrink_inode(inode,
1893		i_size_read(inode) >> sb->s_blocksize_bits);
1894	ext4_mark_inode_dirty(NULL, inode);
1895out:
1896	iput(inode);
1897	return 0;
1898}
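/*
 * Editor's note: replaying DEL_RANGE is a two-step operation. The loop
 * above first walks the range and returns every still-mapped chunk to
 * the block bitmaps; ext4_ext_remove_space() then drops the extent
 * tree entries for the whole logical range
 * [fc_lblk, fc_lblk + fc_len - 1]. Freeing the bitmap bits first means
 * the extent removal does not have to rediscover which physical blocks
 * the range covered.
 */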
1899
1900static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1901{
1902	struct ext4_fc_replay_state *state;
1903	struct inode *inode;
1904	struct ext4_ext_path *path = NULL;
1905	struct ext4_map_blocks map;
1906	int i, ret, j;
1907	ext4_lblk_t cur, end;
1908
1909	state = &EXT4_SB(sb)->s_fc_replay_state;
1910	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1911		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1912			EXT4_IGET_NORMAL);
1913		if (IS_ERR(inode)) {
1914			ext4_debug("Inode %d not found.",
1915				state->fc_modified_inodes[i]);
1916			continue;
1917		}
1918		cur = 0;
1919		end = EXT_MAX_BLOCKS;
1920		if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1921			iput(inode);
1922			continue;
1923		}
1924		while (cur < end) {
1925			map.m_lblk = cur;
1926			map.m_len = end - cur;
1927
1928			ret = ext4_map_blocks(NULL, inode, &map, 0);
1929			if (ret < 0)
1930				break;
1931
1932			if (ret > 0) {
1933				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1934				if (!IS_ERR(path)) {
1935					for (j = 0; j < path->p_depth; j++)
1936						ext4_mb_mark_bb(inode->i_sb,
1937							path[j].p_block, 1, true);
1938					ext4_free_ext_path(path);
1939				}
1940				cur += ret;
1941				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1942							map.m_len, true);
1943			} else {
1944				cur = cur + (map.m_len ? map.m_len : 1);
1945			}
1946		}
1947		iput(inode);
1948	}
1949}
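/*
 * Editor's note: the pass above re-derives block bitmap state for every
 * inode touched by the replay. For each mapped chunk it marks both the
 * extent tree's internal (index) blocks, via path[j].p_block, and the
 * data blocks themselves, via map.m_pblk/map.m_len, as in use; holes
 * simply advance the cursor.
 */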
1950
1951/*
1952 * Check if a block is in the excluded regions for block allocation. The
1953 * simple allocator that runs during the replay phase calls this function
1954 * to see if it is okay to use a block.
1955 */
1956bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1957{
1958	int i;
1959	struct ext4_fc_replay_state *state;
1960
1961	state = &EXT4_SB(sb)->s_fc_replay_state;
1962	for (i = 0; i < state->fc_regions_valid; i++) {
1963		if (state->fc_regions[i].ino == 0 ||
1964			state->fc_regions[i].len == 0)
1965			continue;
1966		if (in_range(blk, state->fc_regions[i].pblk,
1967					state->fc_regions[i].len))
1968			return true;
1969	}
1970	return false;
1971}
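/*
 * Editor's note: in_range(blk, pblk, len) above is true when
 * pblk <= blk < pblk + len, i.e. the region's start is inclusive and
 * its end exclusive. A minimal equivalent for reference:
 */
static inline int blk_in_region(unsigned long long blk,
				unsigned long long pblk, unsigned int len)
{
	return blk >= pblk && blk < pblk + len;
}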
1972
1973/* Cleanup function called after replay */
1974void ext4_fc_replay_cleanup(struct super_block *sb)
1975{
1976	struct ext4_sb_info *sbi = EXT4_SB(sb);
1977
1978	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1979	kfree(sbi->s_fc_replay_state.fc_regions);
1980	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1981}
1982
1983static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
1984				      int tag, int len)
1985{
1986	switch (tag) {
1987	case EXT4_FC_TAG_ADD_RANGE:
1988		return len == sizeof(struct ext4_fc_add_range);
1989	case EXT4_FC_TAG_DEL_RANGE:
1990		return len == sizeof(struct ext4_fc_del_range);
1991	case EXT4_FC_TAG_CREAT:
1992	case EXT4_FC_TAG_LINK:
1993	case EXT4_FC_TAG_UNLINK:
1994		len -= sizeof(struct ext4_fc_dentry_info);
1995		return len >= 1 && len <= EXT4_NAME_LEN;
1996	case EXT4_FC_TAG_INODE:
1997		len -= sizeof(struct ext4_fc_inode);
1998		return len >= EXT4_GOOD_OLD_INODE_SIZE &&
1999			len <= sbi->s_inode_size;
2000	case EXT4_FC_TAG_PAD:
2001		return true; /* padding can have any length */
2002	case EXT4_FC_TAG_TAIL:
2003		return len >= sizeof(struct ext4_fc_tail);
2004	case EXT4_FC_TAG_HEAD:
2005		return len == sizeof(struct ext4_fc_head);
2006	}
2007	return false;
2008}
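/*
 * Editor's note: a worked example for the dentry tags above. The value
 * of a CREAT/LINK/UNLINK TLV is a struct ext4_fc_dentry_info followed
 * by the (not NUL-terminated) name bytes. Assuming, for illustration,
 * an 8-byte dentry info, a one-byte name "a" gives len == 9 and the
 * check reduces to 1 <= 9 - 8 <= EXT4_NAME_LEN (255), which passes; a
 * bare dentry info with no name (len == 8) is rejected.
 */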
2009
2010/*
2011 * Recovery Scan phase handler
2012 *
2013 * This function is called during the scan phase and is responsible
2014 * for doing the following:
2015 * - Make sure the fast commit area has valid tags for replay
2016 * - Count number of tags that need to be replayed by the replay handler
2017 * - Verify CRC
2018 * - Create a list of excluded blocks for allocation during replay phase
2019 *
2020 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that the scan is
2021 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2022 * to indicate that the scan has finished and JBD2 can now start the replay
2023 * phase. A negative return value indicates an error. At the end of a
2024 * successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set to
2025 * the number of tags that need to be replayed during the replay phase.
2026 */
2027static int ext4_fc_replay_scan(journal_t *journal,
2028				struct buffer_head *bh, int off,
2029				tid_t expected_tid)
2030{
2031	struct super_block *sb = journal->j_private;
2032	struct ext4_sb_info *sbi = EXT4_SB(sb);
2033	struct ext4_fc_replay_state *state;
2034	int ret = JBD2_FC_REPLAY_CONTINUE;
2035	struct ext4_fc_add_range ext;
2036	struct ext4_fc_tl_mem tl;
2037	struct ext4_fc_tail tail;
2038	__u8 *start, *end, *cur, *val;
2039	struct ext4_fc_head head;
2040	struct ext4_extent *ex;
2041
2042	state = &sbi->s_fc_replay_state;
2043
2044	start = (u8 *)bh->b_data;
2045	end = start + journal->j_blocksize;
2046
2047	if (state->fc_replay_expected_off == 0) {
2048		state->fc_cur_tag = 0;
2049		state->fc_replay_num_tags = 0;
2050		state->fc_crc = 0;
2051		state->fc_regions = NULL;
2052		state->fc_regions_valid = state->fc_regions_used =
2053			state->fc_regions_size = 0;
2054		/* Check if we can stop early */
2055		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2056			!= EXT4_FC_TAG_HEAD)
2057			return 0;
2058	}
2059
2060	if (off != state->fc_replay_expected_off) {
2061		ret = -EFSCORRUPTED;
2062		goto out_err;
2063	}
2064
2065	state->fc_replay_expected_off++;
2066	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2067	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2068		ext4_fc_get_tl(&tl, cur);
2069		val = cur + EXT4_FC_TAG_BASE_LEN;
2070		if (tl.fc_len > end - val ||
2071		    !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2072			ret = state->fc_replay_num_tags ?
2073				JBD2_FC_REPLAY_STOP : -ECANCELED;
2074			goto out_err;
2075		}
2076		ext4_debug("Scan phase, tag:%s, blk %lld\n",
2077			   tag2str(tl.fc_tag), bh->b_blocknr);
2078		switch (tl.fc_tag) {
2079		case EXT4_FC_TAG_ADD_RANGE:
2080			memcpy(&ext, val, sizeof(ext));
2081			ex = (struct ext4_extent *)&ext.fc_ex;
2082			ret = ext4_fc_record_regions(sb,
2083				le32_to_cpu(ext.fc_ino),
2084				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2085				ext4_ext_get_actual_len(ex), 0);
2086			if (ret < 0)
2087				break;
2088			ret = JBD2_FC_REPLAY_CONTINUE;
2089			fallthrough;
2090		case EXT4_FC_TAG_DEL_RANGE:
2091		case EXT4_FC_TAG_LINK:
2092		case EXT4_FC_TAG_UNLINK:
2093		case EXT4_FC_TAG_CREAT:
2094		case EXT4_FC_TAG_INODE:
2095		case EXT4_FC_TAG_PAD:
2096			state->fc_cur_tag++;
2097			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2098				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2099			break;
2100		case EXT4_FC_TAG_TAIL:
2101			state->fc_cur_tag++;
2102			memcpy(&tail, val, sizeof(tail));
2103			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2104						EXT4_FC_TAG_BASE_LEN +
2105						offsetof(struct ext4_fc_tail,
2106						fc_crc));
2107			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2108				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2109				state->fc_replay_num_tags = state->fc_cur_tag;
2110				state->fc_regions_valid =
2111					state->fc_regions_used;
2112			} else {
2113				ret = state->fc_replay_num_tags ?
2114					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2115			}
2116			state->fc_crc = 0;
2117			break;
2118		case EXT4_FC_TAG_HEAD:
2119			memcpy(&head, val, sizeof(head));
2120			if (le32_to_cpu(head.fc_features) &
2121				~EXT4_FC_SUPPORTED_FEATURES) {
2122				ret = -EOPNOTSUPP;
2123				break;
2124			}
2125			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2126				ret = JBD2_FC_REPLAY_STOP;
2127				break;
2128			}
2129			state->fc_cur_tag++;
2130			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2131				EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2132			break;
2133		default:
2134			ret = state->fc_replay_num_tags ?
2135				JBD2_FC_REPLAY_STOP : -ECANCELED;
2136		}
2137		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2138			break;
2139	}
2140
2141out_err:
2142	trace_ext4_fc_replay_scan(sb, ret, off);
2143	return ret;
2144}
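/*
 * Editor's note: the scan loop above and the replay loop below share
 * one TLV iteration shape. A minimal standalone sketch of that walk
 * over a single block; TAG_BASE_LEN and get_tl() are illustrative
 * stand-ins for EXT4_FC_TAG_BASE_LEN and ext4_fc_get_tl(), and the
 * 4-byte header (16-bit tag + 16-bit length) is an assumption here.
 */
struct tl {
	unsigned short tag;
	unsigned short len;
};

extern void get_tl(struct tl *tl, const unsigned char *p);

static void walk_block(unsigned char *start, unsigned int blocksize)
{
	enum { TAG_BASE_LEN = 4 };
	unsigned char *end = start + blocksize;
	unsigned char *cur, *val;
	struct tl tl;

	for (cur = start; cur <= end - TAG_BASE_LEN;
	     cur += TAG_BASE_LEN + tl.len) {
		get_tl(&tl, cur);
		val = cur + TAG_BASE_LEN;
		if (tl.len > end - val)
			break;	/* value would run past the block */
		/* dispatch on tl.tag, consuming tl.len bytes at val ... */
	}
}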
2145
2146/*
2147 * Main recovery path entry point.
2148 * The meaning of the return codes is the same as for ext4_fc_replay_scan().
2149 */
2150static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2151				enum passtype pass, int off, tid_t expected_tid)
2152{
2153	struct super_block *sb = journal->j_private;
2154	struct ext4_sb_info *sbi = EXT4_SB(sb);
2155	struct ext4_fc_tl_mem tl;
2156	__u8 *start, *end, *cur, *val;
2157	int ret = JBD2_FC_REPLAY_CONTINUE;
2158	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2159	struct ext4_fc_tail tail;
2160
2161	if (pass == PASS_SCAN) {
2162		state->fc_current_pass = PASS_SCAN;
2163		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2164	}
2165
2166	if (state->fc_current_pass != pass) {
2167		state->fc_current_pass = pass;
2168		sbi->s_mount_state |= EXT4_FC_REPLAY;
2169	}
2170	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2171		ext4_debug("Replay stops\n");
2172		ext4_fc_set_bitmaps_and_counters(sb);
2173		return 0;
2174	}
2175
2176#ifdef CONFIG_EXT4_DEBUG
2177	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2178		pr_warn("Dropping fc block %d because max_replay set\n", off);
2179		return JBD2_FC_REPLAY_STOP;
2180	}
2181#endif
2182
2183	start = (u8 *)bh->b_data;
2184	end = start + journal->j_blocksize;
2185
2186	for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2187	     cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2188		ext4_fc_get_tl(&tl, cur);
2189		val = cur + EXT4_FC_TAG_BASE_LEN;
2190
2191		if (state->fc_replay_num_tags == 0) {
2192			ret = JBD2_FC_REPLAY_STOP;
2193			ext4_fc_set_bitmaps_and_counters(sb);
2194			break;
2195		}
2196
2197		ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2198		state->fc_replay_num_tags--;
2199		switch (tl.fc_tag) {
2200		case EXT4_FC_TAG_LINK:
2201			ret = ext4_fc_replay_link(sb, &tl, val);
2202			break;
2203		case EXT4_FC_TAG_UNLINK:
2204			ret = ext4_fc_replay_unlink(sb, &tl, val);
2205			break;
2206		case EXT4_FC_TAG_ADD_RANGE:
2207			ret = ext4_fc_replay_add_range(sb, &tl, val);
2208			break;
2209		case EXT4_FC_TAG_CREAT:
2210			ret = ext4_fc_replay_create(sb, &tl, val);
2211			break;
2212		case EXT4_FC_TAG_DEL_RANGE:
2213			ret = ext4_fc_replay_del_range(sb, &tl, val);
2214			break;
2215		case EXT4_FC_TAG_INODE:
2216			ret = ext4_fc_replay_inode(sb, &tl, val);
2217			break;
2218		case EXT4_FC_TAG_PAD:
2219			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2220					     tl.fc_len, 0);
2221			break;
2222		case EXT4_FC_TAG_TAIL:
2223			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2224					     0, tl.fc_len, 0);
2225			memcpy(&tail, val, sizeof(tail));
2226			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2227			break;
2228		case EXT4_FC_TAG_HEAD:
2229			break;
2230		default:
2231			trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2232			ret = -ECANCELED;
2233			break;
2234		}
2235		if (ret < 0)
2236			break;
2237		ret = JBD2_FC_REPLAY_CONTINUE;
2238	}
2239	return ret;
2240}
2241
2242void ext4_fc_init(struct super_block *sb, journal_t *journal)
2243{
2244	/*
2245	 * We set the replay callback even if fast commit is disabled because we
2246	 * could still have fast commit blocks that need to be replayed even if
2247	 * fast commit has now been turned off.
2248	 */
2249	journal->j_fc_replay_callback = ext4_fc_replay;
2250	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2251		return;
2252	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2253}
2254
2255static const char * const fc_ineligible_reasons[] = {
2256	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2257	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2258	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2259	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2260	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2261	[EXT4_FC_REASON_RESIZE] = "Resize",
2262	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2263	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2264	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2265	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2266};
2267
2268int ext4_fc_info_show(struct seq_file *seq, void *v)
2269{
2270	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2271	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2272	int i;
2273
2274	if (v != SEQ_START_TOKEN)
2275		return 0;
2276
2277	seq_printf(seq,
2278		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2279		   stats->fc_num_commits, stats->fc_ineligible_commits,
2280		   stats->fc_numblks,
2281		   div_u64(stats->s_fc_avg_commit_time, 1000));
2282	seq_puts(seq, "Ineligible reasons:\n");
2283	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2284		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2285			stats->fc_ineligible_reason_count[i]);
2286
2287	return 0;
2288}
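/*
 * Editor's note: a hypothetical example of the seq_file output produced
 * above (all numbers made up):
 *
 *	fc stats:
 *	42 commits
 *	3 ineligible
 *	128 numblks
 *	517us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	"Cross rename":	0
 *	...
 */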
2289
2290int __init ext4_fc_init_dentry_cache(void)
2291{
2292	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2293					   SLAB_RECLAIM_ACCOUNT);
2294
2295	if (ext4_fc_dentry_cachep == NULL)
2296		return -ENOMEM;
2297
2298	return 0;
2299}
2300
2301void ext4_fc_destroy_dentry_cache(void)
2302{
2303	kmem_cache_destroy(ext4_fc_dentry_cachep);
2304}
v5.14.15
   1// SPDX-License-Identifier: GPL-2.0
   2
   3/*
   4 * fs/ext4/fast_commit.c
   5 *
   6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
   7 *
   8 * Ext4 fast commits routines.
   9 */
  10#include "ext4.h"
  11#include "ext4_jbd2.h"
  12#include "ext4_extents.h"
  13#include "mballoc.h"
  14
  15/*
  16 * Ext4 Fast Commits
  17 * -----------------
  18 *
  19 * Ext4 fast commits implement fine grained journalling for Ext4.
  20 *
  21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
  22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
  23 * TLV during the recovery phase. For the scenarios for which we currently
  24 * don't have replay code, fast commit falls back to full commits.
  25 * Fast commits record delta in one of the following three categories.
  26 *
  27 * (A) Directory entry updates:
  28 *
  29 * - EXT4_FC_TAG_UNLINK		- records directory entry unlink
  30 * - EXT4_FC_TAG_LINK		- records directory entry link
  31 * - EXT4_FC_TAG_CREAT		- records inode and directory entry creation
  32 *
  33 * (B) File specific data range updates:
  34 *
  35 * - EXT4_FC_TAG_ADD_RANGE	- records addition of new blocks to an inode
  36 * - EXT4_FC_TAG_DEL_RANGE	- records deletion of blocks from an inode
  37 *
  38 * (C) Inode metadata (mtime / ctime etc):
  39 *
  40 * - EXT4_FC_TAG_INODE		- record the inode that should be replayed
  41 *				  during recovery. Note that iblocks field is
  42 *				  not replayed and instead derived during
  43 *				  replay.
  44 * Commit Operation
  45 * ----------------
  46 * With fast commits, we maintain all the directory entry operations in the
  47 * order in which they are issued in an in-memory queue. This queue is flushed
  48 * to disk during the commit operation. We also maintain a list of inodes
  49 * that need to be committed during a fast commit in another in memory queue of
  50 * inodes. During the commit operation, we commit in the following order:
  51 *
  52 * [1] Lock inodes for any further data updates by setting COMMITTING state
  53 * [2] Submit data buffers of all the inodes
  54 * [3] Wait for [2] to complete
  55 * [4] Commit all the directory entry updates in the fast commit space
  56 * [5] Commit all the changed inode structures
  57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
  58 *     section for more details).
  59 * [7] Wait for [4], [5] and [6] to complete.
  60 *
  61 * All the inode updates must call ext4_fc_start_update() before starting an
  62 * update. If such an ongoing update is present, fast commit waits for it to
  63 * complete. The completion of such an update is marked by
  64 * ext4_fc_stop_update().
  65 *
  66 * Fast Commit Ineligibility
  67 * -------------------------
 
  68 * Not all operations are supported by fast commits today (e.g extended
  69 * attributes). Fast commit ineligibility is marked by calling one of the
  70 * two following functions:
  71 *
  72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
  73 *   back to full commit. This is useful in case of transient errors.
  74 *
  75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
  76 *   the fast commits happening between ext4_fc_start_ineligible() and
  77 *   ext4_fc_stop_ineligible() and one fast commit after the call to
  78 *   ext4_fc_stop_ineligible() to fall back to full commits. It is important to
  79 *   make one more fast commit to fall back to full commit after stop call so
  80 *   that it guaranteed that the fast commit ineligible operation contained
  81 *   within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
  82 *   followed by at least 1 full commit.
  83 *
  84 * Atomicity of commits
  85 * --------------------
  86 * In order to guarantee atomicity during the commit operation, fast commit
  87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
  88 * tag contains CRC of the contents and TID of the transaction after which
  89 * this fast commit should be applied. Recovery code replays fast commit
  90 * logs only if there's at least 1 valid tail present. For every fast commit
  91 * operation, there is 1 tail. This means, we may end up with multiple tails
  92 * in the fast commit space. Here's an example:
  93 *
  94 * - Create a new file A and remove existing file B
  95 * - fsync()
  96 * - Append contents to file A
  97 * - Truncate file A
  98 * - fsync()
  99 *
 100 * The fast commit space at the end of above operations would look like this:
 101 *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
 102 *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
 103 *
 104 * Replay code should thus check for all the valid tails in the FC area.
 105 *
 106 * Fast Commit Replay Idempotence
 107 * ------------------------------
 108 *
 109 * Fast commits tags are idempotent in nature provided the recovery code follows
 110 * certain rules. The guiding principle that the commit path follows while
 111 * committing is that it stores the result of a particular operation instead of
 112 * storing the procedure.
 113 *
 114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
 115 * was associated with inode 10. During fast commit, instead of storing this
 116 * operation as a procedure "rename a to b", we store the resulting file system
 117 * state as a "series" of outcomes:
 118 *
 119 * - Link dirent b to inode 10
 120 * - Unlink dirent a
 121 * - Inode <10> with valid refcount
 122 *
 123 * Now when recovery code runs, it needs "enforce" this state on the file
 124 * system. This is what guarantees idempotence of fast commit replay.
 125 *
 126 * Let's take an example of a procedure that is not idempotent and see how fast
 127 * commits make it idempotent. Consider following sequence of operations:
 128 *
 129 *     rm A;    mv B A;    read A
 130 *  (x)     (y)        (z)
 131 *
 132 * (x), (y) and (z) are the points at which we can crash. If we store this
 133 * sequence of operations as is then the replay is not idempotent. Let's say
 134 * while in replay, we crash at (z). During the second replay, file A (which was
 135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
 136 * file named A would be absent when we try to read A. So, this sequence of
 137 * operations is not idempotent. However, as mentioned above, instead of storing
 138 * the procedure fast commits store the outcome of each procedure. Thus the fast
 139 * commit log for above procedure would be as follows:
 140 *
 141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
 142 * inode 11 before the replay)
 143 *
 144 *    [Unlink A]   [Link A to inode 11]   [Unlink B]   [Inode 11]
 145 * (w)          (x)                    (y)          (z)
 146 *
 147 * If we crash at (z), we will have file A linked to inode 11. During the second
 148 * replay, we will remove file A (inode 11). But we will create it back and make
 149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
 150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
 151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
 152 * similarly. Thus, by converting a non-idempotent procedure into a series of
 153 * idempotent outcomes, fast commits ensured idempotence during the replay.
 154 *
 155 * TODOs
 156 * -----
 157 *
 158 * 0) Fast commit replay path hardening: Fast commit replay code should use
 159 *    journal handles to make sure all the updates it does during the replay
 160 *    path are atomic. With that if we crash during fast commit replay, after
 161 *    trying to do recovery again, we will find a file system where fast commit
 162 *    area is invalid (because new full commit would be found). In order to deal
 163 *    with that, fast commit replay code should ensure that the "FC_REPLAY"
 164 *    superblock state is persisted before starting the replay, so that after
 165 *    the crash, fast commit recovery code can look at that flag and perform
 166 *    fast commit recovery even if that area is invalidated by later full
 167 *    commits.
 168 *
 169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
 170 *    eligible update must be protected within ext4_fc_start_update() and
 171 *    ext4_fc_stop_update(). These routines are called at much higher
 172 *    routines. This can be made more fine grained by combining with
 173 *    ext4_journal_start().
 174 *
 175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
 176 *
 177 * 3) Handle more ineligible cases.
 178 */
 179
 180#include <trace/events/ext4.h>
 181static struct kmem_cache *ext4_fc_dentry_cachep;
 182
 183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 184{
 185	BUFFER_TRACE(bh, "");
 186	if (uptodate) {
 187		ext4_debug("%s: Block %lld up-to-date",
 188			   __func__, bh->b_blocknr);
 189		set_buffer_uptodate(bh);
 190	} else {
 191		ext4_debug("%s: Block %lld not up-to-date",
 192			   __func__, bh->b_blocknr);
 193		clear_buffer_uptodate(bh);
 194	}
 195
 196	unlock_buffer(bh);
 197}
 198
 199static inline void ext4_fc_reset_inode(struct inode *inode)
 200{
 201	struct ext4_inode_info *ei = EXT4_I(inode);
 202
 203	ei->i_fc_lblk_start = 0;
 204	ei->i_fc_lblk_len = 0;
 205}
 206
 207void ext4_fc_init_inode(struct inode *inode)
 208{
 209	struct ext4_inode_info *ei = EXT4_I(inode);
 210
 211	ext4_fc_reset_inode(inode);
 212	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 213	INIT_LIST_HEAD(&ei->i_fc_list);
 
 214	init_waitqueue_head(&ei->i_fc_wait);
 215	atomic_set(&ei->i_fc_updates, 0);
 216}
 217
 218/* This function must be called with sbi->s_fc_lock held. */
 219static void ext4_fc_wait_committing_inode(struct inode *inode)
 220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
 221{
 222	wait_queue_head_t *wq;
 223	struct ext4_inode_info *ei = EXT4_I(inode);
 224
 225#if (BITS_PER_LONG < 64)
 226	DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
 227			EXT4_STATE_FC_COMMITTING);
 228	wq = bit_waitqueue(&ei->i_state_flags,
 229				EXT4_STATE_FC_COMMITTING);
 230#else
 231	DEFINE_WAIT_BIT(wait, &ei->i_flags,
 232			EXT4_STATE_FC_COMMITTING);
 233	wq = bit_waitqueue(&ei->i_flags,
 234				EXT4_STATE_FC_COMMITTING);
 235#endif
 236	lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
 237	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 238	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 239	schedule();
 240	finish_wait(wq, &wait.wq_entry);
 241}
 242
 
 
 
 
 
 
 243/*
 244 * Inform Ext4's fast about start of an inode update
 245 *
 246 * This function is called by the high level call VFS callbacks before
 247 * performing any inode update. This function blocks if there's an ongoing
 248 * fast commit on the inode in question.
 249 */
 250void ext4_fc_start_update(struct inode *inode)
 251{
 252	struct ext4_inode_info *ei = EXT4_I(inode);
 253
 254	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 255	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 256		return;
 257
 258restart:
 259	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 260	if (list_empty(&ei->i_fc_list))
 261		goto out;
 262
 263	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 264		ext4_fc_wait_committing_inode(inode);
 265		goto restart;
 266	}
 267out:
 268	atomic_inc(&ei->i_fc_updates);
 269	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 270}
 271
 272/*
 273 * Stop inode update and wake up waiting fast commits if any.
 274 */
 275void ext4_fc_stop_update(struct inode *inode)
 276{
 277	struct ext4_inode_info *ei = EXT4_I(inode);
 278
 279	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 280	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 281		return;
 282
 283	if (atomic_dec_and_test(&ei->i_fc_updates))
 284		wake_up_all(&ei->i_fc_wait);
 285}
 286
 287/*
 288 * Remove inode from fast commit list. If the inode is being committed
 289 * we wait until inode commit is done.
 290 */
 291void ext4_fc_del(struct inode *inode)
 292{
 293	struct ext4_inode_info *ei = EXT4_I(inode);
 
 
 294
 295	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 296	    (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
 297		return;
 298
 299restart:
 300	spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 301	if (list_empty(&ei->i_fc_list)) {
 302		spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 303		return;
 304	}
 305
 306	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
 307		ext4_fc_wait_committing_inode(inode);
 308		goto restart;
 309	}
 310	list_del_init(&ei->i_fc_list);
 311	spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
 312}
 313
 314/*
 315 * Mark file system as fast commit ineligible. This means that next commit
 316 * operation would result in a full jbd2 commit.
 317 */
 318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
 319{
 320	struct ext4_sb_info *sbi = EXT4_SB(sb);
 321
 322	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 323	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 
 
 
 
 324		return;
 
 
 
 
 
 
 
 
 
 325
 326	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 327	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 328	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 
 
 
 329}
 330
 331/*
 332 * Start a fast commit ineligible update. Any commits that happen while
 333 * such an operation is in progress fall back to full commits.
 
 334 */
 335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
 336{
 337	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 338
 339	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 340	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 341		return;
 342
 
 
 
 
 
 
 
 
 
 
 
 
 
 343	WARN_ON(reason >= EXT4_FC_REASON_MAX);
 344	sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
 345	atomic_inc(&sbi->s_fc_ineligible_updates);
 346}
 347
 348/*
 349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
 350 * to ensure that after stopping the ineligible update, at least one full
 351 * commit takes place.
 352 */
 353void ext4_fc_stop_ineligible(struct super_block *sb)
 354{
 355	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
 356	    (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
 357		return;
 358
 359	ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
 360	atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
 361}
 362
 363static inline int ext4_fc_is_ineligible(struct super_block *sb)
 364{
 365	return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
 366		atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
 367}
 368
 369/*
 370 * Generic fast commit tracking function. If this is the first time this we are
 371 * called after a full commit, we initialize fast commit fields and then call
 372 * __fc_track_fn() with update = 0. If we have already been called after a full
 373 * commit, we pass update = 1. Based on that, the track function can determine
 374 * if it needs to track a field for the first time or if it needs to just
 375 * update the previously tracked value.
 376 *
 377 * If enqueue is set, this function enqueues the inode in fast commit list.
 378 */
 379static int ext4_fc_track_template(
 380	handle_t *handle, struct inode *inode,
 381	int (*__fc_track_fn)(struct inode *, void *, bool),
 382	void *args, int enqueue)
 383{
 384	bool update = false;
 385	struct ext4_inode_info *ei = EXT4_I(inode);
 386	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 387	tid_t tid = 0;
 388	int ret;
 389
 390	if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
 391	    (sbi->s_mount_state & EXT4_FC_REPLAY))
 392		return -EOPNOTSUPP;
 393
 394	if (ext4_fc_is_ineligible(inode->i_sb))
 395		return -EINVAL;
 396
 397	tid = handle->h_transaction->t_tid;
 398	mutex_lock(&ei->i_fc_lock);
 399	if (tid == ei->i_sync_tid) {
 400		update = true;
 401	} else {
 402		ext4_fc_reset_inode(inode);
 403		ei->i_sync_tid = tid;
 404	}
 405	ret = __fc_track_fn(inode, args, update);
 406	mutex_unlock(&ei->i_fc_lock);
 407
 408	if (!enqueue)
 409		return ret;
 410
 411	spin_lock(&sbi->s_fc_lock);
 412	if (list_empty(&EXT4_I(inode)->i_fc_list))
 413		list_add_tail(&EXT4_I(inode)->i_fc_list,
 414				(ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
 
 415				&sbi->s_fc_q[FC_Q_STAGING] :
 416				&sbi->s_fc_q[FC_Q_MAIN]);
 417	spin_unlock(&sbi->s_fc_lock);
 418
 419	return ret;
 420}
 421
 422struct __track_dentry_update_args {
 423	struct dentry *dentry;
 424	int op;
 425};
 426
 427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
 428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
 429{
 430	struct ext4_fc_dentry_update *node;
 431	struct ext4_inode_info *ei = EXT4_I(inode);
 432	struct __track_dentry_update_args *dentry_update =
 433		(struct __track_dentry_update_args *)arg;
 434	struct dentry *dentry = dentry_update->dentry;
 435	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
 
 436
 437	mutex_unlock(&ei->i_fc_lock);
 
 
 
 
 
 
 
 
 438	node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
 439	if (!node) {
 440		ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
 441		mutex_lock(&ei->i_fc_lock);
 442		return -ENOMEM;
 443	}
 444
 445	node->fcd_op = dentry_update->op;
 446	node->fcd_parent = dentry->d_parent->d_inode->i_ino;
 447	node->fcd_ino = inode->i_ino;
 448	if (dentry->d_name.len > DNAME_INLINE_LEN) {
 449		node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
 450		if (!node->fcd_name.name) {
 451			kmem_cache_free(ext4_fc_dentry_cachep, node);
 452			ext4_fc_mark_ineligible(inode->i_sb,
 453				EXT4_FC_REASON_NOMEM);
 454			mutex_lock(&ei->i_fc_lock);
 455			return -ENOMEM;
 456		}
 457		memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
 458			dentry->d_name.len);
 459	} else {
 460		memcpy(node->fcd_iname, dentry->d_name.name,
 461			dentry->d_name.len);
 462		node->fcd_name.name = node->fcd_iname;
 463	}
 464	node->fcd_name.len = dentry->d_name.len;
 465
 466	spin_lock(&sbi->s_fc_lock);
 467	if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
 
 468		list_add_tail(&node->fcd_list,
 469				&sbi->s_fc_dentry_q[FC_Q_STAGING]);
 470	else
 471		list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 472	spin_unlock(&sbi->s_fc_lock);
 473	mutex_lock(&ei->i_fc_lock);
 474
 475	return 0;
 476}
 477
 478void __ext4_fc_track_unlink(handle_t *handle,
 479		struct inode *inode, struct dentry *dentry)
 480{
 481	struct __track_dentry_update_args args;
 482	int ret;
 483
 484	args.dentry = dentry;
 485	args.op = EXT4_FC_TAG_UNLINK;
 486
 487	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 488					(void *)&args, 0);
 489	trace_ext4_fc_track_unlink(inode, dentry, ret);
 490}
 491
 492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
 493{
 494	__ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 495}
 496
 497void __ext4_fc_track_link(handle_t *handle,
 498	struct inode *inode, struct dentry *dentry)
 499{
 500	struct __track_dentry_update_args args;
 501	int ret;
 502
 503	args.dentry = dentry;
 504	args.op = EXT4_FC_TAG_LINK;
 505
 506	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 507					(void *)&args, 0);
 508	trace_ext4_fc_track_link(inode, dentry, ret);
 509}
 510
 511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
 512{
 513	__ext4_fc_track_link(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 514}
 515
 516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
 517			  struct dentry *dentry)
 518{
 519	struct __track_dentry_update_args args;
 520	int ret;
 521
 522	args.dentry = dentry;
 523	args.op = EXT4_FC_TAG_CREAT;
 524
 525	ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
 526					(void *)&args, 0);
 527	trace_ext4_fc_track_create(inode, dentry, ret);
 528}
 529
 530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
 531{
 532	__ext4_fc_track_create(handle, d_inode(dentry), dentry);
 
 
 
 
 
 
 
 
 533}
 534
 535/* __track_fn for inode tracking */
 536static int __track_inode(struct inode *inode, void *arg, bool update)
 537{
 538	if (update)
 539		return -EEXIST;
 540
 541	EXT4_I(inode)->i_fc_lblk_len = 0;
 542
 543	return 0;
 544}
 545
 546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 547{
 548	int ret;
 549
 550	if (S_ISDIR(inode->i_mode))
 551		return;
 552
 
 
 
 553	if (ext4_should_journal_data(inode)) {
 554		ext4_fc_mark_ineligible(inode->i_sb,
 555					EXT4_FC_REASON_INODE_JOURNAL_DATA);
 556		return;
 557	}
 558
 
 
 
 559	ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
 560	trace_ext4_fc_track_inode(inode, ret);
 561}
 562
 563struct __track_range_args {
 564	ext4_lblk_t start, end;
 565};
 566
 567/* __track_fn for tracking data updates */
 568static int __track_range(struct inode *inode, void *arg, bool update)
 569{
 570	struct ext4_inode_info *ei = EXT4_I(inode);
 571	ext4_lblk_t oldstart;
 572	struct __track_range_args *__arg =
 573		(struct __track_range_args *)arg;
 574
 575	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 576		ext4_debug("Special inode %ld being modified\n", inode->i_ino);
 577		return -ECANCELED;
 578	}
 579
 580	oldstart = ei->i_fc_lblk_start;
 581
 582	if (update && ei->i_fc_lblk_len > 0) {
 583		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
 584		ei->i_fc_lblk_len =
 585			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
 586				ei->i_fc_lblk_start + 1;
 587	} else {
 588		ei->i_fc_lblk_start = __arg->start;
 589		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
 590	}
 591
 592	return 0;
 593}
 594
 595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
 596			 ext4_lblk_t end)
 597{
 598	struct __track_range_args args;
 599	int ret;
 600
 601	if (S_ISDIR(inode->i_mode))
 602		return;
 603
 
 
 
 
 
 
 604	args.start = start;
 605	args.end = end;
 606
 607	ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
 608
 609	trace_ext4_fc_track_range(inode, start, end, ret);
 610}
 611
 612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 613{
 614	int write_flags = REQ_SYNC;
 615	struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
 616
 617	/* Add REQ_FUA | REQ_PREFLUSH only its tail */
 618	if (test_opt(sb, BARRIER) && is_tail)
 619		write_flags |= REQ_FUA | REQ_PREFLUSH;
 620	lock_buffer(bh);
 621	set_buffer_dirty(bh);
 622	set_buffer_uptodate(bh);
 623	bh->b_end_io = ext4_end_buffer_io_sync;
 624	submit_bh(REQ_OP_WRITE, write_flags, bh);
 625	EXT4_SB(sb)->s_fc_bh = NULL;
 626}
 627
 628/* Ext4 commit path routines */
 629
 630/* memzero and update CRC */
 631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
 632				u32 *crc)
 633{
 634	void *ret;
 635
 636	ret = memset(dst, 0, len);
 637	if (crc)
 638		*crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
 639	return ret;
 640}
 641
 642/*
 643 * Allocate len bytes on a fast commit buffer.
 644 *
 645 * During the commit time this function is used to manage fast commit
 646 * block space. We don't split a fast commit log onto different
 647 * blocks. So this function makes sure that if there's not enough space
 648 * on the current block, the remaining space in the current block is
 649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
 650 * new block is from jbd2 and CRC is updated to reflect the padding
 651 * we added.
 652 */
 653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
 654{
 655	struct ext4_fc_tl *tl;
 656	struct ext4_sb_info *sbi = EXT4_SB(sb);
 657	struct buffer_head *bh;
 658	int bsize = sbi->s_journal->j_blocksize;
 659	int ret, off = sbi->s_fc_bytes % bsize;
 660	int pad_len;
 
 661
 662	/*
 663	 * After allocating len, we should have space at least for a 0 byte
 664	 * padding.
 665	 */
 666	if (len + sizeof(struct ext4_fc_tl) > bsize)
 667		return NULL;
 668
 669	if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
 670		/*
 671		 * Only allocate from current buffer if we have enough space for
 672		 * this request AND we have space to add a zero byte padding.
 673		 */
 674		if (!sbi->s_fc_bh) {
 675			ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 676			if (ret)
 677				return NULL;
 678			sbi->s_fc_bh = bh;
 679		}
 
 
 
 680		sbi->s_fc_bytes += len;
 681		return sbi->s_fc_bh->b_data + off;
 682	}
 683	/* Need to add PAD tag */
 684	tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
 685	tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
 686	pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
 687	tl->fc_len = cpu_to_le16(pad_len);
 688	if (crc)
 689		*crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
 690	if (pad_len > 0)
 691		ext4_fc_memzero(sb, tl + 1, pad_len, crc);
 
 
 
 692	ext4_fc_submit_bh(sb, false);
 693
 694	ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
 695	if (ret)
 696		return NULL;
 697	sbi->s_fc_bh = bh;
 698	sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
 699	return sbi->s_fc_bh->b_data;
 700}
 701
 702/* memcpy to fc reserved space and update CRC */
 703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
 704				int len, u32 *crc)
 705{
 706	if (crc)
 707		*crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
 708	return memcpy(dst, src, len);
 709}
 710
 711/*
 712 * Complete a fast commit by writing tail tag.
 713 *
 714 * Writing tail tag marks the end of a fast commit. In order to guarantee
 715 * atomicity, after writing tail tag, even if there's space remaining
 716 * in the block, next commit shouldn't use it. That's why tail tag
 717 * has the length as that of the remaining space on the block.
 718 */
 719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
 720{
 721	struct ext4_sb_info *sbi = EXT4_SB(sb);
 722	struct ext4_fc_tl tl;
 723	struct ext4_fc_tail tail;
 724	int off, bsize = sbi->s_journal->j_blocksize;
 725	u8 *dst;
 726
 727	/*
 728	 * ext4_fc_reserve_space takes care of allocating an extra block if
 729	 * there's no enough space on this block for accommodating this tail.
 730	 */
 731	dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
 732	if (!dst)
 733		return -ENOSPC;
 734
 735	off = sbi->s_fc_bytes % bsize;
 736
 737	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
 738	tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
 739	sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
 740
 741	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
 742	dst += sizeof(tl);
 743	tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
 744	ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
 745	dst += sizeof(tail.fc_tid);
 
 
 746	tail.fc_crc = cpu_to_le32(crc);
 747	ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
 
 
 748
 749	ext4_fc_submit_bh(sb, true);
 750
 751	return 0;
 752}
 753
 754/*
 755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
 756 * Returns false if there's not enough space.
 757 */
 758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
 759			   u32 *crc)
 760{
 761	struct ext4_fc_tl tl;
 762	u8 *dst;
 763
 764	dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
 765	if (!dst)
 766		return false;
 767
 768	tl.fc_tag = cpu_to_le16(tag);
 769	tl.fc_len = cpu_to_le16(len);
 770
 771	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 772	ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
 773
 774	return true;
 775}
 776
 777/* Same as above, but adds dentry tlv. */
 778static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
 779					int parent_ino, int ino, int dlen,
 780					const unsigned char *dname,
 781					u32 *crc)
 782{
 783	struct ext4_fc_dentry_info fcd;
 784	struct ext4_fc_tl tl;
 785	u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
 786					crc);
 
 787
 788	if (!dst)
 789		return false;
 790
 791	fcd.fc_parent_ino = cpu_to_le32(parent_ino);
 792	fcd.fc_ino = cpu_to_le32(ino);
 793	tl.fc_tag = cpu_to_le16(tag);
 794	tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
 795	ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
 796	dst += sizeof(tl);
 797	ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
 798	dst += sizeof(fcd);
 799	ext4_fc_memcpy(sb, dst, dname, dlen, crc);
 800	dst += dlen;
 801
 802	return true;
 803}
 804
 805/*
 806 * Writes inode in the fast commit space under TLV with tag @tag.
 807 * Returns 0 on success, error on failure.
 808 */
 809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 810{
 811	struct ext4_inode_info *ei = EXT4_I(inode);
 812	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 813	int ret;
 814	struct ext4_iloc iloc;
 815	struct ext4_fc_inode fc_inode;
 816	struct ext4_fc_tl tl;
 817	u8 *dst;
 818
 819	ret = ext4_get_inode_loc(inode, &iloc);
 820	if (ret)
 821		return ret;
 822
 823	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
 
 
 824		inode_len += ei->i_extra_isize;
 825
 826	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 827	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
 828	tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
 829
 
 830	dst = ext4_fc_reserve_space(inode->i_sb,
 831			sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
 832	if (!dst)
 833		return -ECANCELED;
 834
 835	if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
 836		return -ECANCELED;
 837	dst += sizeof(tl);
 838	if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
 839		return -ECANCELED;
 840	dst += sizeof(fc_inode);
 841	if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
 842					inode_len, crc))
 843		return -ECANCELED;
 844
 845	return 0;
 846}
 847
 848/*
 849 * Writes updated data ranges for the inode in question. Updates CRC.
 850 * Returns 0 on success, error otherwise.
 851 */
 852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 853{
 854	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 855	struct ext4_inode_info *ei = EXT4_I(inode);
 856	struct ext4_map_blocks map;
 857	struct ext4_fc_add_range fc_ext;
 858	struct ext4_fc_del_range lrange;
 859	struct ext4_extent *ex;
 860	int ret;
 861
 862	mutex_lock(&ei->i_fc_lock);
 863	if (ei->i_fc_lblk_len == 0) {
 864		mutex_unlock(&ei->i_fc_lock);
 865		return 0;
 866	}
 867	old_blk_size = ei->i_fc_lblk_start;
 868	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 869	ei->i_fc_lblk_len = 0;
 870	mutex_unlock(&ei->i_fc_lock);
 871
 872	cur_lblk_off = old_blk_size;
 873	jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
 874		  __func__, cur_lblk_off, new_blk_size, inode->i_ino);
 875
 876	while (cur_lblk_off <= new_blk_size) {
 877		map.m_lblk = cur_lblk_off;
 878		map.m_len = new_blk_size - cur_lblk_off + 1;
 879		ret = ext4_map_blocks(NULL, inode, &map, 0);
 880		if (ret < 0)
 881			return -ECANCELED;
 882
 883		if (map.m_len == 0) {
 884			cur_lblk_off++;
 885			continue;
 886		}
 887
 888		if (ret == 0) {
 889			lrange.fc_ino = cpu_to_le32(inode->i_ino);
 890			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
 891			lrange.fc_len = cpu_to_le32(map.m_len);
 892			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
 893					    sizeof(lrange), (u8 *)&lrange, crc))
 894				return -ENOSPC;
 895		} else {
 896			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 897				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
 898
 899			/* Limit the number of blocks in one extent */
 900			map.m_len = min(max, map.m_len);
 901
 902			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
 903			ex = (struct ext4_extent *)&fc_ext.fc_ex;
 904			ex->ee_block = cpu_to_le32(map.m_lblk);
 905			ex->ee_len = cpu_to_le16(map.m_len);
 906			ext4_ext_store_pblock(ex, map.m_pblk);
 907			if (map.m_flags & EXT4_MAP_UNWRITTEN)
 908				ext4_ext_mark_unwritten(ex);
 909			else
 910				ext4_ext_mark_initialized(ex);
 911			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
 912					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
 913				return -ENOSPC;
 914		}
 915
 916		cur_lblk_off += map.m_len;
 917	}
 918
 919	return 0;
 920}
 921
 922
 923/* Submit data for all the fast commit inodes */
 924static int ext4_fc_submit_inode_data_all(journal_t *journal)
 925{
 926	struct super_block *sb = (struct super_block *)(journal->j_private);
 927	struct ext4_sb_info *sbi = EXT4_SB(sb);
 928	struct ext4_inode_info *ei;
 929	int ret = 0;
 930
 931	spin_lock(&sbi->s_fc_lock);
 932	ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
 933	list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 934		ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
 935		while (atomic_read(&ei->i_fc_updates)) {
 936			DEFINE_WAIT(wait);
 937
 938			prepare_to_wait(&ei->i_fc_wait, &wait,
 939						TASK_UNINTERRUPTIBLE);
 940			if (atomic_read(&ei->i_fc_updates)) {
 941				spin_unlock(&sbi->s_fc_lock);
 942				schedule();
 943				spin_lock(&sbi->s_fc_lock);
 944			}
 945			finish_wait(&ei->i_fc_wait, &wait);
 946		}
 947		spin_unlock(&sbi->s_fc_lock);
 948		ret = jbd2_submit_inode_data(ei->jinode);
 949		if (ret)
 950			return ret;
 951		spin_lock(&sbi->s_fc_lock);
 952	}
 953	spin_unlock(&sbi->s_fc_lock);
 954
 955	return ret;
 956}
 957
 958/* Wait for completion of data for all the fast commit inodes */
 959static int ext4_fc_wait_inode_data_all(journal_t *journal)
 960{
 961	struct super_block *sb = (struct super_block *)(journal->j_private);
 962	struct ext4_sb_info *sbi = EXT4_SB(sb);
 963	struct ext4_inode_info *pos, *n;
 964	int ret = 0;
 965
 966	spin_lock(&sbi->s_fc_lock);
 967	list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 968		if (!ext4_test_inode_state(&pos->vfs_inode,
 969					   EXT4_STATE_FC_COMMITTING))
 970			continue;
 971		spin_unlock(&sbi->s_fc_lock);
 972
 973		ret = jbd2_wait_inode_data(journal, pos->jinode);
 974		if (ret)
 975			return ret;
 976		spin_lock(&sbi->s_fc_lock);
 977	}
 978	spin_unlock(&sbi->s_fc_lock);
 979
 980	return 0;
 981}
 982
 983/* Commit all the directory entry updates */
 984static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 985__acquires(&sbi->s_fc_lock)
 986__releases(&sbi->s_fc_lock)
 987{
 988	struct super_block *sb = (struct super_block *)(journal->j_private);
 989	struct ext4_sb_info *sbi = EXT4_SB(sb);
 990	struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
 991	struct inode *inode;
 992	struct ext4_inode_info *ei, *ei_n;
 993	int ret;
 994
 995	if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
 996		return 0;
 997	list_for_each_entry_safe(fc_dentry, fc_dentry_n,
 998				 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 999		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1000			spin_unlock(&sbi->s_fc_lock);
1001			if (!ext4_fc_add_dentry_tlv(
1002				sb, fc_dentry->fcd_op,
1003				fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1004				fc_dentry->fcd_name.len,
1005				fc_dentry->fcd_name.name, crc)) {
1006				ret = -ENOSPC;
1007				goto lock_and_exit;
1008			}
1009			spin_lock(&sbi->s_fc_lock);
1010			continue;
1011		}
1012
1013		inode = NULL;
1014		list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1015					 i_fc_list) {
1016			if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1017				inode = &ei->vfs_inode;
1018				break;
1019			}
1020		}
1021		/*
1022		 * If we don't find inode in our list, then it was deleted,
1023		 * in which case, we don't need to record it's create tag.
1024		 */
1025		if (!inode)
1026			continue;
 
 
 
 
1027		spin_unlock(&sbi->s_fc_lock);
1028
1029		/*
1030		 * We first write the inode and then the create dirent. This
1031		 * allows the recovery code to create an unnamed inode first
1032		 * and then link it to a directory entry. This allows us
1033		 * to use namei.c routines almost as is and simplifies
1034		 * the recovery code.
1035		 */
1036		ret = ext4_fc_write_inode(inode, crc);
1037		if (ret)
1038			goto lock_and_exit;
1039
1040		ret = ext4_fc_write_inode_data(inode, crc);
1041		if (ret)
1042			goto lock_and_exit;
1043
1044		if (!ext4_fc_add_dentry_tlv(
1045			sb, fc_dentry->fcd_op,
1046			fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1047			fc_dentry->fcd_name.len,
1048			fc_dentry->fcd_name.name, crc)) {
1049			ret = -ENOSPC;
1050			goto lock_and_exit;
1051		}
1052
1053		spin_lock(&sbi->s_fc_lock);
1054	}
1055	return 0;
1056lock_and_exit:
1057	spin_lock(&sbi->s_fc_lock);
1058	return ret;
1059}
1060
1061static int ext4_fc_perform_commit(journal_t *journal)
1062{
1063	struct super_block *sb = (struct super_block *)(journal->j_private);
1064	struct ext4_sb_info *sbi = EXT4_SB(sb);
1065	struct ext4_inode_info *iter;
1066	struct ext4_fc_head head;
1067	struct inode *inode;
1068	struct blk_plug plug;
1069	int ret = 0;
1070	u32 crc = 0;
1071
1072	ret = ext4_fc_submit_inode_data_all(journal);
1073	if (ret)
1074		return ret;
1075
1076	ret = ext4_fc_wait_inode_data_all(journal);
1077	if (ret)
1078		return ret;
1079
1080	/*
1081	 * If file system device is different from journal device, issue a cache
1082	 * flush before we start writing fast commit blocks.
1083	 */
1084	if (journal->j_fs_dev != journal->j_dev)
1085		blkdev_issue_flush(journal->j_fs_dev);
1086
1087	blk_start_plug(&plug);
1088	if (sbi->s_fc_bytes == 0) {
1089		/*
1090		 * Add a head tag only if this is the first fast commit
1091		 * in this TID.
1092		 */
1093		head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1094		head.fc_tid = cpu_to_le32(
1095			sbi->s_journal->j_running_transaction->t_tid);
1096		if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1097			(u8 *)&head, &crc)) {
1098			ret = -ENOSPC;
1099			goto out;
1100		}
1101	}
1102
1103	spin_lock(&sbi->s_fc_lock);
1104	ret = ext4_fc_commit_dentry_updates(journal, &crc);
1105	if (ret) {
1106		spin_unlock(&sbi->s_fc_lock);
1107		goto out;
1108	}
1109
1110	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1111		inode = &iter->vfs_inode;
1112		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1113			continue;
1114
1115		spin_unlock(&sbi->s_fc_lock);
1116		ret = ext4_fc_write_inode_data(inode, &crc);
1117		if (ret)
1118			goto out;
1119		ret = ext4_fc_write_inode(inode, &crc);
1120		if (ret)
1121			goto out;
1122		spin_lock(&sbi->s_fc_lock);
1123	}
1124	spin_unlock(&sbi->s_fc_lock);
1125
1126	ret = ext4_fc_write_tail(sb, crc);
1127
1128out:
1129	blk_finish_plug(&plug);
1130	return ret;
1131}
1132
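/*
 * Illustrative layout (a sketch, assuming a single fast commit in the
 * running transaction) of what ext4_fc_perform_commit() appends to the
 * fast commit area:
 *
 *	EXT4_FC_TAG_HEAD	(only for the first fast commit in a TID)
 *	<dentry TLVs>		(via ext4_fc_commit_dentry_updates())
 *	<inode / range TLVs>	(for every inode in FC_COMMITTING state)
 *	EXT4_FC_TAG_TAIL	(TID + CRC, via ext4_fc_write_tail())
 *
 * Recovery treats the commit as valid only if the tail's TID and CRC
 * match; see ext4_fc_replay_scan().
 */
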
/*
 * The main commit entry point. Performs a fast commit for transaction
 * commit_tid if needed. If a fast commit is not possible (for example,
 * because an ineligible update occurred since the last commit), we fall
 * back to a full commit. Returns 0 on success, an error otherwise.
 */
1139int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1140{
1141	struct super_block *sb = (struct super_block *)(journal->j_private);
1142	struct ext4_sb_info *sbi = EXT4_SB(sb);
1143	int nblks = 0, ret, bsize = journal->j_blocksize;
1144	int subtid = atomic_read(&sbi->s_fc_subtid);
1145	int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1146	ktime_t start_time, commit_time;
1147
1148	trace_ext4_fc_commit_start(sb);
1149
1150	start_time = ktime_get();
1151
1152	if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1153		(ext4_fc_is_ineligible(sb))) {
1154		reason = EXT4_FC_REASON_INELIGIBLE;
1155		goto out;
1156	}
1157
1158restart_fc:
1159	ret = jbd2_fc_begin_commit(journal, commit_tid);
1160	if (ret == -EALREADY) {
1161		/* There was an ongoing commit, check if we need to restart */
1162		if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1163			commit_tid > journal->j_commit_sequence)
1164			goto restart_fc;
1165		reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1166		goto out;
1167	} else if (ret) {
1168		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169		reason = EXT4_FC_REASON_FC_START_FAILED;
1170		goto out;
1171	}
1172
1173	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1174	ret = ext4_fc_perform_commit(journal);
1175	if (ret < 0) {
1176		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177		reason = EXT4_FC_REASON_FC_FAILED;
1178		goto out;
1179	}
1180	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181	ret = jbd2_fc_wait_bufs(journal, nblks);
1182	if (ret < 0) {
1183		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184		reason = EXT4_FC_REASON_FC_FAILED;
1185		goto out;
1186	}
1187	atomic_inc(&sbi->s_fc_subtid);
1188	jbd2_fc_end_commit(journal);
1189out:
1190	/* Has any ineligible update happened since we started? */
1191	if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1192		sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1193		reason = EXT4_FC_REASON_INELIGIBLE;
1194	}
1195
1196	spin_lock(&sbi->s_fc_lock);
1197	if (reason != EXT4_FC_REASON_OK &&
1198		reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1199		sbi->s_fc_stats.fc_ineligible_commits++;
1200	} else {
1201		sbi->s_fc_stats.fc_num_commits++;
1202		sbi->s_fc_stats.fc_numblks += nblks;
1203	}
1204	spin_unlock(&sbi->s_fc_lock);
1205	nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1206	trace_ext4_fc_commit_stop(sb, nblks, reason);
1207	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	/*
	 * Weight the historical average higher than the new commit time,
	 * i.e. avg' = (commit_time + 3 * avg) / 4, so we don't react too
	 * strongly to vast changes in the commit time.
	 */
1212	if (likely(sbi->s_fc_avg_commit_time))
1213		sbi->s_fc_avg_commit_time = (commit_time +
1214				sbi->s_fc_avg_commit_time * 3) / 4;
1215	else
1216		sbi->s_fc_avg_commit_time = commit_time;
	jbd_debug(1,
		"Fast commit ended with blks = %d, reason = %d, subtid = %d",
		nblks, reason, subtid);
1220	if (reason == EXT4_FC_REASON_FC_FAILED)
1221		return jbd2_fc_end_commit_fallback(journal);
1222	if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1223		reason == EXT4_FC_REASON_INELIGIBLE)
1224		return jbd2_complete_transaction(journal, commit_tid);
1225	return 0;
1226}
1227
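/*
 * A minimal sketch of a caller, assuming the fsync path wants to fast
 * commit an inode's sync tid. The real call site lives in the fsync path
 * (fs/ext4/fsync.c); this block is illustrative only and is not compiled.
 */
#if 0
static int example_fsync_fast_commit(struct inode *inode)
{
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

	/* Falls back to a full commit internally when ineligible. */
	return ext4_fc_commit(journal, EXT4_I(inode)->i_sync_tid);
}
#endif
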
1228/*
1229 * Fast commit cleanup routine. This is called after every fast commit and
1230 * full commit. full is true if we are called after a full commit.
1231 */
1232static void ext4_fc_cleanup(journal_t *journal, int full)
1233{
1234	struct super_block *sb = journal->j_private;
1235	struct ext4_sb_info *sbi = EXT4_SB(sb);
1236	struct ext4_inode_info *iter, *iter_n;
1237	struct ext4_fc_dentry_update *fc_dentry;
1238
1239	if (full && sbi->s_fc_bh)
1240		sbi->s_fc_bh = NULL;
1241
1242	jbd2_fc_release_bufs(journal);
1243
1244	spin_lock(&sbi->s_fc_lock);
1245	list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1246				 i_fc_list) {
1247		list_del_init(&iter->i_fc_list);
1248		ext4_clear_inode_state(&iter->vfs_inode,
1249				       EXT4_STATE_FC_COMMITTING);
1250		ext4_fc_reset_inode(&iter->vfs_inode);
		/*
		 * Make sure the clearing of EXT4_STATE_FC_COMMITTING is
		 * visible before we wake up any waiters.
		 */
1252		smp_mb();
1253#if (BITS_PER_LONG < 64)
1254		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1255#else
1256		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1257#endif
1258	}
1259
1260	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1261		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1262					     struct ext4_fc_dentry_update,
1263					     fcd_list);
1264		list_del_init(&fc_dentry->fcd_list);
1265		spin_unlock(&sbi->s_fc_lock);
1266
1267		if (fc_dentry->fcd_name.name &&
1268			fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1269			kfree(fc_dentry->fcd_name.name);
1270		kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1271		spin_lock(&sbi->s_fc_lock);
1272	}
1273
1274	list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1275				&sbi->s_fc_dentry_q[FC_Q_MAIN]);
1276	list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1277				&sbi->s_fc_q[FC_Q_MAIN]);
1278
1279	ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1280	ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1281
1282	if (full)
1283		sbi->s_fc_bytes = 0;
1284	spin_unlock(&sbi->s_fc_lock);
1285	trace_ext4_fc_stats(sb);
1286}
1287
1288/* Ext4 Replay Path Routines */
1289
1290/* Helper struct for dentry replay routines */
1291struct dentry_info_args {
1292	int parent_ino, dname_len, ino, inode_len;
1293	char *dname;
1294};
1295
1296static inline void tl_to_darg(struct dentry_info_args *darg,
1297			      struct  ext4_fc_tl *tl, u8 *val)
1298{
1299	struct ext4_fc_dentry_info fcd;
1300
1301	memcpy(&fcd, val, sizeof(fcd));
1302
1303	darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1304	darg->ino = le32_to_cpu(fcd.fc_ino);
1305	darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1306	darg->dname_len = le16_to_cpu(tl->fc_len) -
1307		sizeof(struct ext4_fc_dentry_info);
1308}
1309
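/*
 * Illustrative on-disk layout of a dentry TLV, a sketch inferred from the
 * parsing above (see struct ext4_fc_tl and struct ext4_fc_dentry_info):
 *
 *	+--------+--------+---------------+--------+-------------+
 *	| fc_tag | fc_len | fc_parent_ino | fc_ino | fc_dname[]  |
 *	+--------+--------+---------------+--------+-------------+
 *	<- struct ext4_fc_tl -><-- struct ext4_fc_dentry_info -->
 *
 * fc_len counts everything after the TL header, which is why the name
 * length above is fc_len - sizeof(struct ext4_fc_dentry_info).
 */
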
1310/* Unlink replay function */
1311static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1312				 u8 *val)
1313{
1314	struct inode *inode, *old_parent;
1315	struct qstr entry;
1316	struct dentry_info_args darg;
1317	int ret = 0;
1318
1319	tl_to_darg(&darg, tl, val);
1320
1321	trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1322			darg.parent_ino, darg.dname_len);
1323
1324	entry.name = darg.dname;
1325	entry.len = darg.dname_len;
1326	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1327
1328	if (IS_ERR(inode)) {
1329		jbd_debug(1, "Inode %d not found", darg.ino);
1330		return 0;
1331	}
1332
1333	old_parent = ext4_iget(sb, darg.parent_ino,
1334				EXT4_IGET_NORMAL);
1335	if (IS_ERR(old_parent)) {
		jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1337		iput(inode);
1338		return 0;
1339	}
1340
1341	ret = __ext4_unlink(NULL, old_parent, &entry, inode);
	/* -ENOENT is OK because the entry might not exist anymore. */
1343	if (ret == -ENOENT)
1344		ret = 0;
1345	iput(old_parent);
1346	iput(inode);
1347	return ret;
1348}
1349
1350static int ext4_fc_replay_link_internal(struct super_block *sb,
1351				struct dentry_info_args *darg,
1352				struct inode *inode)
1353{
1354	struct inode *dir = NULL;
1355	struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1356	struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1357	int ret = 0;
1358
1359	dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1360	if (IS_ERR(dir)) {
1361		jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1362		dir = NULL;
1363		goto out;
1364	}
1365
1366	dentry_dir = d_obtain_alias(dir);
1367	if (IS_ERR(dentry_dir)) {
1368		jbd_debug(1, "Failed to obtain dentry");
1369		dentry_dir = NULL;
1370		goto out;
1371	}
1372
1373	dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1374	if (!dentry_inode) {
1375		jbd_debug(1, "Inode dentry not created.");
1376		ret = -ENOMEM;
1377		goto out;
1378	}
1379
1380	ret = __ext4_link(dir, inode, dentry_inode);
	/*
	 * It's possible that the link already existed, either because the
	 * data blocks for the dir in question got persisted before we
	 * crashed, or because we replayed this tag and then crashed before
	 * the entire replay could complete.
	 */
1387	if (ret && ret != -EEXIST) {
1388		jbd_debug(1, "Failed to link\n");
1389		goto out;
1390	}
1391
1392	ret = 0;
1393out:
1394	if (dentry_dir) {
1395		d_drop(dentry_dir);
1396		dput(dentry_dir);
1397	} else if (dir) {
1398		iput(dir);
1399	}
1400	if (dentry_inode) {
1401		d_drop(dentry_inode);
1402		dput(dentry_inode);
1403	}
1404
1405	return ret;
1406}
1407
1408/* Link replay function */
1409static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1410			       u8 *val)
1411{
1412	struct inode *inode;
1413	struct dentry_info_args darg;
1414	int ret = 0;
1415
1416	tl_to_darg(&darg, tl, val);
1417	trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1418			darg.parent_ino, darg.dname_len);
1419
1420	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421	if (IS_ERR(inode)) {
1422		jbd_debug(1, "Inode not found.");
1423		return 0;
1424	}
1425
1426	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1427	iput(inode);
1428	return ret;
1429}
1430
/*
 * Record all the inodes modified during replay. We use this later to set up
 * the block bitmaps correctly.
 */
1435static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1436{
1437	struct ext4_fc_replay_state *state;
1438	int i;
1439
1440	state = &EXT4_SB(sb)->s_fc_replay_state;
1441	for (i = 0; i < state->fc_modified_inodes_used; i++)
1442		if (state->fc_modified_inodes[i] == ino)
1443			return 0;
	if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
		int *fc_modified_inodes;

		/*
		 * Grow through a temporary so the old array isn't leaked
		 * if krealloc() fails.
		 */
		fc_modified_inodes = krealloc(state->fc_modified_inodes,
				sizeof(int) * (state->fc_modified_inodes_size +
				EXT4_FC_REPLAY_REALLOC_INCREMENT),
				GFP_KERNEL);
		if (!fc_modified_inodes)
			return -ENOMEM;
		state->fc_modified_inodes = fc_modified_inodes;
		state->fc_modified_inodes_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
1454	state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1455	return 0;
1456}
1457
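/*
 * Example (illustrative): replaying an ADD_RANGE tag for inode 12 records
 * 12 in this array. Once all tags are replayed,
 * ext4_fc_set_bitmaps_and_counters() walks the array and marks every
 * block still referenced by inode 12 as in-use in the block bitmaps.
 */
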
1458/*
1459 * Inode replay function
1460 */
1461static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1462				u8 *val)
1463{
1464	struct ext4_fc_inode fc_inode;
1465	struct ext4_inode *raw_inode;
1466	struct ext4_inode *raw_fc_inode;
1467	struct inode *inode = NULL;
1468	struct ext4_iloc iloc;
1469	int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1470	struct ext4_extent_header *eh;
1471
1472	memcpy(&fc_inode, val, sizeof(fc_inode));
1473
1474	ino = le32_to_cpu(fc_inode.fc_ino);
1475	trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1476
1477	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1478	if (!IS_ERR(inode)) {
1479		ext4_ext_clear_bb(inode);
1480		iput(inode);
1481	}
1482	inode = NULL;
1483
	ret = ext4_fc_record_modified_inode(sb, ino);
	if (ret)
		goto out;
1485
1486	raw_fc_inode = (struct ext4_inode *)
1487		(val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1488	ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1489	if (ret)
1490		goto out;
1491
1492	inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1493	raw_inode = ext4_raw_inode(&iloc);
1494
1495	memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1496	memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1497		inode_len - offsetof(struct ext4_inode, i_generation));
1498	if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1499		eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1500		if (eh->eh_magic != EXT4_EXT_MAGIC) {
1501			memset(eh, 0, sizeof(*eh));
1502			eh->eh_magic = EXT4_EXT_MAGIC;
1503			eh->eh_max = cpu_to_le16(
1504				(sizeof(raw_inode->i_block) -
1505				 sizeof(struct ext4_extent_header))
1506				 / sizeof(struct ext4_extent));
1507		}
1508	} else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1509		memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1510			sizeof(raw_inode->i_block));
1511	}
1512
1513	/* Immediately update the inode on disk. */
1514	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515	if (ret)
1516		goto out;
1517	ret = sync_dirty_buffer(iloc.bh);
1518	if (ret)
1519		goto out;
1520	ret = ext4_mark_inode_used(sb, ino);
1521	if (ret)
1522		goto out;
1523
1524	/* Given that we just wrote the inode on disk, this SHOULD succeed. */
1525	inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1526	if (IS_ERR(inode)) {
1527		jbd_debug(1, "Inode not found.");
1528		return -EFSCORRUPTED;
1529	}
1530
	/*
	 * Our allocator could have made different decisions than before
	 * crashing. This should be fixed but until then, we calculate
	 * the number of blocks the inode occupies here.
	 */
	ext4_ext_replay_set_iblocks(inode);
1537
1538	inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1539	ext4_reset_inode_seed(inode);
1540
1541	ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1542	ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1543	sync_dirty_buffer(iloc.bh);
1544	brelse(iloc.bh);
1545out:
1546	iput(inode);
1547	if (!ret)
1548		blkdev_issue_flush(sb->s_bdev);
1549
1550	return 0;
1551}
1552
/*
 * Dentry create replay function.
 *
 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means that the
 * inode for which we are trying to create a dentry here should already
 * have been replayed before we get here.
 */
1560static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1561				 u8 *val)
1562{
1563	int ret = 0;
1564	struct inode *inode = NULL;
1565	struct inode *dir = NULL;
1566	struct dentry_info_args darg;
1567
1568	tl_to_darg(&darg, tl, val);
1569
1570	trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571			darg.parent_ino, darg.dname_len);
1572
	/* This takes care of updating the group descriptor and other metadata */
1574	ret = ext4_mark_inode_used(sb, darg.ino);
1575	if (ret)
1576		goto out;
1577
1578	inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579	if (IS_ERR(inode)) {
1580		jbd_debug(1, "inode %d not found.", darg.ino);
1581		inode = NULL;
1582		ret = -EINVAL;
1583		goto out;
1584	}
1585
1586	if (S_ISDIR(inode->i_mode)) {
		/*
		 * If we are creating a directory, we need to make sure that
		 * the dot and dot dot dirents are set up properly.
		 */
1591		dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592		if (IS_ERR(dir)) {
			jbd_debug(1, "Dir %d not found.", darg.parent_ino);
1594			goto out;
1595		}
1596		ret = ext4_init_new_dir(NULL, dir, inode);
1597		iput(dir);
1598		if (ret) {
1599			ret = 0;
1600			goto out;
1601		}
1602	}
1603	ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604	if (ret)
1605		goto out;
1606	set_nlink(inode, 1);
1607	ext4_mark_inode_dirty(NULL, inode);
1608out:
1609	if (inode)
1610		iput(inode);
1611	return ret;
1612}
1613
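/*
 * Illustrative replay sequence for a directory created in the fast commit
 * area (a sketch of the tag ordering described in the comment above):
 *
 *	EXT4_FC_TAG_INODE -> ext4_fc_replay_inode() writes the raw inode
 *	EXT4_FC_TAG_CREAT -> ext4_fc_replay_create() marks the inode as
 *			     used, initializes "." and ".." via
 *			     ext4_init_new_dir() and links the inode under
 *			     its parent.
 */
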
/*
 * Record physical disk regions which are in use, as per the fast commit area.
 * Our simple replay-phase allocator excludes these regions from allocation.
 */
1618static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619		ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620{
1621	struct ext4_fc_replay_state *state;
1622	struct ext4_fc_alloc_region *region;
1623
1624	state = &EXT4_SB(sb)->s_fc_replay_state;
	if (state->fc_regions_used == state->fc_regions_size) {
		struct ext4_fc_alloc_region *fc_regions;

		/*
		 * Grow through a temporary so the old array isn't leaked
		 * if krealloc() fails.
		 */
		fc_regions = krealloc(state->fc_regions,
				      sizeof(struct ext4_fc_alloc_region) *
				      (state->fc_regions_size +
				       EXT4_FC_REPLAY_REALLOC_INCREMENT),
				      GFP_KERNEL);
		if (!fc_regions)
			return -ENOMEM;
		state->fc_regions = fc_regions;
		state->fc_regions_size +=
			EXT4_FC_REPLAY_REALLOC_INCREMENT;
	}
1636	region = &state->fc_regions[state->fc_regions_used++];
1637	region->ino = ino;
1638	region->lblk = lblk;
1639	region->pblk = pblk;
1640	region->len = len;
1641
1642	return 0;
1643}
1644
1645/* Replay add range tag */
1646static int ext4_fc_replay_add_range(struct super_block *sb,
1647				    struct ext4_fc_tl *tl, u8 *val)
1648{
1649	struct ext4_fc_add_range fc_add_ex;
1650	struct ext4_extent newex, *ex;
1651	struct inode *inode;
1652	ext4_lblk_t start, cur;
1653	int remaining, len;
1654	ext4_fsblk_t start_pblk;
1655	struct ext4_map_blocks map;
1656	struct ext4_ext_path *path = NULL;
1657	int ret;
1658
1659	memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1660	ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1661
1662	trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663		le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1664		ext4_ext_get_actual_len(ex));
1665
1666	inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1667	if (IS_ERR(inode)) {
1668		jbd_debug(1, "Inode not found.");
1669		return 0;
1670	}
1671
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret) {
		iput(inode);
		return 0;
	}
1673
1674	start = le32_to_cpu(ex->ee_block);
1675	start_pblk = ext4_ext_pblock(ex);
1676	len = ext4_ext_get_actual_len(ex);
1677
1678	cur = start;
1679	remaining = len;
	jbd_debug(1, "ADD_RANGE, lblk %u, pblk %llu, len %d, unwritten %d, inode %lu\n",
		  start, start_pblk, len, ext4_ext_is_unwritten(ex),
		  inode->i_ino);
1683
1684	while (remaining > 0) {
1685		map.m_lblk = cur;
1686		map.m_len = remaining;
1687		map.m_pblk = 0;
1688		ret = ext4_map_blocks(NULL, inode, &map, 0);
1689
1690		if (ret < 0) {
1691			iput(inode);
1692			return 0;
1693		}
1694
1695		if (ret == 0) {
1696			/* Range is not mapped */
1697			path = ext4_find_extent(inode, cur, NULL, 0);
1698			if (IS_ERR(path)) {
1699				iput(inode);
1700				return 0;
1701			}
1702			memset(&newex, 0, sizeof(newex));
1703			newex.ee_block = cpu_to_le32(cur);
1704			ext4_ext_store_pblock(
1705				&newex, start_pblk + cur - start);
1706			newex.ee_len = cpu_to_le16(map.m_len);
1707			if (ext4_ext_is_unwritten(ex))
1708				ext4_ext_mark_unwritten(&newex);
1709			down_write(&EXT4_I(inode)->i_data_sem);
1710			ret = ext4_ext_insert_extent(
1711				NULL, inode, &path, &newex, 0);
1712			up_write((&EXT4_I(inode)->i_data_sem));
1713			ext4_ext_drop_refs(path);
1714			kfree(path);
1715			if (ret) {
1716				iput(inode);
1717				return 0;
1718			}
1719			goto next;
1720		}
1721
1722		if (start_pblk + cur - start != map.m_pblk) {
1723			/*
1724			 * Logical to physical mapping changed. This can happen
1725			 * if this range was removed and then reallocated to
1726			 * map to new physical blocks during a fast commit.
1727			 */
1728			ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729					ext4_ext_is_unwritten(ex),
1730					start_pblk + cur - start);
1731			if (ret) {
1732				iput(inode);
1733				return 0;
1734			}
1735			/*
1736			 * Mark the old blocks as free since they aren't used
1737			 * anymore. We maintain an array of all the modified
1738			 * inodes. In case these blocks are still used at either
1739			 * a different logical range in the same inode or in
1740			 * some different inode, we will mark them as allocated
1741			 * at the end of the FC replay using our array of
1742			 * modified inodes.
1743			 */
1744			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1745			goto next;
1746		}
1747
1748		/* Range is mapped and needs a state change */
		jbd_debug(1, "Converting from %u to %d %llu",
			  map.m_flags & EXT4_MAP_UNWRITTEN,
			  ext4_ext_is_unwritten(ex), map.m_pblk);
1752		ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1753					ext4_ext_is_unwritten(ex), map.m_pblk);
1754		if (ret) {
1755			iput(inode);
1756			return 0;
1757		}
1758		/*
1759		 * We may have split the extent tree while toggling the state.
1760		 * Try to shrink the extent tree now.
1761		 */
1762		ext4_ext_replay_shrink_inode(inode, start + len);
1763next:
1764		cur += map.m_len;
1765		remaining -= map.m_len;
1766	}
1767	ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1768					sb->s_blocksize_bits);
1769	iput(inode);
1770	return 0;
1771}
1772
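/*
 * To summarize, ADD_RANGE replay above handles three cases for each
 * mapped chunk of the logical range:
 *
 *	1. Not mapped:       insert a fresh extent pointing at the recorded
 *			     physical blocks.
 *	2. Mapped elsewhere: rewrite the mapping to the recorded physical
 *			     blocks and free the stale ones.
 *	3. Mapped in place:  only reconcile the unwritten/written state.
 */
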
1773/* Replay DEL_RANGE tag */
1774static int
1775ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1776			 u8 *val)
1777{
1778	struct inode *inode;
1779	struct ext4_fc_del_range lrange;
1780	struct ext4_map_blocks map;
1781	ext4_lblk_t cur, remaining;
1782	int ret;
1783
1784	memcpy(&lrange, val, sizeof(lrange));
1785	cur = le32_to_cpu(lrange.fc_lblk);
1786	remaining = le32_to_cpu(lrange.fc_len);
1787
1788	trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789		le32_to_cpu(lrange.fc_ino), cur, remaining);
1790
1791	inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1792	if (IS_ERR(inode)) {
1793		jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1794		return 0;
1795	}
1796
	ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
	if (ret) {
		iput(inode);
		return 0;
	}
1798
	jbd_debug(1, "DEL_RANGE, inode %lu, lblk %d, len %d\n",
			inode->i_ino, le32_to_cpu(lrange.fc_lblk),
			le32_to_cpu(lrange.fc_len));
1802	while (remaining > 0) {
1803		map.m_lblk = cur;
1804		map.m_len = remaining;
1805
1806		ret = ext4_map_blocks(NULL, inode, &map, 0);
1807		if (ret < 0) {
1808			iput(inode);
1809			return 0;
1810		}
1811		if (ret > 0) {
1812			remaining -= ret;
1813			cur += ret;
1814			ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815		} else {
1816			remaining -= map.m_len;
1817			cur += map.m_len;
1818		}
1819	}
1820
	ret = ext4_punch_hole(inode,
		(loff_t)le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
		(loff_t)le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1824	if (ret)
1825		jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826	ext4_ext_replay_shrink_inode(inode,
1827		i_size_read(inode) >> sb->s_blocksize_bits);
1828	ext4_mark_inode_dirty(NULL, inode);
1829	iput(inode);
1830
1831	return 0;
1832}
1833
1834static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1835{
1836	struct ext4_fc_replay_state *state;
1837	struct inode *inode;
1838	struct ext4_ext_path *path = NULL;
1839	struct ext4_map_blocks map;
1840	int i, ret, j;
1841	ext4_lblk_t cur, end;
1842
1843	state = &EXT4_SB(sb)->s_fc_replay_state;
1844	for (i = 0; i < state->fc_modified_inodes_used; i++) {
1845		inode = ext4_iget(sb, state->fc_modified_inodes[i],
1846			EXT4_IGET_NORMAL);
1847		if (IS_ERR(inode)) {
1848			jbd_debug(1, "Inode %d not found.",
1849				state->fc_modified_inodes[i]);
1850			continue;
1851		}
1852		cur = 0;
1853		end = EXT_MAX_BLOCKS;
1854		while (cur < end) {
1855			map.m_lblk = cur;
1856			map.m_len = end - cur;
1857
1858			ret = ext4_map_blocks(NULL, inode, &map, 0);
1859			if (ret < 0)
1860				break;
1861
1862			if (ret > 0) {
1863				path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1864				if (!IS_ERR(path)) {
1865					for (j = 0; j < path->p_depth; j++)
1866						ext4_mb_mark_bb(inode->i_sb,
1867							path[j].p_block, 1, 1);
1868					ext4_ext_drop_refs(path);
1869					kfree(path);
1870				}
1871				cur += ret;
1872				ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1873							map.m_len, 1);
1874			} else {
1875				cur = cur + (map.m_len ? map.m_len : 1);
1876			}
1877		}
1878		iput(inode);
1879	}
1880}
1881
/*
 * Check if a block is in the excluded regions for block allocation. The
 * simple allocator that runs during the replay phase calls this function
 * to see if it is okay to use a block.
 */
1887bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1888{
1889	int i;
1890	struct ext4_fc_replay_state *state;
1891
1892	state = &EXT4_SB(sb)->s_fc_replay_state;
1893	for (i = 0; i < state->fc_regions_valid; i++) {
1894		if (state->fc_regions[i].ino == 0 ||
1895			state->fc_regions[i].len == 0)
1896			continue;
1897		if (blk >= state->fc_regions[i].pblk &&
1898		    blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1899			return true;
1900	}
1901	return false;
1902}
1903
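/*
 * A minimal sketch (not compiled) of how a replay-phase block allocator
 * could consult the exclusion check above. The helper name and loop are
 * hypothetical; the real consumer is the simple allocator in mballoc.c
 * that runs while EXT4_FC_REPLAY is set.
 */
#if 0
static ext4_fsblk_t example_replay_find_block(struct super_block *sb,
					      ext4_fsblk_t start,
					      ext4_fsblk_t end)
{
	ext4_fsblk_t blk;

	for (blk = start; blk < end; blk++)
		if (!ext4_fc_replay_check_excluded(sb, blk))
			return blk;	/* caller still checks the bitmaps */
	return 0;
}
#endif
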
1904/* Cleanup function called after replay */
1905void ext4_fc_replay_cleanup(struct super_block *sb)
1906{
1907	struct ext4_sb_info *sbi = EXT4_SB(sb);
1908
1909	sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1910	kfree(sbi->s_fc_replay_state.fc_regions);
1911	kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1912}
1913
/*
 * Recovery Scan phase handler
 *
 * This function is called during the scan phase and is responsible
 * for doing the following things:
 * - Make sure the fast commit area has valid tags for replay
 * - Count the number of tags that need to be replayed by the replay handler
 * - Verify the CRC
 * - Create a list of excluded blocks for allocation during the replay phase
 *
 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
 * to indicate that the scan has finished and JBD2 can now start the replay
 * phase. It returns a negative error to indicate that there was an error. At
 * the end of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags
 * is set to indicate the number of tags that need to be replayed during the
 * replay phase.
 */
1931static int ext4_fc_replay_scan(journal_t *journal,
1932				struct buffer_head *bh, int off,
1933				tid_t expected_tid)
1934{
1935	struct super_block *sb = journal->j_private;
1936	struct ext4_sb_info *sbi = EXT4_SB(sb);
1937	struct ext4_fc_replay_state *state;
1938	int ret = JBD2_FC_REPLAY_CONTINUE;
1939	struct ext4_fc_add_range ext;
1940	struct ext4_fc_tl tl;
1941	struct ext4_fc_tail tail;
1942	__u8 *start, *end, *cur, *val;
1943	struct ext4_fc_head head;
1944	struct ext4_extent *ex;
1945
1946	state = &sbi->s_fc_replay_state;
1947
1948	start = (u8 *)bh->b_data;
1949	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1950
1951	if (state->fc_replay_expected_off == 0) {
1952		state->fc_cur_tag = 0;
1953		state->fc_replay_num_tags = 0;
1954		state->fc_crc = 0;
1955		state->fc_regions = NULL;
1956		state->fc_regions_valid = state->fc_regions_used =
1957			state->fc_regions_size = 0;
1958		/* Check if we can stop early */
1959		if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1960			!= EXT4_FC_TAG_HEAD)
1961			return 0;
1962	}
1963
1964	if (off != state->fc_replay_expected_off) {
1965		ret = -EFSCORRUPTED;
1966		goto out_err;
1967	}
1968
1969	state->fc_replay_expected_off++;
1970	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1971		memcpy(&tl, cur, sizeof(tl));
1972		val = cur + sizeof(tl);
1973		jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1974			  tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1975		switch (le16_to_cpu(tl.fc_tag)) {
1976		case EXT4_FC_TAG_ADD_RANGE:
1977			memcpy(&ext, val, sizeof(ext));
1978			ex = (struct ext4_extent *)&ext.fc_ex;
1979			ret = ext4_fc_record_regions(sb,
1980				le32_to_cpu(ext.fc_ino),
1981				le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1982				ext4_ext_get_actual_len(ex));
1983			if (ret < 0)
1984				break;
1985			ret = JBD2_FC_REPLAY_CONTINUE;
1986			fallthrough;
1987		case EXT4_FC_TAG_DEL_RANGE:
1988		case EXT4_FC_TAG_LINK:
1989		case EXT4_FC_TAG_UNLINK:
1990		case EXT4_FC_TAG_CREAT:
1991		case EXT4_FC_TAG_INODE:
1992		case EXT4_FC_TAG_PAD:
1993			state->fc_cur_tag++;
1994			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995					sizeof(tl) + le16_to_cpu(tl.fc_len));
1996			break;
1997		case EXT4_FC_TAG_TAIL:
1998			state->fc_cur_tag++;
1999			memcpy(&tail, val, sizeof(tail));
2000			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2001						sizeof(tl) +
2002						offsetof(struct ext4_fc_tail,
2003						fc_crc));
2004			if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2005				le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2006				state->fc_replay_num_tags = state->fc_cur_tag;
2007				state->fc_regions_valid =
2008					state->fc_regions_used;
2009			} else {
2010				ret = state->fc_replay_num_tags ?
2011					JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2012			}
2013			state->fc_crc = 0;
2014			break;
2015		case EXT4_FC_TAG_HEAD:
2016			memcpy(&head, val, sizeof(head));
2017			if (le32_to_cpu(head.fc_features) &
2018				~EXT4_FC_SUPPORTED_FEATURES) {
2019				ret = -EOPNOTSUPP;
2020				break;
2021			}
2022			if (le32_to_cpu(head.fc_tid) != expected_tid) {
2023				ret = JBD2_FC_REPLAY_STOP;
2024				break;
2025			}
2026			state->fc_cur_tag++;
2027			state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2028					    sizeof(tl) + le16_to_cpu(tl.fc_len));
2029			break;
2030		default:
2031			ret = state->fc_replay_num_tags ?
2032				JBD2_FC_REPLAY_STOP : -ECANCELED;
2033		}
2034		if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2035			break;
2036	}
2037
2038out_err:
2039	trace_ext4_fc_replay_scan(sb, ret, off);
2040	return ret;
2041}
2042
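/*
 * Worked example (illustrative): a fast commit area containing HEAD,
 * ADD_RANGE and TAIL tags scans with fc_cur_tag ending at 3.
 * fc_replay_num_tags becomes 3 only if the tail's TID equals expected_tid
 * and its CRC matches the checksum accumulated over the preceding tags;
 * otherwise the scan stops (or fails with -EFSBADCRC if no valid tail was
 * seen earlier).
 */
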
/*
 * Main recovery path entry point.
 * The meaning of the return codes is the same as for ext4_fc_replay_scan().
 */
2047static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2048				enum passtype pass, int off, tid_t expected_tid)
2049{
2050	struct super_block *sb = journal->j_private;
2051	struct ext4_sb_info *sbi = EXT4_SB(sb);
2052	struct ext4_fc_tl tl;
2053	__u8 *start, *end, *cur, *val;
2054	int ret = JBD2_FC_REPLAY_CONTINUE;
2055	struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2056	struct ext4_fc_tail tail;
2057
2058	if (pass == PASS_SCAN) {
2059		state->fc_current_pass = PASS_SCAN;
2060		return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2061	}
2062
2063	if (state->fc_current_pass != pass) {
2064		state->fc_current_pass = pass;
2065		sbi->s_mount_state |= EXT4_FC_REPLAY;
2066	}
2067	if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2068		jbd_debug(1, "Replay stops\n");
2069		ext4_fc_set_bitmaps_and_counters(sb);
2070		return 0;
2071	}
2072
2073#ifdef CONFIG_EXT4_DEBUG
2074	if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2075		pr_warn("Dropping fc block %d because max_replay set\n", off);
2076		return JBD2_FC_REPLAY_STOP;
2077	}
2078#endif
2079
2080	start = (u8 *)bh->b_data;
2081	end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2082
2083	for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2084		memcpy(&tl, cur, sizeof(tl));
2085		val = cur + sizeof(tl);
2086
2087		if (state->fc_replay_num_tags == 0) {
2088			ret = JBD2_FC_REPLAY_STOP;
2089			ext4_fc_set_bitmaps_and_counters(sb);
2090			break;
2091		}
2092		jbd_debug(3, "Replay phase, tag:%s\n",
2093				tag2str(le16_to_cpu(tl.fc_tag)));
2094		state->fc_replay_num_tags--;
2095		switch (le16_to_cpu(tl.fc_tag)) {
2096		case EXT4_FC_TAG_LINK:
2097			ret = ext4_fc_replay_link(sb, &tl, val);
2098			break;
2099		case EXT4_FC_TAG_UNLINK:
2100			ret = ext4_fc_replay_unlink(sb, &tl, val);
2101			break;
2102		case EXT4_FC_TAG_ADD_RANGE:
2103			ret = ext4_fc_replay_add_range(sb, &tl, val);
2104			break;
2105		case EXT4_FC_TAG_CREAT:
2106			ret = ext4_fc_replay_create(sb, &tl, val);
2107			break;
2108		case EXT4_FC_TAG_DEL_RANGE:
2109			ret = ext4_fc_replay_del_range(sb, &tl, val);
2110			break;
2111		case EXT4_FC_TAG_INODE:
2112			ret = ext4_fc_replay_inode(sb, &tl, val);
2113			break;
2114		case EXT4_FC_TAG_PAD:
2115			trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2116					     le16_to_cpu(tl.fc_len), 0);
2117			break;
2118		case EXT4_FC_TAG_TAIL:
2119			trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2120					     le16_to_cpu(tl.fc_len), 0);
2121			memcpy(&tail, val, sizeof(tail));
2122			WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2123			break;
2124		case EXT4_FC_TAG_HEAD:
2125			break;
2126		default:
2127			trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2128					     le16_to_cpu(tl.fc_len), 0);
2129			ret = -ECANCELED;
2130			break;
2131		}
2132		if (ret < 0)
2133			break;
2134		ret = JBD2_FC_REPLAY_CONTINUE;
2135	}
2136	return ret;
2137}
2138
2139void ext4_fc_init(struct super_block *sb, journal_t *journal)
2140{
	/*
	 * We set the replay callback even if fast commit is disabled because
	 * we could still have fast commit blocks that need to be replayed,
	 * even if fast commit has now been turned off.
	 */
2146	journal->j_fc_replay_callback = ext4_fc_replay;
2147	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2148		return;
2149	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2150}
2151
2152static const char *fc_ineligible_reasons[] = {
2153	"Extended attributes changed",
2154	"Cross rename",
2155	"Journal flag changed",
2156	"Insufficient memory",
2157	"Swap boot",
2158	"Resize",
2159	"Dir renamed",
2160	"Falloc range op",
2161	"Data journalling",
2162	"FC Commit Failed"
2163};
2164
2165int ext4_fc_info_show(struct seq_file *seq, void *v)
2166{
2167	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2168	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2169	int i;
2170
2171	if (v != SEQ_START_TOKEN)
2172		return 0;
2173
2174	seq_printf(seq,
2175		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2176		   stats->fc_num_commits, stats->fc_ineligible_commits,
2177		   stats->fc_numblks,
2178		   div_u64(sbi->s_fc_avg_commit_time, 1000));
2179	seq_puts(seq, "Ineligible reasons:\n");
2180	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2181		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2182			stats->fc_ineligible_reason_count[i]);
2183
2184	return 0;
2185}
2186
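/*
 * Example output (illustrative values only), as produced by the
 * seq_printf() calls above; the file is typically exposed via procfs
 * (e.g. /proc/fs/ext4/<dev>/fc_info, assuming the standard ext4 proc
 * setup):
 *
 *	fc stats:
 *	150 commits
 *	3 ineligible
 *	210 numblks
 *	1520us avg_commit_time
 *	Ineligible reasons:
 *	"Extended attributes changed":	1
 *	...
 */
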
2187int __init ext4_fc_init_dentry_cache(void)
2188{
2189	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2190					   SLAB_RECLAIM_ACCOUNT);
2191
2192	if (ext4_fc_dentry_cachep == NULL)
2193		return -ENOMEM;
2194
2195	return 0;
2196}