Loading...
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10#include "ext4.h"
11#include "ext4_jbd2.h"
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 * Not all operations are supported by fast commits today (e.g. extended
69 * attributes). Fast commit ineligibility is marked by calling one of the
70 * two following functions:
71 *
72 * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall
73 * back to full commit. This is useful in case of transient errors.
74 *
75 * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - This makes all
76 * the fast commits happening between ext4_fc_start_ineligible() and
77 * ext4_fc_stop_ineligible() and one fast commit after the call to
78 * ext4_fc_stop_ineligible() to fall back to full commits. It is important to
79 * make one more fast commit to fall back to full commit after stop call so
80 * that it is guaranteed that the fast commit ineligible operation contained
81 * within ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82 * followed by at least 1 full commit.
83 *
84 * Atomicity of commits
85 * --------------------
86 * In order to guarantee atomicity during the commit operation, fast commit
87 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
88 * tag contains CRC of the contents and TID of the transaction after which
89 * this fast commit should be applied. Recovery code replays fast commit
90 * logs only if there's at least 1 valid tail present. For every fast commit
91 * operation, there is 1 tail. This means, we may end up with multiple tails
92 * in the fast commit space. Here's an example:
93 *
94 * - Create a new file A and remove existing file B
95 * - fsync()
96 * - Append contents to file A
97 * - Truncate file A
98 * - fsync()
99 *
100 * The fast commit space at the end of above operations would look like this:
101 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
103 *
104 * Replay code should thus check for all the valid tails in the FC area.
105 *
106 * Fast Commit Replay Idempotence
107 * ------------------------------
108 *
109 * Fast commits tags are idempotent in nature provided the recovery code follows
110 * certain rules. The guiding principle that the commit path follows while
111 * committing is that it stores the result of a particular operation instead of
112 * storing the procedure.
113 *
114 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
115 * was associated with inode 10. During fast commit, instead of storing this
116 * operation as a procedure "rename a to b", we store the resulting file system
117 * state as a "series" of outcomes:
118 *
119 * - Link dirent b to inode 10
120 * - Unlink dirent a
121 * - Inode <10> with valid refcount
122 *
123 * Now when recovery code runs, it needs to "enforce" this state on the file
124 * system. This is what guarantees idempotence of fast commit replay.
125 *
126 * Let's take an example of a procedure that is not idempotent and see how fast
127 * commits make it idempotent. Consider following sequence of operations:
128 *
129 * rm A; mv B A; read A
130 * (x) (y) (z)
131 *
132 * (x), (y) and (z) are the points at which we can crash. If we store this
133 * sequence of operations as is then the replay is not idempotent. Let's say
134 * while in replay, we crash at (z). During the second replay, file A (which was
135 * actually created as a result of "mv B A" operation) would get deleted. Thus,
136 * file named A would be absent when we try to read A. So, this sequence of
137 * operations is not idempotent. However, as mentioned above, instead of storing
138 * the procedure fast commits store the outcome of each procedure. Thus the fast
139 * commit log for above procedure would be as follows:
140 *
141 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
142 * inode 11 before the replay)
143 *
144 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
145 * (w) (x) (y) (z)
146 *
147 * If we crash at (z), we will have file A linked to inode 11. During the second
148 * replay, we will remove file A (inode 11). But we will create it back and make
149 * it point to inode 11. We won't find B, so we'll just skip that step. At this
150 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
151 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
152 * similarly. Thus, by converting a non-idempotent procedure into a series of
153 * idempotent outcomes, fast commits ensured idempotence during the replay.
154 *
155 * TODOs
156 * -----
157 *
158 * 0) Fast commit replay path hardening: Fast commit replay code should use
159 * journal handles to make sure all the updates it does during the replay
160 * path are atomic. With that if we crash during fast commit replay, after
161 * trying to do recovery again, we will find a file system where fast commit
162 * area is invalid (because new full commit would be found). In order to deal
163 * with that, fast commit replay code should ensure that the "FC_REPLAY"
164 * superblock state is persisted before starting the replay, so that after
165 * the crash, fast commit recovery code can look at that flag and perform
166 * fast commit recovery even if that area is invalidated by later full
167 * commits.
168 *
169 * 1) Make fast commit atomic updates more fine grained. Today, a fast commit
170 * eligible update must be protected within ext4_fc_start_update() and
171 * ext4_fc_stop_update(). These routines are called at much higher
172 * routines. This can be made more fine grained by combining with
173 * ext4_journal_start().
174 *
175 * 2) Same above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
176 *
177 * 3) Handle more ineligible cases.
178 */
179
180#include <trace/events/ext4.h>
181static struct kmem_cache *ext4_fc_dentry_cachep;
182
183static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
184{
185 BUFFER_TRACE(bh, "");
186 if (uptodate) {
187 ext4_debug("%s: Block %lld up-to-date",
188 __func__, bh->b_blocknr);
189 set_buffer_uptodate(bh);
190 } else {
191 ext4_debug("%s: Block %lld not up-to-date",
192 __func__, bh->b_blocknr);
193 clear_buffer_uptodate(bh);
194 }
195
196 unlock_buffer(bh);
197}
198
199static inline void ext4_fc_reset_inode(struct inode *inode)
200{
201 struct ext4_inode_info *ei = EXT4_I(inode);
202
203 ei->i_fc_lblk_start = 0;
204 ei->i_fc_lblk_len = 0;
205}
206
207void ext4_fc_init_inode(struct inode *inode)
208{
209 struct ext4_inode_info *ei = EXT4_I(inode);
210
211 ext4_fc_reset_inode(inode);
212 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
213 INIT_LIST_HEAD(&ei->i_fc_list);
214 init_waitqueue_head(&ei->i_fc_wait);
215 atomic_set(&ei->i_fc_updates, 0);
216}
217
218/* This function must be called with sbi->s_fc_lock held. */
219static void ext4_fc_wait_committing_inode(struct inode *inode)
220__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
221{
222 wait_queue_head_t *wq;
223 struct ext4_inode_info *ei = EXT4_I(inode);
224
225#if (BITS_PER_LONG < 64)
226 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
227 EXT4_STATE_FC_COMMITTING);
228 wq = bit_waitqueue(&ei->i_state_flags,
229 EXT4_STATE_FC_COMMITTING);
230#else
231 DEFINE_WAIT_BIT(wait, &ei->i_flags,
232 EXT4_STATE_FC_COMMITTING);
233 wq = bit_waitqueue(&ei->i_flags,
234 EXT4_STATE_FC_COMMITTING);
235#endif
236 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
237 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
238 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
239 schedule();
240 finish_wait(wq, &wait.wq_entry);
241}
242
243/*
244 * Inform Ext4's fast about start of an inode update
245 *
246 * This function is called by the high level call VFS callbacks before
247 * performing any inode update. This function blocks if there's an ongoing
248 * fast commit on the inode in question.
249 */
250void ext4_fc_start_update(struct inode *inode)
251{
252 struct ext4_inode_info *ei = EXT4_I(inode);
253
254 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
255 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
256 return;
257
258restart:
259 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
260 if (list_empty(&ei->i_fc_list))
261 goto out;
262
263 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
264 ext4_fc_wait_committing_inode(inode);
265 goto restart;
266 }
267out:
268 atomic_inc(&ei->i_fc_updates);
269 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
270}
271
272/*
273 * Stop inode update and wake up waiting fast commits if any.
274 */
275void ext4_fc_stop_update(struct inode *inode)
276{
277 struct ext4_inode_info *ei = EXT4_I(inode);
278
279 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
280 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
281 return;
282
283 if (atomic_dec_and_test(&ei->i_fc_updates))
284 wake_up_all(&ei->i_fc_wait);
285}
286
287/*
288 * Remove inode from fast commit list. If the inode is being committed
289 * we wait until inode commit is done.
290 */
291void ext4_fc_del(struct inode *inode)
292{
293 struct ext4_inode_info *ei = EXT4_I(inode);
294
295 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
296 (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
297 return;
298
299restart:
300 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
301 if (list_empty(&ei->i_fc_list)) {
302 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
303 return;
304 }
305
306 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
307 ext4_fc_wait_committing_inode(inode);
308 goto restart;
309 }
310 list_del_init(&ei->i_fc_list);
311 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
312}
313
314/*
315 * Mark file system as fast commit ineligible. This means that next commit
316 * operation would result in a full jbd2 commit.
317 */
318void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
319{
320 struct ext4_sb_info *sbi = EXT4_SB(sb);
321
322 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
323 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
324 return;
325
326 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
327 WARN_ON(reason >= EXT4_FC_REASON_MAX);
328 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
329}
330
331/*
332 * Start a fast commit ineligible update. Any commits that happen while
333 * such an operation is in progress fall back to full commits.
334 */
335void ext4_fc_start_ineligible(struct super_block *sb, int reason)
336{
337 struct ext4_sb_info *sbi = EXT4_SB(sb);
338
339 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
340 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
341 return;
342
343 WARN_ON(reason >= EXT4_FC_REASON_MAX);
344 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
345 atomic_inc(&sbi->s_fc_ineligible_updates);
346}
347
348/*
349 * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
350 * to ensure that after stopping the ineligible update, at least one full
351 * commit takes place.
352 */
353void ext4_fc_stop_ineligible(struct super_block *sb)
354{
355 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
356 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
357 return;
358
359 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
361}
362
363static inline int ext4_fc_is_ineligible(struct super_block *sb)
364{
365 return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
366 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
367}
368
369/*
370 * Generic fast commit tracking function. If this is the first time this we are
371 * called after a full commit, we initialize fast commit fields and then call
372 * __fc_track_fn() with update = 0. If we have already been called after a full
373 * commit, we pass update = 1. Based on that, the track function can determine
374 * if it needs to track a field for the first time or if it needs to just
375 * update the previously tracked value.
376 *
377 * If enqueue is set, this function enqueues the inode in fast commit list.
378 */
379static int ext4_fc_track_template(
380 handle_t *handle, struct inode *inode,
381 int (*__fc_track_fn)(struct inode *, void *, bool),
382 void *args, int enqueue)
383{
384 bool update = false;
385 struct ext4_inode_info *ei = EXT4_I(inode);
386 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
387 tid_t tid = 0;
388 int ret;
389
390 if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
391 (sbi->s_mount_state & EXT4_FC_REPLAY))
392 return -EOPNOTSUPP;
393
394 if (ext4_fc_is_ineligible(inode->i_sb))
395 return -EINVAL;
396
397 tid = handle->h_transaction->t_tid;
398 mutex_lock(&ei->i_fc_lock);
399 if (tid == ei->i_sync_tid) {
400 update = true;
401 } else {
402 ext4_fc_reset_inode(inode);
403 ei->i_sync_tid = tid;
404 }
405 ret = __fc_track_fn(inode, args, update);
406 mutex_unlock(&ei->i_fc_lock);
407
408 if (!enqueue)
409 return ret;
410
411 spin_lock(&sbi->s_fc_lock);
412 if (list_empty(&EXT4_I(inode)->i_fc_list))
413 list_add_tail(&EXT4_I(inode)->i_fc_list,
414 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
415 &sbi->s_fc_q[FC_Q_STAGING] :
416 &sbi->s_fc_q[FC_Q_MAIN]);
417 spin_unlock(&sbi->s_fc_lock);
418
419 return ret;
420}
421
/* Argument bundle handed to __track_dentry_update() via the void *arg. */
struct __track_dentry_update_args {
	struct dentry *dentry;	/* directory entry being linked/unlinked/created */
	int op;			/* EXT4_FC_TAG_* value recorded as the operation */
};
426
427/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
428static int __track_dentry_update(struct inode *inode, void *arg, bool update)
429{
430 struct ext4_fc_dentry_update *node;
431 struct ext4_inode_info *ei = EXT4_I(inode);
432 struct __track_dentry_update_args *dentry_update =
433 (struct __track_dentry_update_args *)arg;
434 struct dentry *dentry = dentry_update->dentry;
435 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
436
437 mutex_unlock(&ei->i_fc_lock);
438 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
439 if (!node) {
440 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
441 mutex_lock(&ei->i_fc_lock);
442 return -ENOMEM;
443 }
444
445 node->fcd_op = dentry_update->op;
446 node->fcd_parent = dentry->d_parent->d_inode->i_ino;
447 node->fcd_ino = inode->i_ino;
448 if (dentry->d_name.len > DNAME_INLINE_LEN) {
449 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
450 if (!node->fcd_name.name) {
451 kmem_cache_free(ext4_fc_dentry_cachep, node);
452 ext4_fc_mark_ineligible(inode->i_sb,
453 EXT4_FC_REASON_NOMEM);
454 mutex_lock(&ei->i_fc_lock);
455 return -ENOMEM;
456 }
457 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
458 dentry->d_name.len);
459 } else {
460 memcpy(node->fcd_iname, dentry->d_name.name,
461 dentry->d_name.len);
462 node->fcd_name.name = node->fcd_iname;
463 }
464 node->fcd_name.len = dentry->d_name.len;
465
466 spin_lock(&sbi->s_fc_lock);
467 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
468 list_add_tail(&node->fcd_list,
469 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
470 else
471 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
472 spin_unlock(&sbi->s_fc_lock);
473 mutex_lock(&ei->i_fc_lock);
474
475 return 0;
476}
477
478void __ext4_fc_track_unlink(handle_t *handle,
479 struct inode *inode, struct dentry *dentry)
480{
481 struct __track_dentry_update_args args;
482 int ret;
483
484 args.dentry = dentry;
485 args.op = EXT4_FC_TAG_UNLINK;
486
487 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
488 (void *)&args, 0);
489 trace_ext4_fc_track_unlink(inode, dentry, ret);
490}
491
492void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
493{
494 __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
495}
496
497void __ext4_fc_track_link(handle_t *handle,
498 struct inode *inode, struct dentry *dentry)
499{
500 struct __track_dentry_update_args args;
501 int ret;
502
503 args.dentry = dentry;
504 args.op = EXT4_FC_TAG_LINK;
505
506 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
507 (void *)&args, 0);
508 trace_ext4_fc_track_link(inode, dentry, ret);
509}
510
511void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
512{
513 __ext4_fc_track_link(handle, d_inode(dentry), dentry);
514}
515
516void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
517 struct dentry *dentry)
518{
519 struct __track_dentry_update_args args;
520 int ret;
521
522 args.dentry = dentry;
523 args.op = EXT4_FC_TAG_CREAT;
524
525 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
526 (void *)&args, 0);
527 trace_ext4_fc_track_create(inode, dentry, ret);
528}
529
530void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
531{
532 __ext4_fc_track_create(handle, d_inode(dentry), dentry);
533}
534
535/* __track_fn for inode tracking */
536static int __track_inode(struct inode *inode, void *arg, bool update)
537{
538 if (update)
539 return -EEXIST;
540
541 EXT4_I(inode)->i_fc_lblk_len = 0;
542
543 return 0;
544}
545
546void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
547{
548 int ret;
549
550 if (S_ISDIR(inode->i_mode))
551 return;
552
553 if (ext4_should_journal_data(inode)) {
554 ext4_fc_mark_ineligible(inode->i_sb,
555 EXT4_FC_REASON_INODE_JOURNAL_DATA);
556 return;
557 }
558
559 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
560 trace_ext4_fc_track_inode(inode, ret);
561}
562
563struct __track_range_args {
564 ext4_lblk_t start, end;
565};
566
567/* __track_fn for tracking data updates */
568static int __track_range(struct inode *inode, void *arg, bool update)
569{
570 struct ext4_inode_info *ei = EXT4_I(inode);
571 ext4_lblk_t oldstart;
572 struct __track_range_args *__arg =
573 (struct __track_range_args *)arg;
574
575 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
576 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
577 return -ECANCELED;
578 }
579
580 oldstart = ei->i_fc_lblk_start;
581
582 if (update && ei->i_fc_lblk_len > 0) {
583 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
584 ei->i_fc_lblk_len =
585 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
586 ei->i_fc_lblk_start + 1;
587 } else {
588 ei->i_fc_lblk_start = __arg->start;
589 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
590 }
591
592 return 0;
593}
594
595void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
596 ext4_lblk_t end)
597{
598 struct __track_range_args args;
599 int ret;
600
601 if (S_ISDIR(inode->i_mode))
602 return;
603
604 args.start = start;
605 args.end = end;
606
607 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
608
609 trace_ext4_fc_track_range(inode, start, end, ret);
610}
611
612static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
613{
614 int write_flags = REQ_SYNC;
615 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
616
617 /* Add REQ_FUA | REQ_PREFLUSH only its tail */
618 if (test_opt(sb, BARRIER) && is_tail)
619 write_flags |= REQ_FUA | REQ_PREFLUSH;
620 lock_buffer(bh);
621 set_buffer_dirty(bh);
622 set_buffer_uptodate(bh);
623 bh->b_end_io = ext4_end_buffer_io_sync;
624 submit_bh(REQ_OP_WRITE, write_flags, bh);
625 EXT4_SB(sb)->s_fc_bh = NULL;
626}
627
628/* Ext4 commit path routines */
629
630/* memzero and update CRC */
631static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
632 u32 *crc)
633{
634 void *ret;
635
636 ret = memset(dst, 0, len);
637 if (crc)
638 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
639 return ret;
640}
641
642/*
643 * Allocate len bytes on a fast commit buffer.
644 *
645 * During the commit time this function is used to manage fast commit
646 * block space. We don't split a fast commit log onto different
647 * blocks. So this function makes sure that if there's not enough space
648 * on the current block, the remaining space in the current block is
649 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
650 * new block is from jbd2 and CRC is updated to reflect the padding
651 * we added.
652 */
653static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
654{
655 struct ext4_fc_tl *tl;
656 struct ext4_sb_info *sbi = EXT4_SB(sb);
657 struct buffer_head *bh;
658 int bsize = sbi->s_journal->j_blocksize;
659 int ret, off = sbi->s_fc_bytes % bsize;
660 int pad_len;
661
662 /*
663 * After allocating len, we should have space at least for a 0 byte
664 * padding.
665 */
666 if (len + sizeof(struct ext4_fc_tl) > bsize)
667 return NULL;
668
669 if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
670 /*
671 * Only allocate from current buffer if we have enough space for
672 * this request AND we have space to add a zero byte padding.
673 */
674 if (!sbi->s_fc_bh) {
675 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
676 if (ret)
677 return NULL;
678 sbi->s_fc_bh = bh;
679 }
680 sbi->s_fc_bytes += len;
681 return sbi->s_fc_bh->b_data + off;
682 }
683 /* Need to add PAD tag */
684 tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
685 tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
686 pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
687 tl->fc_len = cpu_to_le16(pad_len);
688 if (crc)
689 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
690 if (pad_len > 0)
691 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
692 ext4_fc_submit_bh(sb, false);
693
694 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
695 if (ret)
696 return NULL;
697 sbi->s_fc_bh = bh;
698 sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
699 return sbi->s_fc_bh->b_data;
700}
701
702/* memcpy to fc reserved space and update CRC */
703static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
704 int len, u32 *crc)
705{
706 if (crc)
707 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
708 return memcpy(dst, src, len);
709}
710
711/*
712 * Complete a fast commit by writing tail tag.
713 *
714 * Writing tail tag marks the end of a fast commit. In order to guarantee
715 * atomicity, after writing tail tag, even if there's space remaining
716 * in the block, next commit shouldn't use it. That's why tail tag
717 * has the length as that of the remaining space on the block.
718 */
719static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
720{
721 struct ext4_sb_info *sbi = EXT4_SB(sb);
722 struct ext4_fc_tl tl;
723 struct ext4_fc_tail tail;
724 int off, bsize = sbi->s_journal->j_blocksize;
725 u8 *dst;
726
727 /*
728 * ext4_fc_reserve_space takes care of allocating an extra block if
729 * there's no enough space on this block for accommodating this tail.
730 */
731 dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
732 if (!dst)
733 return -ENOSPC;
734
735 off = sbi->s_fc_bytes % bsize;
736
737 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
738 tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
739 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
740
741 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
742 dst += sizeof(tl);
743 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
744 ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
745 dst += sizeof(tail.fc_tid);
746 tail.fc_crc = cpu_to_le32(crc);
747 ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
748
749 ext4_fc_submit_bh(sb, true);
750
751 return 0;
752}
753
754/*
755 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
756 * Returns false if there's not enough space.
757 */
758static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
759 u32 *crc)
760{
761 struct ext4_fc_tl tl;
762 u8 *dst;
763
764 dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
765 if (!dst)
766 return false;
767
768 tl.fc_tag = cpu_to_le16(tag);
769 tl.fc_len = cpu_to_le16(len);
770
771 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
772 ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
773
774 return true;
775}
776
777/* Same as above, but adds dentry tlv. */
778static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
779 int parent_ino, int ino, int dlen,
780 const unsigned char *dname,
781 u32 *crc)
782{
783 struct ext4_fc_dentry_info fcd;
784 struct ext4_fc_tl tl;
785 u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
786 crc);
787
788 if (!dst)
789 return false;
790
791 fcd.fc_parent_ino = cpu_to_le32(parent_ino);
792 fcd.fc_ino = cpu_to_le32(ino);
793 tl.fc_tag = cpu_to_le16(tag);
794 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
795 ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
796 dst += sizeof(tl);
797 ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
798 dst += sizeof(fcd);
799 ext4_fc_memcpy(sb, dst, dname, dlen, crc);
800 dst += dlen;
801
802 return true;
803}
804
805/*
806 * Writes inode in the fast commit space under TLV with tag @tag.
807 * Returns 0 on success, error on failure.
808 */
809static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
810{
811 struct ext4_inode_info *ei = EXT4_I(inode);
812 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
813 int ret;
814 struct ext4_iloc iloc;
815 struct ext4_fc_inode fc_inode;
816 struct ext4_fc_tl tl;
817 u8 *dst;
818
819 ret = ext4_get_inode_loc(inode, &iloc);
820 if (ret)
821 return ret;
822
823 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
824 inode_len += ei->i_extra_isize;
825
826 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
827 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
828 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
829
830 dst = ext4_fc_reserve_space(inode->i_sb,
831 sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
832 if (!dst)
833 return -ECANCELED;
834
835 if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
836 return -ECANCELED;
837 dst += sizeof(tl);
838 if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
839 return -ECANCELED;
840 dst += sizeof(fc_inode);
841 if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
842 inode_len, crc))
843 return -ECANCELED;
844
845 return 0;
846}
847
848/*
849 * Writes updated data ranges for the inode in question. Updates CRC.
850 * Returns 0 on success, error otherwise.
851 */
852static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
853{
854 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
855 struct ext4_inode_info *ei = EXT4_I(inode);
856 struct ext4_map_blocks map;
857 struct ext4_fc_add_range fc_ext;
858 struct ext4_fc_del_range lrange;
859 struct ext4_extent *ex;
860 int ret;
861
862 mutex_lock(&ei->i_fc_lock);
863 if (ei->i_fc_lblk_len == 0) {
864 mutex_unlock(&ei->i_fc_lock);
865 return 0;
866 }
867 old_blk_size = ei->i_fc_lblk_start;
868 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
869 ei->i_fc_lblk_len = 0;
870 mutex_unlock(&ei->i_fc_lock);
871
872 cur_lblk_off = old_blk_size;
873 jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
874 __func__, cur_lblk_off, new_blk_size, inode->i_ino);
875
876 while (cur_lblk_off <= new_blk_size) {
877 map.m_lblk = cur_lblk_off;
878 map.m_len = new_blk_size - cur_lblk_off + 1;
879 ret = ext4_map_blocks(NULL, inode, &map, 0);
880 if (ret < 0)
881 return -ECANCELED;
882
883 if (map.m_len == 0) {
884 cur_lblk_off++;
885 continue;
886 }
887
888 if (ret == 0) {
889 lrange.fc_ino = cpu_to_le32(inode->i_ino);
890 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
891 lrange.fc_len = cpu_to_le32(map.m_len);
892 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
893 sizeof(lrange), (u8 *)&lrange, crc))
894 return -ENOSPC;
895 } else {
896 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
897 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
898
899 /* Limit the number of blocks in one extent */
900 map.m_len = min(max, map.m_len);
901
902 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
903 ex = (struct ext4_extent *)&fc_ext.fc_ex;
904 ex->ee_block = cpu_to_le32(map.m_lblk);
905 ex->ee_len = cpu_to_le16(map.m_len);
906 ext4_ext_store_pblock(ex, map.m_pblk);
907 if (map.m_flags & EXT4_MAP_UNWRITTEN)
908 ext4_ext_mark_unwritten(ex);
909 else
910 ext4_ext_mark_initialized(ex);
911 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
912 sizeof(fc_ext), (u8 *)&fc_ext, crc))
913 return -ENOSPC;
914 }
915
916 cur_lblk_off += map.m_len;
917 }
918
919 return 0;
920}
921
922
923/* Submit data for all the fast commit inodes */
924static int ext4_fc_submit_inode_data_all(journal_t *journal)
925{
926 struct super_block *sb = (struct super_block *)(journal->j_private);
927 struct ext4_sb_info *sbi = EXT4_SB(sb);
928 struct ext4_inode_info *ei;
929 int ret = 0;
930
931 spin_lock(&sbi->s_fc_lock);
932 ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
933 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
934 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
935 while (atomic_read(&ei->i_fc_updates)) {
936 DEFINE_WAIT(wait);
937
938 prepare_to_wait(&ei->i_fc_wait, &wait,
939 TASK_UNINTERRUPTIBLE);
940 if (atomic_read(&ei->i_fc_updates)) {
941 spin_unlock(&sbi->s_fc_lock);
942 schedule();
943 spin_lock(&sbi->s_fc_lock);
944 }
945 finish_wait(&ei->i_fc_wait, &wait);
946 }
947 spin_unlock(&sbi->s_fc_lock);
948 ret = jbd2_submit_inode_data(ei->jinode);
949 if (ret)
950 return ret;
951 spin_lock(&sbi->s_fc_lock);
952 }
953 spin_unlock(&sbi->s_fc_lock);
954
955 return ret;
956}
957
958/* Wait for completion of data for all the fast commit inodes */
959static int ext4_fc_wait_inode_data_all(journal_t *journal)
960{
961 struct super_block *sb = (struct super_block *)(journal->j_private);
962 struct ext4_sb_info *sbi = EXT4_SB(sb);
963 struct ext4_inode_info *pos, *n;
964 int ret = 0;
965
966 spin_lock(&sbi->s_fc_lock);
967 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
968 if (!ext4_test_inode_state(&pos->vfs_inode,
969 EXT4_STATE_FC_COMMITTING))
970 continue;
971 spin_unlock(&sbi->s_fc_lock);
972
973 ret = jbd2_wait_inode_data(journal, pos->jinode);
974 if (ret)
975 return ret;
976 spin_lock(&sbi->s_fc_lock);
977 }
978 spin_unlock(&sbi->s_fc_lock);
979
980 return 0;
981}
982
983/* Commit all the directory entry updates */
984static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
985__acquires(&sbi->s_fc_lock)
986__releases(&sbi->s_fc_lock)
987{
988 struct super_block *sb = (struct super_block *)(journal->j_private);
989 struct ext4_sb_info *sbi = EXT4_SB(sb);
990 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
991 struct inode *inode;
992 struct ext4_inode_info *ei, *ei_n;
993 int ret;
994
995 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
996 return 0;
997 list_for_each_entry_safe(fc_dentry, fc_dentry_n,
998 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
999 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1000 spin_unlock(&sbi->s_fc_lock);
1001 if (!ext4_fc_add_dentry_tlv(
1002 sb, fc_dentry->fcd_op,
1003 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1004 fc_dentry->fcd_name.len,
1005 fc_dentry->fcd_name.name, crc)) {
1006 ret = -ENOSPC;
1007 goto lock_and_exit;
1008 }
1009 spin_lock(&sbi->s_fc_lock);
1010 continue;
1011 }
1012
1013 inode = NULL;
1014 list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN],
1015 i_fc_list) {
1016 if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
1017 inode = &ei->vfs_inode;
1018 break;
1019 }
1020 }
1021 /*
1022 * If we don't find inode in our list, then it was deleted,
1023 * in which case, we don't need to record it's create tag.
1024 */
1025 if (!inode)
1026 continue;
1027 spin_unlock(&sbi->s_fc_lock);
1028
1029 /*
1030 * We first write the inode and then the create dirent. This
1031 * allows the recovery code to create an unnamed inode first
1032 * and then link it to a directory entry. This allows us
1033 * to use namei.c routines almost as is and simplifies
1034 * the recovery code.
1035 */
1036 ret = ext4_fc_write_inode(inode, crc);
1037 if (ret)
1038 goto lock_and_exit;
1039
1040 ret = ext4_fc_write_inode_data(inode, crc);
1041 if (ret)
1042 goto lock_and_exit;
1043
1044 if (!ext4_fc_add_dentry_tlv(
1045 sb, fc_dentry->fcd_op,
1046 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
1047 fc_dentry->fcd_name.len,
1048 fc_dentry->fcd_name.name, crc)) {
1049 ret = -ENOSPC;
1050 goto lock_and_exit;
1051 }
1052
1053 spin_lock(&sbi->s_fc_lock);
1054 }
1055 return 0;
1056lock_and_exit:
1057 spin_lock(&sbi->s_fc_lock);
1058 return ret;
1059}
1060
1061static int ext4_fc_perform_commit(journal_t *journal)
1062{
1063 struct super_block *sb = (struct super_block *)(journal->j_private);
1064 struct ext4_sb_info *sbi = EXT4_SB(sb);
1065 struct ext4_inode_info *iter;
1066 struct ext4_fc_head head;
1067 struct inode *inode;
1068 struct blk_plug plug;
1069 int ret = 0;
1070 u32 crc = 0;
1071
1072 ret = ext4_fc_submit_inode_data_all(journal);
1073 if (ret)
1074 return ret;
1075
1076 ret = ext4_fc_wait_inode_data_all(journal);
1077 if (ret)
1078 return ret;
1079
1080 /*
1081 * If file system device is different from journal device, issue a cache
1082 * flush before we start writing fast commit blocks.
1083 */
1084 if (journal->j_fs_dev != journal->j_dev)
1085 blkdev_issue_flush(journal->j_fs_dev);
1086
1087 blk_start_plug(&plug);
1088 if (sbi->s_fc_bytes == 0) {
1089 /*
1090 * Add a head tag only if this is the first fast commit
1091 * in this TID.
1092 */
1093 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1094 head.fc_tid = cpu_to_le32(
1095 sbi->s_journal->j_running_transaction->t_tid);
1096 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1097 (u8 *)&head, &crc)) {
1098 ret = -ENOSPC;
1099 goto out;
1100 }
1101 }
1102
1103 spin_lock(&sbi->s_fc_lock);
1104 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1105 if (ret) {
1106 spin_unlock(&sbi->s_fc_lock);
1107 goto out;
1108 }
1109
1110 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1111 inode = &iter->vfs_inode;
1112 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1113 continue;
1114
1115 spin_unlock(&sbi->s_fc_lock);
1116 ret = ext4_fc_write_inode_data(inode, &crc);
1117 if (ret)
1118 goto out;
1119 ret = ext4_fc_write_inode(inode, &crc);
1120 if (ret)
1121 goto out;
1122 spin_lock(&sbi->s_fc_lock);
1123 }
1124 spin_unlock(&sbi->s_fc_lock);
1125
1126 ret = ext4_fc_write_tail(sb, crc);
1127
1128out:
1129 blk_finish_plug(&plug);
1130 return ret;
1131}
1132
1133/*
1134 * The main commit entry point. Performs a fast commit for transaction
1135 * commit_tid if needed. If it's not possible to perform a fast commit
1136 * due to various reasons, we fall back to full commit. Returns 0
1137 * on success, error otherwise.
1138 */
1139int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1140{
1141 struct super_block *sb = (struct super_block *)(journal->j_private);
1142 struct ext4_sb_info *sbi = EXT4_SB(sb);
1143 int nblks = 0, ret, bsize = journal->j_blocksize;
1144 int subtid = atomic_read(&sbi->s_fc_subtid);
1145 int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1146 ktime_t start_time, commit_time;
1147
1148 trace_ext4_fc_commit_start(sb);
1149
1150 start_time = ktime_get();
1151
1152 if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1153 (ext4_fc_is_ineligible(sb))) {
1154 reason = EXT4_FC_REASON_INELIGIBLE;
1155 goto out;
1156 }
1157
1158restart_fc:
1159 ret = jbd2_fc_begin_commit(journal, commit_tid);
1160 if (ret == -EALREADY) {
1161 /* There was an ongoing commit, check if we need to restart */
1162 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1163 commit_tid > journal->j_commit_sequence)
1164 goto restart_fc;
1165 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1166 goto out;
1167 } else if (ret) {
1168 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1169 reason = EXT4_FC_REASON_FC_START_FAILED;
1170 goto out;
1171 }
1172
1173 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1174 ret = ext4_fc_perform_commit(journal);
1175 if (ret < 0) {
1176 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1177 reason = EXT4_FC_REASON_FC_FAILED;
1178 goto out;
1179 }
1180 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1181 ret = jbd2_fc_wait_bufs(journal, nblks);
1182 if (ret < 0) {
1183 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1184 reason = EXT4_FC_REASON_FC_FAILED;
1185 goto out;
1186 }
1187 atomic_inc(&sbi->s_fc_subtid);
1188 jbd2_fc_end_commit(journal);
1189out:
1190 /* Has any ineligible update happened since we started? */
1191 if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1192 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1193 reason = EXT4_FC_REASON_INELIGIBLE;
1194 }
1195
1196 spin_lock(&sbi->s_fc_lock);
1197 if (reason != EXT4_FC_REASON_OK &&
1198 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1199 sbi->s_fc_stats.fc_ineligible_commits++;
1200 } else {
1201 sbi->s_fc_stats.fc_num_commits++;
1202 sbi->s_fc_stats.fc_numblks += nblks;
1203 }
1204 spin_unlock(&sbi->s_fc_lock);
1205 nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1206 trace_ext4_fc_commit_stop(sb, nblks, reason);
1207 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1208 /*
1209 * weight the commit time higher than the average time so we don't
1210 * react too strongly to vast changes in the commit time
1211 */
1212 if (likely(sbi->s_fc_avg_commit_time))
1213 sbi->s_fc_avg_commit_time = (commit_time +
1214 sbi->s_fc_avg_commit_time * 3) / 4;
1215 else
1216 sbi->s_fc_avg_commit_time = commit_time;
1217 jbd_debug(1,
1218 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1219 nblks, reason, subtid);
1220 if (reason == EXT4_FC_REASON_FC_FAILED)
1221 return jbd2_fc_end_commit_fallback(journal);
1222 if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1223 reason == EXT4_FC_REASON_INELIGIBLE)
1224 return jbd2_complete_transaction(journal, commit_tid);
1225 return 0;
1226}
1227
1228/*
1229 * Fast commit cleanup routine. This is called after every fast commit and
1230 * full commit. full is true if we are called after a full commit.
1231 */
1232static void ext4_fc_cleanup(journal_t *journal, int full)
1233{
1234 struct super_block *sb = journal->j_private;
1235 struct ext4_sb_info *sbi = EXT4_SB(sb);
1236 struct ext4_inode_info *iter, *iter_n;
1237 struct ext4_fc_dentry_update *fc_dentry;
1238
1239 if (full && sbi->s_fc_bh)
1240 sbi->s_fc_bh = NULL;
1241
1242 jbd2_fc_release_bufs(journal);
1243
1244 spin_lock(&sbi->s_fc_lock);
1245 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1246 i_fc_list) {
1247 list_del_init(&iter->i_fc_list);
1248 ext4_clear_inode_state(&iter->vfs_inode,
1249 EXT4_STATE_FC_COMMITTING);
1250 ext4_fc_reset_inode(&iter->vfs_inode);
1251 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1252 smp_mb();
1253#if (BITS_PER_LONG < 64)
1254 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1255#else
1256 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1257#endif
1258 }
1259
1260 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1261 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1262 struct ext4_fc_dentry_update,
1263 fcd_list);
1264 list_del_init(&fc_dentry->fcd_list);
1265 spin_unlock(&sbi->s_fc_lock);
1266
1267 if (fc_dentry->fcd_name.name &&
1268 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1269 kfree(fc_dentry->fcd_name.name);
1270 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1271 spin_lock(&sbi->s_fc_lock);
1272 }
1273
1274 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1275 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1276 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1277 &sbi->s_fc_q[FC_Q_MAIN]);
1278
1279 ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1280 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1281
1282 if (full)
1283 sbi->s_fc_bytes = 0;
1284 spin_unlock(&sbi->s_fc_lock);
1285 trace_ext4_fc_stats(sb);
1286}
1287
1288/* Ext4 Replay Path Routines */
1289
/* Helper struct for dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of dname in bytes */
	int ino;		/* inode number the dentry refers to */
	int inode_len;
	char *dname;		/* entry name; points into the TLV value */
};
1295
1296static inline void tl_to_darg(struct dentry_info_args *darg,
1297 struct ext4_fc_tl *tl, u8 *val)
1298{
1299 struct ext4_fc_dentry_info fcd;
1300
1301 memcpy(&fcd, val, sizeof(fcd));
1302
1303 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1304 darg->ino = le32_to_cpu(fcd.fc_ino);
1305 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1306 darg->dname_len = le16_to_cpu(tl->fc_len) -
1307 sizeof(struct ext4_fc_dentry_info);
1308}
1309
1310/* Unlink replay function */
1311static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1312 u8 *val)
1313{
1314 struct inode *inode, *old_parent;
1315 struct qstr entry;
1316 struct dentry_info_args darg;
1317 int ret = 0;
1318
1319 tl_to_darg(&darg, tl, val);
1320
1321 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1322 darg.parent_ino, darg.dname_len);
1323
1324 entry.name = darg.dname;
1325 entry.len = darg.dname_len;
1326 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1327
1328 if (IS_ERR(inode)) {
1329 jbd_debug(1, "Inode %d not found", darg.ino);
1330 return 0;
1331 }
1332
1333 old_parent = ext4_iget(sb, darg.parent_ino,
1334 EXT4_IGET_NORMAL);
1335 if (IS_ERR(old_parent)) {
1336 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1337 iput(inode);
1338 return 0;
1339 }
1340
1341 ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1342 /* -ENOENT ok coz it might not exist anymore. */
1343 if (ret == -ENOENT)
1344 ret = 0;
1345 iput(old_parent);
1346 iput(inode);
1347 return ret;
1348}
1349
1350static int ext4_fc_replay_link_internal(struct super_block *sb,
1351 struct dentry_info_args *darg,
1352 struct inode *inode)
1353{
1354 struct inode *dir = NULL;
1355 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1356 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1357 int ret = 0;
1358
1359 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1360 if (IS_ERR(dir)) {
1361 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1362 dir = NULL;
1363 goto out;
1364 }
1365
1366 dentry_dir = d_obtain_alias(dir);
1367 if (IS_ERR(dentry_dir)) {
1368 jbd_debug(1, "Failed to obtain dentry");
1369 dentry_dir = NULL;
1370 goto out;
1371 }
1372
1373 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1374 if (!dentry_inode) {
1375 jbd_debug(1, "Inode dentry not created.");
1376 ret = -ENOMEM;
1377 goto out;
1378 }
1379
1380 ret = __ext4_link(dir, inode, dentry_inode);
1381 /*
1382 * It's possible that link already existed since data blocks
1383 * for the dir in question got persisted before we crashed OR
1384 * we replayed this tag and crashed before the entire replay
1385 * could complete.
1386 */
1387 if (ret && ret != -EEXIST) {
1388 jbd_debug(1, "Failed to link\n");
1389 goto out;
1390 }
1391
1392 ret = 0;
1393out:
1394 if (dentry_dir) {
1395 d_drop(dentry_dir);
1396 dput(dentry_dir);
1397 } else if (dir) {
1398 iput(dir);
1399 }
1400 if (dentry_inode) {
1401 d_drop(dentry_inode);
1402 dput(dentry_inode);
1403 }
1404
1405 return ret;
1406}
1407
1408/* Link replay function */
1409static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1410 u8 *val)
1411{
1412 struct inode *inode;
1413 struct dentry_info_args darg;
1414 int ret = 0;
1415
1416 tl_to_darg(&darg, tl, val);
1417 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1418 darg.parent_ino, darg.dname_len);
1419
1420 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1421 if (IS_ERR(inode)) {
1422 jbd_debug(1, "Inode not found.");
1423 return 0;
1424 }
1425
1426 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1427 iput(inode);
1428 return ret;
1429}
1430
1431/*
1432 * Record all the modified inodes during replay. We use this later to setup
1433 * block bitmaps correctly.
1434 */
1435static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1436{
1437 struct ext4_fc_replay_state *state;
1438 int i;
1439
1440 state = &EXT4_SB(sb)->s_fc_replay_state;
1441 for (i = 0; i < state->fc_modified_inodes_used; i++)
1442 if (state->fc_modified_inodes[i] == ino)
1443 return 0;
1444 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1445 state->fc_modified_inodes_size +=
1446 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1447 state->fc_modified_inodes = krealloc(
1448 state->fc_modified_inodes, sizeof(int) *
1449 state->fc_modified_inodes_size,
1450 GFP_KERNEL);
1451 if (!state->fc_modified_inodes)
1452 return -ENOMEM;
1453 }
1454 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1455 return 0;
1456}
1457
1458/*
1459 * Inode replay function
1460 */
1461static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1462 u8 *val)
1463{
1464 struct ext4_fc_inode fc_inode;
1465 struct ext4_inode *raw_inode;
1466 struct ext4_inode *raw_fc_inode;
1467 struct inode *inode = NULL;
1468 struct ext4_iloc iloc;
1469 int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1470 struct ext4_extent_header *eh;
1471
1472 memcpy(&fc_inode, val, sizeof(fc_inode));
1473
1474 ino = le32_to_cpu(fc_inode.fc_ino);
1475 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1476
1477 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1478 if (!IS_ERR(inode)) {
1479 ext4_ext_clear_bb(inode);
1480 iput(inode);
1481 }
1482 inode = NULL;
1483
1484 ext4_fc_record_modified_inode(sb, ino);
1485
1486 raw_fc_inode = (struct ext4_inode *)
1487 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1488 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1489 if (ret)
1490 goto out;
1491
1492 inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1493 raw_inode = ext4_raw_inode(&iloc);
1494
1495 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1496 memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1497 inode_len - offsetof(struct ext4_inode, i_generation));
1498 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1499 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1500 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1501 memset(eh, 0, sizeof(*eh));
1502 eh->eh_magic = EXT4_EXT_MAGIC;
1503 eh->eh_max = cpu_to_le16(
1504 (sizeof(raw_inode->i_block) -
1505 sizeof(struct ext4_extent_header))
1506 / sizeof(struct ext4_extent));
1507 }
1508 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1509 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1510 sizeof(raw_inode->i_block));
1511 }
1512
1513 /* Immediately update the inode on disk. */
1514 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1515 if (ret)
1516 goto out;
1517 ret = sync_dirty_buffer(iloc.bh);
1518 if (ret)
1519 goto out;
1520 ret = ext4_mark_inode_used(sb, ino);
1521 if (ret)
1522 goto out;
1523
1524 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1525 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1526 if (IS_ERR(inode)) {
1527 jbd_debug(1, "Inode not found.");
1528 return -EFSCORRUPTED;
1529 }
1530
1531 /*
1532 * Our allocator could have made different decisions than before
1533 * crashing. This should be fixed but until then, we calculate
1534 * the number of blocks the inode.
1535 */
1536 ext4_ext_replay_set_iblocks(inode);
1537
1538 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1539 ext4_reset_inode_seed(inode);
1540
1541 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1542 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1543 sync_dirty_buffer(iloc.bh);
1544 brelse(iloc.bh);
1545out:
1546 iput(inode);
1547 if (!ret)
1548 blkdev_issue_flush(sb->s_bdev);
1549
1550 return 0;
1551}
1552
1553/*
1554 * Dentry create replay function.
1555 *
1556 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1557 * inode for which we are trying to create a dentry here, should already have
1558 * been replayed before we start here.
1559 */
1560static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1561 u8 *val)
1562{
1563 int ret = 0;
1564 struct inode *inode = NULL;
1565 struct inode *dir = NULL;
1566 struct dentry_info_args darg;
1567
1568 tl_to_darg(&darg, tl, val);
1569
1570 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1571 darg.parent_ino, darg.dname_len);
1572
1573 /* This takes care of update group descriptor and other metadata */
1574 ret = ext4_mark_inode_used(sb, darg.ino);
1575 if (ret)
1576 goto out;
1577
1578 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1579 if (IS_ERR(inode)) {
1580 jbd_debug(1, "inode %d not found.", darg.ino);
1581 inode = NULL;
1582 ret = -EINVAL;
1583 goto out;
1584 }
1585
1586 if (S_ISDIR(inode->i_mode)) {
1587 /*
1588 * If we are creating a directory, we need to make sure that the
1589 * dot and dot dot dirents are setup properly.
1590 */
1591 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1592 if (IS_ERR(dir)) {
1593 jbd_debug(1, "Dir %d not found.", darg.ino);
1594 goto out;
1595 }
1596 ret = ext4_init_new_dir(NULL, dir, inode);
1597 iput(dir);
1598 if (ret) {
1599 ret = 0;
1600 goto out;
1601 }
1602 }
1603 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1604 if (ret)
1605 goto out;
1606 set_nlink(inode, 1);
1607 ext4_mark_inode_dirty(NULL, inode);
1608out:
1609 if (inode)
1610 iput(inode);
1611 return ret;
1612}
1613
1614/*
1615 * Record physical disk regions which are in use as per fast commit area. Our
1616 * simple replay phase allocator excludes these regions from allocation.
1617 */
1618static int ext4_fc_record_regions(struct super_block *sb, int ino,
1619 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len)
1620{
1621 struct ext4_fc_replay_state *state;
1622 struct ext4_fc_alloc_region *region;
1623
1624 state = &EXT4_SB(sb)->s_fc_replay_state;
1625 if (state->fc_regions_used == state->fc_regions_size) {
1626 state->fc_regions_size +=
1627 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1628 state->fc_regions = krealloc(
1629 state->fc_regions,
1630 state->fc_regions_size *
1631 sizeof(struct ext4_fc_alloc_region),
1632 GFP_KERNEL);
1633 if (!state->fc_regions)
1634 return -ENOMEM;
1635 }
1636 region = &state->fc_regions[state->fc_regions_used++];
1637 region->ino = ino;
1638 region->lblk = lblk;
1639 region->pblk = pblk;
1640 region->len = len;
1641
1642 return 0;
1643}
1644
1645/* Replay add range tag */
1646static int ext4_fc_replay_add_range(struct super_block *sb,
1647 struct ext4_fc_tl *tl, u8 *val)
1648{
1649 struct ext4_fc_add_range fc_add_ex;
1650 struct ext4_extent newex, *ex;
1651 struct inode *inode;
1652 ext4_lblk_t start, cur;
1653 int remaining, len;
1654 ext4_fsblk_t start_pblk;
1655 struct ext4_map_blocks map;
1656 struct ext4_ext_path *path = NULL;
1657 int ret;
1658
1659 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1660 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1661
1662 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1663 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1664 ext4_ext_get_actual_len(ex));
1665
1666 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1667 if (IS_ERR(inode)) {
1668 jbd_debug(1, "Inode not found.");
1669 return 0;
1670 }
1671
1672 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1673
1674 start = le32_to_cpu(ex->ee_block);
1675 start_pblk = ext4_ext_pblock(ex);
1676 len = ext4_ext_get_actual_len(ex);
1677
1678 cur = start;
1679 remaining = len;
1680 jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1681 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1682 inode->i_ino);
1683
1684 while (remaining > 0) {
1685 map.m_lblk = cur;
1686 map.m_len = remaining;
1687 map.m_pblk = 0;
1688 ret = ext4_map_blocks(NULL, inode, &map, 0);
1689
1690 if (ret < 0) {
1691 iput(inode);
1692 return 0;
1693 }
1694
1695 if (ret == 0) {
1696 /* Range is not mapped */
1697 path = ext4_find_extent(inode, cur, NULL, 0);
1698 if (IS_ERR(path)) {
1699 iput(inode);
1700 return 0;
1701 }
1702 memset(&newex, 0, sizeof(newex));
1703 newex.ee_block = cpu_to_le32(cur);
1704 ext4_ext_store_pblock(
1705 &newex, start_pblk + cur - start);
1706 newex.ee_len = cpu_to_le16(map.m_len);
1707 if (ext4_ext_is_unwritten(ex))
1708 ext4_ext_mark_unwritten(&newex);
1709 down_write(&EXT4_I(inode)->i_data_sem);
1710 ret = ext4_ext_insert_extent(
1711 NULL, inode, &path, &newex, 0);
1712 up_write((&EXT4_I(inode)->i_data_sem));
1713 ext4_ext_drop_refs(path);
1714 kfree(path);
1715 if (ret) {
1716 iput(inode);
1717 return 0;
1718 }
1719 goto next;
1720 }
1721
1722 if (start_pblk + cur - start != map.m_pblk) {
1723 /*
1724 * Logical to physical mapping changed. This can happen
1725 * if this range was removed and then reallocated to
1726 * map to new physical blocks during a fast commit.
1727 */
1728 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1729 ext4_ext_is_unwritten(ex),
1730 start_pblk + cur - start);
1731 if (ret) {
1732 iput(inode);
1733 return 0;
1734 }
1735 /*
1736 * Mark the old blocks as free since they aren't used
1737 * anymore. We maintain an array of all the modified
1738 * inodes. In case these blocks are still used at either
1739 * a different logical range in the same inode or in
1740 * some different inode, we will mark them as allocated
1741 * at the end of the FC replay using our array of
1742 * modified inodes.
1743 */
1744 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1745 goto next;
1746 }
1747
1748 /* Range is mapped and needs a state change */
1749 jbd_debug(1, "Converting from %ld to %d %lld",
1750 map.m_flags & EXT4_MAP_UNWRITTEN,
1751 ext4_ext_is_unwritten(ex), map.m_pblk);
1752 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1753 ext4_ext_is_unwritten(ex), map.m_pblk);
1754 if (ret) {
1755 iput(inode);
1756 return 0;
1757 }
1758 /*
1759 * We may have split the extent tree while toggling the state.
1760 * Try to shrink the extent tree now.
1761 */
1762 ext4_ext_replay_shrink_inode(inode, start + len);
1763next:
1764 cur += map.m_len;
1765 remaining -= map.m_len;
1766 }
1767 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1768 sb->s_blocksize_bits);
1769 iput(inode);
1770 return 0;
1771}
1772
1773/* Replay DEL_RANGE tag */
1774static int
1775ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1776 u8 *val)
1777{
1778 struct inode *inode;
1779 struct ext4_fc_del_range lrange;
1780 struct ext4_map_blocks map;
1781 ext4_lblk_t cur, remaining;
1782 int ret;
1783
1784 memcpy(&lrange, val, sizeof(lrange));
1785 cur = le32_to_cpu(lrange.fc_lblk);
1786 remaining = le32_to_cpu(lrange.fc_len);
1787
1788 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1789 le32_to_cpu(lrange.fc_ino), cur, remaining);
1790
1791 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1792 if (IS_ERR(inode)) {
1793 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1794 return 0;
1795 }
1796
1797 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1798
1799 jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1800 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1801 le32_to_cpu(lrange.fc_len));
1802 while (remaining > 0) {
1803 map.m_lblk = cur;
1804 map.m_len = remaining;
1805
1806 ret = ext4_map_blocks(NULL, inode, &map, 0);
1807 if (ret < 0) {
1808 iput(inode);
1809 return 0;
1810 }
1811 if (ret > 0) {
1812 remaining -= ret;
1813 cur += ret;
1814 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1815 } else {
1816 remaining -= map.m_len;
1817 cur += map.m_len;
1818 }
1819 }
1820
1821 ret = ext4_punch_hole(inode,
1822 le32_to_cpu(lrange.fc_lblk) << sb->s_blocksize_bits,
1823 le32_to_cpu(lrange.fc_len) << sb->s_blocksize_bits);
1824 if (ret)
1825 jbd_debug(1, "ext4_punch_hole returned %d", ret);
1826 ext4_ext_replay_shrink_inode(inode,
1827 i_size_read(inode) >> sb->s_blocksize_bits);
1828 ext4_mark_inode_dirty(NULL, inode);
1829 iput(inode);
1830
1831 return 0;
1832}
1833
1834static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1835{
1836 struct ext4_fc_replay_state *state;
1837 struct inode *inode;
1838 struct ext4_ext_path *path = NULL;
1839 struct ext4_map_blocks map;
1840 int i, ret, j;
1841 ext4_lblk_t cur, end;
1842
1843 state = &EXT4_SB(sb)->s_fc_replay_state;
1844 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1845 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1846 EXT4_IGET_NORMAL);
1847 if (IS_ERR(inode)) {
1848 jbd_debug(1, "Inode %d not found.",
1849 state->fc_modified_inodes[i]);
1850 continue;
1851 }
1852 cur = 0;
1853 end = EXT_MAX_BLOCKS;
1854 while (cur < end) {
1855 map.m_lblk = cur;
1856 map.m_len = end - cur;
1857
1858 ret = ext4_map_blocks(NULL, inode, &map, 0);
1859 if (ret < 0)
1860 break;
1861
1862 if (ret > 0) {
1863 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1864 if (!IS_ERR(path)) {
1865 for (j = 0; j < path->p_depth; j++)
1866 ext4_mb_mark_bb(inode->i_sb,
1867 path[j].p_block, 1, 1);
1868 ext4_ext_drop_refs(path);
1869 kfree(path);
1870 }
1871 cur += ret;
1872 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1873 map.m_len, 1);
1874 } else {
1875 cur = cur + (map.m_len ? map.m_len : 1);
1876 }
1877 }
1878 iput(inode);
1879 }
1880}
1881
1882/*
1883 * Check if block is in excluded regions for block allocation. The simple
1884 * allocator that runs during replay phase is calls this function to see
1885 * if it is okay to use a block.
1886 */
1887bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1888{
1889 int i;
1890 struct ext4_fc_replay_state *state;
1891
1892 state = &EXT4_SB(sb)->s_fc_replay_state;
1893 for (i = 0; i < state->fc_regions_valid; i++) {
1894 if (state->fc_regions[i].ino == 0 ||
1895 state->fc_regions[i].len == 0)
1896 continue;
1897 if (blk >= state->fc_regions[i].pblk &&
1898 blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1899 return true;
1900 }
1901 return false;
1902}
1903
1904/* Cleanup function called after replay */
1905void ext4_fc_replay_cleanup(struct super_block *sb)
1906{
1907 struct ext4_sb_info *sbi = EXT4_SB(sb);
1908
1909 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1910 kfree(sbi->s_fc_replay_state.fc_regions);
1911 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1912}
1913
1914/*
1915 * Recovery Scan phase handler
1916 *
1917 * This function is called during the scan phase and is responsible
1918 * for doing following things:
1919 * - Make sure the fast commit area has valid tags for replay
1920 * - Count number of tags that need to be replayed by the replay handler
1921 * - Verify CRC
1922 * - Create a list of excluded blocks for allocation during replay phase
1923 *
1924 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1925 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1926 * to indicate that scan has finished and JBD2 can now start replay phase.
1927 * It returns a negative error to indicate that there was an error. At the end
1928 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1929 * to indicate the number of tags that need to replayed during the replay phase.
1930 */
1931static int ext4_fc_replay_scan(journal_t *journal,
1932 struct buffer_head *bh, int off,
1933 tid_t expected_tid)
1934{
1935 struct super_block *sb = journal->j_private;
1936 struct ext4_sb_info *sbi = EXT4_SB(sb);
1937 struct ext4_fc_replay_state *state;
1938 int ret = JBD2_FC_REPLAY_CONTINUE;
1939 struct ext4_fc_add_range ext;
1940 struct ext4_fc_tl tl;
1941 struct ext4_fc_tail tail;
1942 __u8 *start, *end, *cur, *val;
1943 struct ext4_fc_head head;
1944 struct ext4_extent *ex;
1945
1946 state = &sbi->s_fc_replay_state;
1947
1948 start = (u8 *)bh->b_data;
1949 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1950
1951 if (state->fc_replay_expected_off == 0) {
1952 state->fc_cur_tag = 0;
1953 state->fc_replay_num_tags = 0;
1954 state->fc_crc = 0;
1955 state->fc_regions = NULL;
1956 state->fc_regions_valid = state->fc_regions_used =
1957 state->fc_regions_size = 0;
1958 /* Check if we can stop early */
1959 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1960 != EXT4_FC_TAG_HEAD)
1961 return 0;
1962 }
1963
1964 if (off != state->fc_replay_expected_off) {
1965 ret = -EFSCORRUPTED;
1966 goto out_err;
1967 }
1968
1969 state->fc_replay_expected_off++;
1970 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1971 memcpy(&tl, cur, sizeof(tl));
1972 val = cur + sizeof(tl);
1973 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1974 tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1975 switch (le16_to_cpu(tl.fc_tag)) {
1976 case EXT4_FC_TAG_ADD_RANGE:
1977 memcpy(&ext, val, sizeof(ext));
1978 ex = (struct ext4_extent *)&ext.fc_ex;
1979 ret = ext4_fc_record_regions(sb,
1980 le32_to_cpu(ext.fc_ino),
1981 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1982 ext4_ext_get_actual_len(ex));
1983 if (ret < 0)
1984 break;
1985 ret = JBD2_FC_REPLAY_CONTINUE;
1986 fallthrough;
1987 case EXT4_FC_TAG_DEL_RANGE:
1988 case EXT4_FC_TAG_LINK:
1989 case EXT4_FC_TAG_UNLINK:
1990 case EXT4_FC_TAG_CREAT:
1991 case EXT4_FC_TAG_INODE:
1992 case EXT4_FC_TAG_PAD:
1993 state->fc_cur_tag++;
1994 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1995 sizeof(tl) + le16_to_cpu(tl.fc_len));
1996 break;
1997 case EXT4_FC_TAG_TAIL:
1998 state->fc_cur_tag++;
1999 memcpy(&tail, val, sizeof(tail));
2000 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2001 sizeof(tl) +
2002 offsetof(struct ext4_fc_tail,
2003 fc_crc));
2004 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2005 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2006 state->fc_replay_num_tags = state->fc_cur_tag;
2007 state->fc_regions_valid =
2008 state->fc_regions_used;
2009 } else {
2010 ret = state->fc_replay_num_tags ?
2011 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2012 }
2013 state->fc_crc = 0;
2014 break;
2015 case EXT4_FC_TAG_HEAD:
2016 memcpy(&head, val, sizeof(head));
2017 if (le32_to_cpu(head.fc_features) &
2018 ~EXT4_FC_SUPPORTED_FEATURES) {
2019 ret = -EOPNOTSUPP;
2020 break;
2021 }
2022 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2023 ret = JBD2_FC_REPLAY_STOP;
2024 break;
2025 }
2026 state->fc_cur_tag++;
2027 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2028 sizeof(tl) + le16_to_cpu(tl.fc_len));
2029 break;
2030 default:
2031 ret = state->fc_replay_num_tags ?
2032 JBD2_FC_REPLAY_STOP : -ECANCELED;
2033 }
2034 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2035 break;
2036 }
2037
2038out_err:
2039 trace_ext4_fc_replay_scan(sb, ret, off);
2040 return ret;
2041}
2042
2043/*
2044 * Main recovery path entry point.
2045 * The meaning of return codes is similar as above.
2046 */
2047static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2048 enum passtype pass, int off, tid_t expected_tid)
2049{
2050 struct super_block *sb = journal->j_private;
2051 struct ext4_sb_info *sbi = EXT4_SB(sb);
2052 struct ext4_fc_tl tl;
2053 __u8 *start, *end, *cur, *val;
2054 int ret = JBD2_FC_REPLAY_CONTINUE;
2055 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2056 struct ext4_fc_tail tail;
2057
2058 if (pass == PASS_SCAN) {
2059 state->fc_current_pass = PASS_SCAN;
2060 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2061 }
2062
2063 if (state->fc_current_pass != pass) {
2064 state->fc_current_pass = pass;
2065 sbi->s_mount_state |= EXT4_FC_REPLAY;
2066 }
2067 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2068 jbd_debug(1, "Replay stops\n");
2069 ext4_fc_set_bitmaps_and_counters(sb);
2070 return 0;
2071 }
2072
2073#ifdef CONFIG_EXT4_DEBUG
2074 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2075 pr_warn("Dropping fc block %d because max_replay set\n", off);
2076 return JBD2_FC_REPLAY_STOP;
2077 }
2078#endif
2079
2080 start = (u8 *)bh->b_data;
2081 end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2082
2083 for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2084 memcpy(&tl, cur, sizeof(tl));
2085 val = cur + sizeof(tl);
2086
2087 if (state->fc_replay_num_tags == 0) {
2088 ret = JBD2_FC_REPLAY_STOP;
2089 ext4_fc_set_bitmaps_and_counters(sb);
2090 break;
2091 }
2092 jbd_debug(3, "Replay phase, tag:%s\n",
2093 tag2str(le16_to_cpu(tl.fc_tag)));
2094 state->fc_replay_num_tags--;
2095 switch (le16_to_cpu(tl.fc_tag)) {
2096 case EXT4_FC_TAG_LINK:
2097 ret = ext4_fc_replay_link(sb, &tl, val);
2098 break;
2099 case EXT4_FC_TAG_UNLINK:
2100 ret = ext4_fc_replay_unlink(sb, &tl, val);
2101 break;
2102 case EXT4_FC_TAG_ADD_RANGE:
2103 ret = ext4_fc_replay_add_range(sb, &tl, val);
2104 break;
2105 case EXT4_FC_TAG_CREAT:
2106 ret = ext4_fc_replay_create(sb, &tl, val);
2107 break;
2108 case EXT4_FC_TAG_DEL_RANGE:
2109 ret = ext4_fc_replay_del_range(sb, &tl, val);
2110 break;
2111 case EXT4_FC_TAG_INODE:
2112 ret = ext4_fc_replay_inode(sb, &tl, val);
2113 break;
2114 case EXT4_FC_TAG_PAD:
2115 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2116 le16_to_cpu(tl.fc_len), 0);
2117 break;
2118 case EXT4_FC_TAG_TAIL:
2119 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2120 le16_to_cpu(tl.fc_len), 0);
2121 memcpy(&tail, val, sizeof(tail));
2122 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2123 break;
2124 case EXT4_FC_TAG_HEAD:
2125 break;
2126 default:
2127 trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2128 le16_to_cpu(tl.fc_len), 0);
2129 ret = -ECANCELED;
2130 break;
2131 }
2132 if (ret < 0)
2133 break;
2134 ret = JBD2_FC_REPLAY_CONTINUE;
2135 }
2136 return ret;
2137}
2138
2139void ext4_fc_init(struct super_block *sb, journal_t *journal)
2140{
2141 /*
2142 * We set replay callback even if fast commit disabled because we may
2143 * could still have fast commit blocks that need to be replayed even if
2144 * fast commit has now been turned off.
2145 */
2146 journal->j_fc_replay_callback = ext4_fc_replay;
2147 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2148 return;
2149 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2150}
2151
/*
 * Human readable names of the fast commit ineligibility reasons, printed by
 * ext4_fc_info_show(), which indexes this table with every value below
 * EXT4_FC_REASON_MAX. Both the strings and the pointer table are read-only.
 */
static const char * const fc_ineligible_reasons[] = {
	"Extended attributes changed",
	"Cross rename",
	"Journal flag changed",
	"Insufficient memory",
	"Swap boot",
	"Resize",
	"Dir renamed",
	"Falloc range op",
	"Data journalling",
	"FC Commit Failed"
};
2164
2165int ext4_fc_info_show(struct seq_file *seq, void *v)
2166{
2167 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2168 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2169 int i;
2170
2171 if (v != SEQ_START_TOKEN)
2172 return 0;
2173
2174 seq_printf(seq,
2175 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2176 stats->fc_num_commits, stats->fc_ineligible_commits,
2177 stats->fc_numblks,
2178 div_u64(sbi->s_fc_avg_commit_time, 1000));
2179 seq_puts(seq, "Ineligible reasons:\n");
2180 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2181 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2182 stats->fc_ineligible_reason_count[i]);
2183
2184 return 0;
2185}
2186
2187int __init ext4_fc_init_dentry_cache(void)
2188{
2189 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2190 SLAB_RECLAIM_ACCOUNT);
2191
2192 if (ext4_fc_dentry_cachep == NULL)
2193 return -ENOMEM;
2194
2195 return 0;
2196}
1// SPDX-License-Identifier: GPL-2.0
2
3/*
4 * fs/ext4/fast_commit.c
5 *
6 * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7 *
8 * Ext4 fast commits routines.
9 */
10#include "ext4.h"
11#include "ext4_jbd2.h"
12#include "ext4_extents.h"
13#include "mballoc.h"
14
15/*
16 * Ext4 Fast Commits
17 * -----------------
18 *
19 * Ext4 fast commits implement fine grained journalling for Ext4.
20 *
21 * Fast commits are organized as a log of tag-length-value (TLV) structs. (See
22 * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by
23 * TLV during the recovery phase. For the scenarios for which we currently
24 * don't have replay code, fast commit falls back to full commits.
25 * Fast commits record delta in one of the following three categories.
26 *
27 * (A) Directory entry updates:
28 *
29 * - EXT4_FC_TAG_UNLINK - records directory entry unlink
30 * - EXT4_FC_TAG_LINK - records directory entry link
31 * - EXT4_FC_TAG_CREAT - records inode and directory entry creation
32 *
33 * (B) File specific data range updates:
34 *
35 * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode
36 * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode
37 *
38 * (C) Inode metadata (mtime / ctime etc):
39 *
40 * - EXT4_FC_TAG_INODE - record the inode that should be replayed
41 * during recovery. Note that iblocks field is
42 * not replayed and instead derived during
43 * replay.
44 * Commit Operation
45 * ----------------
46 * With fast commits, we maintain all the directory entry operations in the
47 * order in which they are issued in an in-memory queue. This queue is flushed
48 * to disk during the commit operation. We also maintain a list of inodes
49 * that need to be committed during a fast commit in another in memory queue of
50 * inodes. During the commit operation, we commit in the following order:
51 *
52 * [1] Lock inodes for any further data updates by setting COMMITTING state
53 * [2] Submit data buffers of all the inodes
54 * [3] Wait for [2] to complete
55 * [4] Commit all the directory entry updates in the fast commit space
56 * [5] Commit all the changed inode structures
57 * [6] Write tail tag (this tag ensures the atomicity, please read the following
58 * section for more details).
59 * [7] Wait for [4], [5] and [6] to complete.
60 *
61 * All the inode updates must call ext4_fc_start_update() before starting an
62 * update. If such an ongoing update is present, fast commit waits for it to
63 * complete. The completion of such an update is marked by
64 * ext4_fc_stop_update().
65 *
66 * Fast Commit Ineligibility
67 * -------------------------
68 *
69 * Not all operations are supported by fast commits today (e.g. extended
70 * attributes). Fast commit ineligibility is marked by calling
71 * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back
72 * to full commit.
73 *
74 * Atomicity of commits
75 * --------------------
76 * In order to guarantee atomicity during the commit operation, fast commit
77 * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail
78 * tag contains CRC of the contents and TID of the transaction after which
79 * this fast commit should be applied. Recovery code replays fast commit
80 * logs only if there's at least 1 valid tail present. For every fast commit
81 * operation, there is 1 tail. This means, we may end up with multiple tails
82 * in the fast commit space. Here's an example:
83 *
84 * - Create a new file A and remove existing file B
85 * - fsync()
86 * - Append contents to file A
87 * - Truncate file A
88 * - fsync()
89 *
90 * The fast commit space at the end of above operations would look like this:
91 * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
92 * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->|
93 *
94 * Replay code should thus check for all the valid tails in the FC area.
95 *
96 * Fast Commit Replay Idempotence
97 * ------------------------------
98 *
99 * Fast commit tags are idempotent in nature provided the recovery code follows
100 * certain rules. The guiding principle that the commit path follows while
101 * committing is that it stores the result of a particular operation instead of
102 * storing the procedure.
103 *
104 * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
105 * was associated with inode 10. During fast commit, instead of storing this
106 * operation as a procedure "rename a to b", we store the resulting file system
107 * state as a "series" of outcomes:
108 *
109 * - Link dirent b to inode 10
110 * - Unlink dirent a
111 * - Inode <10> with valid refcount
112 *
113 * Now when recovery code runs, it needs to "enforce" this state on the file
114 * system. This is what guarantees idempotence of fast commit replay.
115 *
116 * Let's take an example of a procedure that is not idempotent and see how fast
117 * commits make it idempotent. Consider following sequence of operations:
118 *
119 * rm A; mv B A; read A
120 * (x) (y) (z)
121 *
122 * (x), (y) and (z) are the points at which we can crash. If we store this
123 * sequence of operations as is then the replay is not idempotent. Let's say
124 * while in replay, we crash at (z). During the second replay, file A (which was
125 * actually created as a result of "mv B A" operation) would get deleted. Thus,
126 * file named A would be absent when we try to read A. So, this sequence of
127 * operations is not idempotent. However, as mentioned above, instead of storing
128 * the procedure fast commits store the outcome of each procedure. Thus the fast
129 * commit log for above procedure would be as follows:
130 *
131 * (Let's assume dirent A was linked to inode 10 and dirent B was linked to
132 * inode 11 before the replay)
133 *
134 * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
135 * (w) (x) (y) (z)
136 *
137 * If we crash at (z), we will have file A linked to inode 11. During the second
138 * replay, we will remove file A (inode 11). But we will create it back and make
139 * it point to inode 11. We won't find B, so we'll just skip that step. At this
140 * point, the refcount for inode 11 is not reliable, but that gets fixed by the
141 * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
142 * similarly. Thus, by converting a non-idempotent procedure into a series of
143 * idempotent outcomes, fast commits ensured idempotence during the replay.
144 *
145 * TODOs
146 * -----
147 *
148 * 0) Fast commit replay path hardening: Fast commit replay code should use
149 * journal handles to make sure all the updates it does during the replay
150 * path are atomic. With that if we crash during fast commit replay, after
151 * trying to do recovery again, we will find a file system where fast commit
152 * area is invalid (because new full commit would be found). In order to deal
153 * with that, fast commit replay code should ensure that the "FC_REPLAY"
154 * superblock state is persisted before starting the replay, so that after
155 * the crash, fast commit recovery code can look at that flag and perform
156 * fast commit recovery even if that area is invalidated by later full
157 * commits.
158 *
159 * 1) Fast commit's commit path locks the entire file system during fast
160 * commit. This has significant performance penalty. Instead of that, we
161 * should use ext4_fc_start/stop_update functions to start inode level
162 * updates from ext4_journal_start/stop. Once we do that we can drop file
163 * system locking during commit path.
164 *
165 * 2) Handle more ineligible cases.
166 */
167
168#include <trace/events/ext4.h>
169static struct kmem_cache *ext4_fc_dentry_cachep;
170
171static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
172{
173 BUFFER_TRACE(bh, "");
174 if (uptodate) {
175 ext4_debug("%s: Block %lld up-to-date",
176 __func__, bh->b_blocknr);
177 set_buffer_uptodate(bh);
178 } else {
179 ext4_debug("%s: Block %lld not up-to-date",
180 __func__, bh->b_blocknr);
181 clear_buffer_uptodate(bh);
182 }
183
184 unlock_buffer(bh);
185}
186
187static inline void ext4_fc_reset_inode(struct inode *inode)
188{
189 struct ext4_inode_info *ei = EXT4_I(inode);
190
191 ei->i_fc_lblk_start = 0;
192 ei->i_fc_lblk_len = 0;
193}
194
195void ext4_fc_init_inode(struct inode *inode)
196{
197 struct ext4_inode_info *ei = EXT4_I(inode);
198
199 ext4_fc_reset_inode(inode);
200 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
201 INIT_LIST_HEAD(&ei->i_fc_list);
202 INIT_LIST_HEAD(&ei->i_fc_dilist);
203 init_waitqueue_head(&ei->i_fc_wait);
204 atomic_set(&ei->i_fc_updates, 0);
205}
206
207/* This function must be called with sbi->s_fc_lock held. */
208static void ext4_fc_wait_committing_inode(struct inode *inode)
209__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
210{
211 wait_queue_head_t *wq;
212 struct ext4_inode_info *ei = EXT4_I(inode);
213
214#if (BITS_PER_LONG < 64)
215 DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
216 EXT4_STATE_FC_COMMITTING);
217 wq = bit_waitqueue(&ei->i_state_flags,
218 EXT4_STATE_FC_COMMITTING);
219#else
220 DEFINE_WAIT_BIT(wait, &ei->i_flags,
221 EXT4_STATE_FC_COMMITTING);
222 wq = bit_waitqueue(&ei->i_flags,
223 EXT4_STATE_FC_COMMITTING);
224#endif
225 lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
226 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
227 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
228 schedule();
229 finish_wait(wq, &wait.wq_entry);
230}
231
232static bool ext4_fc_disabled(struct super_block *sb)
233{
234 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
235 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY));
236}
237
238/*
239 * Inform Ext4's fast about start of an inode update
240 *
241 * This function is called by the high level call VFS callbacks before
242 * performing any inode update. This function blocks if there's an ongoing
243 * fast commit on the inode in question.
244 */
245void ext4_fc_start_update(struct inode *inode)
246{
247 struct ext4_inode_info *ei = EXT4_I(inode);
248
249 if (ext4_fc_disabled(inode->i_sb))
250 return;
251
252restart:
253 spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
254 if (list_empty(&ei->i_fc_list))
255 goto out;
256
257 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
258 ext4_fc_wait_committing_inode(inode);
259 goto restart;
260 }
261out:
262 atomic_inc(&ei->i_fc_updates);
263 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
264}
265
266/*
267 * Stop inode update and wake up waiting fast commits if any.
268 */
269void ext4_fc_stop_update(struct inode *inode)
270{
271 struct ext4_inode_info *ei = EXT4_I(inode);
272
273 if (ext4_fc_disabled(inode->i_sb))
274 return;
275
276 if (atomic_dec_and_test(&ei->i_fc_updates))
277 wake_up_all(&ei->i_fc_wait);
278}
279
280/*
281 * Remove inode from fast commit list. If the inode is being committed
282 * we wait until inode commit is done.
283 */
284void ext4_fc_del(struct inode *inode)
285{
286 struct ext4_inode_info *ei = EXT4_I(inode);
287 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
288 struct ext4_fc_dentry_update *fc_dentry;
289
290 if (ext4_fc_disabled(inode->i_sb))
291 return;
292
293restart:
294 spin_lock(&sbi->s_fc_lock);
295 if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
296 spin_unlock(&sbi->s_fc_lock);
297 return;
298 }
299
300 if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
301 ext4_fc_wait_committing_inode(inode);
302 goto restart;
303 }
304
305 if (!list_empty(&ei->i_fc_list))
306 list_del_init(&ei->i_fc_list);
307
308 /*
309 * Since this inode is getting removed, let's also remove all FC
310 * dentry create references, since it is not needed to log it anyways.
311 */
312 if (list_empty(&ei->i_fc_dilist)) {
313 spin_unlock(&sbi->s_fc_lock);
314 return;
315 }
316
317 fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
318 WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
319 list_del_init(&fc_dentry->fcd_list);
320 list_del_init(&fc_dentry->fcd_dilist);
321
322 WARN_ON(!list_empty(&ei->i_fc_dilist));
323 spin_unlock(&sbi->s_fc_lock);
324
325 if (fc_dentry->fcd_name.name &&
326 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
327 kfree(fc_dentry->fcd_name.name);
328 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
329
330 return;
331}
332
333/*
334 * Mark file system as fast commit ineligible, and record latest
335 * ineligible transaction tid. This means until the recorded
336 * transaction, commit operation would result in a full jbd2 commit.
337 */
338void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle)
339{
340 struct ext4_sb_info *sbi = EXT4_SB(sb);
341 tid_t tid;
342 bool has_transaction = true;
343 bool is_ineligible;
344
345 if (ext4_fc_disabled(sb))
346 return;
347
348 if (handle && !IS_ERR(handle))
349 tid = handle->h_transaction->t_tid;
350 else {
351 read_lock(&sbi->s_journal->j_state_lock);
352 if (sbi->s_journal->j_running_transaction)
353 tid = sbi->s_journal->j_running_transaction->t_tid;
354 else
355 has_transaction = false;
356 read_unlock(&sbi->s_journal->j_state_lock);
357 }
358 spin_lock(&sbi->s_fc_lock);
359 is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
360 if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
361 sbi->s_fc_ineligible_tid = tid;
362 ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
363 spin_unlock(&sbi->s_fc_lock);
364 WARN_ON(reason >= EXT4_FC_REASON_MAX);
365 sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
366}
367
368/*
369 * Generic fast commit tracking function. If this is the first time this we are
370 * called after a full commit, we initialize fast commit fields and then call
371 * __fc_track_fn() with update = 0. If we have already been called after a full
372 * commit, we pass update = 1. Based on that, the track function can determine
373 * if it needs to track a field for the first time or if it needs to just
374 * update the previously tracked value.
375 *
376 * If enqueue is set, this function enqueues the inode in fast commit list.
377 */
378static int ext4_fc_track_template(
379 handle_t *handle, struct inode *inode,
380 int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool),
381 void *args, int enqueue)
382{
383 bool update = false;
384 struct ext4_inode_info *ei = EXT4_I(inode);
385 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
386 tid_t tid = 0;
387 int ret;
388
389 tid = handle->h_transaction->t_tid;
390 mutex_lock(&ei->i_fc_lock);
391 if (tid == ei->i_sync_tid) {
392 update = true;
393 } else {
394 ext4_fc_reset_inode(inode);
395 ei->i_sync_tid = tid;
396 }
397 ret = __fc_track_fn(handle, inode, args, update);
398 mutex_unlock(&ei->i_fc_lock);
399
400 if (!enqueue)
401 return ret;
402
403 spin_lock(&sbi->s_fc_lock);
404 if (list_empty(&EXT4_I(inode)->i_fc_list))
405 list_add_tail(&EXT4_I(inode)->i_fc_list,
406 (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
407 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
408 &sbi->s_fc_q[FC_Q_STAGING] :
409 &sbi->s_fc_q[FC_Q_MAIN]);
410 spin_unlock(&sbi->s_fc_lock);
411
412 return ret;
413}
414
415struct __track_dentry_update_args {
416 struct dentry *dentry;
417 int op;
418};
419
420/* __track_fn for directory entry updates. Called with ei->i_fc_lock. */
421static int __track_dentry_update(handle_t *handle, struct inode *inode,
422 void *arg, bool update)
423{
424 struct ext4_fc_dentry_update *node;
425 struct ext4_inode_info *ei = EXT4_I(inode);
426 struct __track_dentry_update_args *dentry_update =
427 (struct __track_dentry_update_args *)arg;
428 struct dentry *dentry = dentry_update->dentry;
429 struct inode *dir = dentry->d_parent->d_inode;
430 struct super_block *sb = inode->i_sb;
431 struct ext4_sb_info *sbi = EXT4_SB(sb);
432
433 mutex_unlock(&ei->i_fc_lock);
434
435 if (IS_ENCRYPTED(dir)) {
436 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
437 handle);
438 mutex_lock(&ei->i_fc_lock);
439 return -EOPNOTSUPP;
440 }
441
442 node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
443 if (!node) {
444 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
445 mutex_lock(&ei->i_fc_lock);
446 return -ENOMEM;
447 }
448
449 node->fcd_op = dentry_update->op;
450 node->fcd_parent = dir->i_ino;
451 node->fcd_ino = inode->i_ino;
452 if (dentry->d_name.len > DNAME_INLINE_LEN) {
453 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
454 if (!node->fcd_name.name) {
455 kmem_cache_free(ext4_fc_dentry_cachep, node);
456 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
457 mutex_lock(&ei->i_fc_lock);
458 return -ENOMEM;
459 }
460 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
461 dentry->d_name.len);
462 } else {
463 memcpy(node->fcd_iname, dentry->d_name.name,
464 dentry->d_name.len);
465 node->fcd_name.name = node->fcd_iname;
466 }
467 node->fcd_name.len = dentry->d_name.len;
468 INIT_LIST_HEAD(&node->fcd_dilist);
469 spin_lock(&sbi->s_fc_lock);
470 if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
471 sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
472 list_add_tail(&node->fcd_list,
473 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
474 else
475 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
476
477 /*
478 * This helps us keep a track of all fc_dentry updates which is part of
479 * this ext4 inode. So in case the inode is getting unlinked, before
480 * even we get a chance to fsync, we could remove all fc_dentry
481 * references while evicting the inode in ext4_fc_del().
482 * Also with this, we don't need to loop over all the inodes in
483 * sbi->s_fc_q to get the corresponding inode in
484 * ext4_fc_commit_dentry_updates().
485 */
486 if (dentry_update->op == EXT4_FC_TAG_CREAT) {
487 WARN_ON(!list_empty(&ei->i_fc_dilist));
488 list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
489 }
490 spin_unlock(&sbi->s_fc_lock);
491 mutex_lock(&ei->i_fc_lock);
492
493 return 0;
494}
495
496void __ext4_fc_track_unlink(handle_t *handle,
497 struct inode *inode, struct dentry *dentry)
498{
499 struct __track_dentry_update_args args;
500 int ret;
501
502 args.dentry = dentry;
503 args.op = EXT4_FC_TAG_UNLINK;
504
505 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
506 (void *)&args, 0);
507 trace_ext4_fc_track_unlink(handle, inode, dentry, ret);
508}
509
510void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
511{
512 struct inode *inode = d_inode(dentry);
513
514 if (ext4_fc_disabled(inode->i_sb))
515 return;
516
517 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
518 return;
519
520 __ext4_fc_track_unlink(handle, inode, dentry);
521}
522
523void __ext4_fc_track_link(handle_t *handle,
524 struct inode *inode, struct dentry *dentry)
525{
526 struct __track_dentry_update_args args;
527 int ret;
528
529 args.dentry = dentry;
530 args.op = EXT4_FC_TAG_LINK;
531
532 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
533 (void *)&args, 0);
534 trace_ext4_fc_track_link(handle, inode, dentry, ret);
535}
536
537void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
538{
539 struct inode *inode = d_inode(dentry);
540
541 if (ext4_fc_disabled(inode->i_sb))
542 return;
543
544 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
545 return;
546
547 __ext4_fc_track_link(handle, inode, dentry);
548}
549
550void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
551 struct dentry *dentry)
552{
553 struct __track_dentry_update_args args;
554 int ret;
555
556 args.dentry = dentry;
557 args.op = EXT4_FC_TAG_CREAT;
558
559 ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
560 (void *)&args, 0);
561 trace_ext4_fc_track_create(handle, inode, dentry, ret);
562}
563
564void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
565{
566 struct inode *inode = d_inode(dentry);
567
568 if (ext4_fc_disabled(inode->i_sb))
569 return;
570
571 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
572 return;
573
574 __ext4_fc_track_create(handle, inode, dentry);
575}
576
577/* __track_fn for inode tracking */
578static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
579 bool update)
580{
581 if (update)
582 return -EEXIST;
583
584 EXT4_I(inode)->i_fc_lblk_len = 0;
585
586 return 0;
587}
588
589void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
590{
591 int ret;
592
593 if (S_ISDIR(inode->i_mode))
594 return;
595
596 if (ext4_fc_disabled(inode->i_sb))
597 return;
598
599 if (ext4_should_journal_data(inode)) {
600 ext4_fc_mark_ineligible(inode->i_sb,
601 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle);
602 return;
603 }
604
605 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
606 return;
607
608 ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
609 trace_ext4_fc_track_inode(handle, inode, ret);
610}
611
612struct __track_range_args {
613 ext4_lblk_t start, end;
614};
615
616/* __track_fn for tracking data updates */
617static int __track_range(handle_t *handle, struct inode *inode, void *arg,
618 bool update)
619{
620 struct ext4_inode_info *ei = EXT4_I(inode);
621 ext4_lblk_t oldstart;
622 struct __track_range_args *__arg =
623 (struct __track_range_args *)arg;
624
625 if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
626 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
627 return -ECANCELED;
628 }
629
630 oldstart = ei->i_fc_lblk_start;
631
632 if (update && ei->i_fc_lblk_len > 0) {
633 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
634 ei->i_fc_lblk_len =
635 max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
636 ei->i_fc_lblk_start + 1;
637 } else {
638 ei->i_fc_lblk_start = __arg->start;
639 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
640 }
641
642 return 0;
643}
644
645void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
646 ext4_lblk_t end)
647{
648 struct __track_range_args args;
649 int ret;
650
651 if (S_ISDIR(inode->i_mode))
652 return;
653
654 if (ext4_fc_disabled(inode->i_sb))
655 return;
656
657 if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
658 return;
659
660 if (ext4_has_inline_data(inode)) {
661 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
662 handle);
663 return;
664 }
665
666 args.start = start;
667 args.end = end;
668
669 ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1);
670
671 trace_ext4_fc_track_range(handle, inode, start, end, ret);
672}
673
674static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
675{
676 blk_opf_t write_flags = REQ_SYNC;
677 struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
678
679 /* Add REQ_FUA | REQ_PREFLUSH only its tail */
680 if (test_opt(sb, BARRIER) && is_tail)
681 write_flags |= REQ_FUA | REQ_PREFLUSH;
682 lock_buffer(bh);
683 set_buffer_dirty(bh);
684 set_buffer_uptodate(bh);
685 bh->b_end_io = ext4_end_buffer_io_sync;
686 submit_bh(REQ_OP_WRITE | write_flags, bh);
687 EXT4_SB(sb)->s_fc_bh = NULL;
688}
689
690/* Ext4 commit path routines */
691
692/*
693 * Allocate len bytes on a fast commit buffer.
694 *
695 * During the commit time this function is used to manage fast commit
696 * block space. We don't split a fast commit log onto different
697 * blocks. So this function makes sure that if there's not enough space
698 * on the current block, the remaining space in the current block is
699 * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case,
700 * new block is from jbd2 and CRC is updated to reflect the padding
701 * we added.
702 */
703static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
704{
705 struct ext4_fc_tl tl;
706 struct ext4_sb_info *sbi = EXT4_SB(sb);
707 struct buffer_head *bh;
708 int bsize = sbi->s_journal->j_blocksize;
709 int ret, off = sbi->s_fc_bytes % bsize;
710 int remaining;
711 u8 *dst;
712
713 /*
714 * If 'len' is too long to fit in any block alongside a PAD tlv, then we
715 * cannot fulfill the request.
716 */
717 if (len > bsize - EXT4_FC_TAG_BASE_LEN)
718 return NULL;
719
720 if (!sbi->s_fc_bh) {
721 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
722 if (ret)
723 return NULL;
724 sbi->s_fc_bh = bh;
725 }
726 dst = sbi->s_fc_bh->b_data + off;
727
728 /*
729 * Allocate the bytes in the current block if we can do so while still
730 * leaving enough space for a PAD tlv.
731 */
732 remaining = bsize - EXT4_FC_TAG_BASE_LEN - off;
733 if (len <= remaining) {
734 sbi->s_fc_bytes += len;
735 return dst;
736 }
737
738 /*
739 * Else, terminate the current block with a PAD tlv, then allocate a new
740 * block and allocate the bytes at the start of that new block.
741 */
742
743 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
744 tl.fc_len = cpu_to_le16(remaining);
745 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
746 memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
747 *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
748
749 ext4_fc_submit_bh(sb, false);
750
751 ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
752 if (ret)
753 return NULL;
754 sbi->s_fc_bh = bh;
755 sbi->s_fc_bytes += bsize - off + len;
756 return sbi->s_fc_bh->b_data;
757}
758
759/*
760 * Complete a fast commit by writing tail tag.
761 *
762 * Writing tail tag marks the end of a fast commit. In order to guarantee
763 * atomicity, after writing tail tag, even if there's space remaining
764 * in the block, next commit shouldn't use it. That's why tail tag
765 * has the length as that of the remaining space on the block.
766 */
767static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
768{
769 struct ext4_sb_info *sbi = EXT4_SB(sb);
770 struct ext4_fc_tl tl;
771 struct ext4_fc_tail tail;
772 int off, bsize = sbi->s_journal->j_blocksize;
773 u8 *dst;
774
775 /*
776 * ext4_fc_reserve_space takes care of allocating an extra block if
777 * there's no enough space on this block for accommodating this tail.
778 */
779 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc);
780 if (!dst)
781 return -ENOSPC;
782
783 off = sbi->s_fc_bytes % bsize;
784
785 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
786 tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail));
787 sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
788
789 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
790 dst += EXT4_FC_TAG_BASE_LEN;
791 tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
792 memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
793 dst += sizeof(tail.fc_tid);
794 crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
795 dst - (u8 *)sbi->s_fc_bh->b_data);
796 tail.fc_crc = cpu_to_le32(crc);
797 memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
798 dst += sizeof(tail.fc_crc);
799 memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */
800
801 ext4_fc_submit_bh(sb, true);
802
803 return 0;
804}
805
806/*
807 * Adds tag, length, value and updates CRC. Returns true if tlv was added.
808 * Returns false if there's not enough space.
809 */
810static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
811 u32 *crc)
812{
813 struct ext4_fc_tl tl;
814 u8 *dst;
815
816 dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc);
817 if (!dst)
818 return false;
819
820 tl.fc_tag = cpu_to_le16(tag);
821 tl.fc_len = cpu_to_le16(len);
822
823 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
824 memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len);
825
826 return true;
827}
828
829/* Same as above, but adds dentry tlv. */
830static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
831 struct ext4_fc_dentry_update *fc_dentry)
832{
833 struct ext4_fc_dentry_info fcd;
834 struct ext4_fc_tl tl;
835 int dlen = fc_dentry->fcd_name.len;
836 u8 *dst = ext4_fc_reserve_space(sb,
837 EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc);
838
839 if (!dst)
840 return false;
841
842 fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent);
843 fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino);
844 tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op);
845 tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
846 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
847 dst += EXT4_FC_TAG_BASE_LEN;
848 memcpy(dst, &fcd, sizeof(fcd));
849 dst += sizeof(fcd);
850 memcpy(dst, fc_dentry->fcd_name.name, dlen);
851
852 return true;
853}
854
855/*
856 * Writes inode in the fast commit space under TLV with tag @tag.
857 * Returns 0 on success, error on failure.
858 */
859static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
860{
861 struct ext4_inode_info *ei = EXT4_I(inode);
862 int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
863 int ret;
864 struct ext4_iloc iloc;
865 struct ext4_fc_inode fc_inode;
866 struct ext4_fc_tl tl;
867 u8 *dst;
868
869 ret = ext4_get_inode_loc(inode, &iloc);
870 if (ret)
871 return ret;
872
873 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
874 inode_len = EXT4_INODE_SIZE(inode->i_sb);
875 else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
876 inode_len += ei->i_extra_isize;
877
878 fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
879 tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
880 tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
881
882 ret = -ECANCELED;
883 dst = ext4_fc_reserve_space(inode->i_sb,
884 EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc);
885 if (!dst)
886 goto err;
887
888 memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
889 dst += EXT4_FC_TAG_BASE_LEN;
890 memcpy(dst, &fc_inode, sizeof(fc_inode));
891 dst += sizeof(fc_inode);
892 memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
893 ret = 0;
894err:
895 brelse(iloc.bh);
896 return ret;
897}
898
899/*
900 * Writes updated data ranges for the inode in question. Updates CRC.
901 * Returns 0 on success, error otherwise.
902 */
903static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
904{
905 ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
906 struct ext4_inode_info *ei = EXT4_I(inode);
907 struct ext4_map_blocks map;
908 struct ext4_fc_add_range fc_ext;
909 struct ext4_fc_del_range lrange;
910 struct ext4_extent *ex;
911 int ret;
912
913 mutex_lock(&ei->i_fc_lock);
914 if (ei->i_fc_lblk_len == 0) {
915 mutex_unlock(&ei->i_fc_lock);
916 return 0;
917 }
918 old_blk_size = ei->i_fc_lblk_start;
919 new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
920 ei->i_fc_lblk_len = 0;
921 mutex_unlock(&ei->i_fc_lock);
922
923 cur_lblk_off = old_blk_size;
924 ext4_debug("will try writing %d to %d for inode %ld\n",
925 cur_lblk_off, new_blk_size, inode->i_ino);
926
927 while (cur_lblk_off <= new_blk_size) {
928 map.m_lblk = cur_lblk_off;
929 map.m_len = new_blk_size - cur_lblk_off + 1;
930 ret = ext4_map_blocks(NULL, inode, &map, 0);
931 if (ret < 0)
932 return -ECANCELED;
933
934 if (map.m_len == 0) {
935 cur_lblk_off++;
936 continue;
937 }
938
939 if (ret == 0) {
940 lrange.fc_ino = cpu_to_le32(inode->i_ino);
941 lrange.fc_lblk = cpu_to_le32(map.m_lblk);
942 lrange.fc_len = cpu_to_le32(map.m_len);
943 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
944 sizeof(lrange), (u8 *)&lrange, crc))
945 return -ENOSPC;
946 } else {
947 unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
948 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
949
950 /* Limit the number of blocks in one extent */
951 map.m_len = min(max, map.m_len);
952
953 fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
954 ex = (struct ext4_extent *)&fc_ext.fc_ex;
955 ex->ee_block = cpu_to_le32(map.m_lblk);
956 ex->ee_len = cpu_to_le16(map.m_len);
957 ext4_ext_store_pblock(ex, map.m_pblk);
958 if (map.m_flags & EXT4_MAP_UNWRITTEN)
959 ext4_ext_mark_unwritten(ex);
960 else
961 ext4_ext_mark_initialized(ex);
962 if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
963 sizeof(fc_ext), (u8 *)&fc_ext, crc))
964 return -ENOSPC;
965 }
966
967 cur_lblk_off += map.m_len;
968 }
969
970 return 0;
971}
972
973
974/* Submit data for all the fast commit inodes */
975static int ext4_fc_submit_inode_data_all(journal_t *journal)
976{
977 struct super_block *sb = journal->j_private;
978 struct ext4_sb_info *sbi = EXT4_SB(sb);
979 struct ext4_inode_info *ei;
980 int ret = 0;
981
982 spin_lock(&sbi->s_fc_lock);
983 list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
984 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
985 while (atomic_read(&ei->i_fc_updates)) {
986 DEFINE_WAIT(wait);
987
988 prepare_to_wait(&ei->i_fc_wait, &wait,
989 TASK_UNINTERRUPTIBLE);
990 if (atomic_read(&ei->i_fc_updates)) {
991 spin_unlock(&sbi->s_fc_lock);
992 schedule();
993 spin_lock(&sbi->s_fc_lock);
994 }
995 finish_wait(&ei->i_fc_wait, &wait);
996 }
997 spin_unlock(&sbi->s_fc_lock);
998 ret = jbd2_submit_inode_data(journal, ei->jinode);
999 if (ret)
1000 return ret;
1001 spin_lock(&sbi->s_fc_lock);
1002 }
1003 spin_unlock(&sbi->s_fc_lock);
1004
1005 return ret;
1006}
1007
1008/* Wait for completion of data for all the fast commit inodes */
1009static int ext4_fc_wait_inode_data_all(journal_t *journal)
1010{
1011 struct super_block *sb = journal->j_private;
1012 struct ext4_sb_info *sbi = EXT4_SB(sb);
1013 struct ext4_inode_info *pos, *n;
1014 int ret = 0;
1015
1016 spin_lock(&sbi->s_fc_lock);
1017 list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1018 if (!ext4_test_inode_state(&pos->vfs_inode,
1019 EXT4_STATE_FC_COMMITTING))
1020 continue;
1021 spin_unlock(&sbi->s_fc_lock);
1022
1023 ret = jbd2_wait_inode_data(journal, pos->jinode);
1024 if (ret)
1025 return ret;
1026 spin_lock(&sbi->s_fc_lock);
1027 }
1028 spin_unlock(&sbi->s_fc_lock);
1029
1030 return 0;
1031}
1032
1033/* Commit all the directory entry updates */
1034static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
1035__acquires(&sbi->s_fc_lock)
1036__releases(&sbi->s_fc_lock)
1037{
1038 struct super_block *sb = journal->j_private;
1039 struct ext4_sb_info *sbi = EXT4_SB(sb);
1040 struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n;
1041 struct inode *inode;
1042 struct ext4_inode_info *ei;
1043 int ret;
1044
1045 if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
1046 return 0;
1047 list_for_each_entry_safe(fc_dentry, fc_dentry_n,
1048 &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
1049 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
1050 spin_unlock(&sbi->s_fc_lock);
1051 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1052 ret = -ENOSPC;
1053 goto lock_and_exit;
1054 }
1055 spin_lock(&sbi->s_fc_lock);
1056 continue;
1057 }
1058 /*
1059 * With fcd_dilist we need not loop in sbi->s_fc_q to get the
1060 * corresponding inode pointer
1061 */
1062 WARN_ON(list_empty(&fc_dentry->fcd_dilist));
1063 ei = list_first_entry(&fc_dentry->fcd_dilist,
1064 struct ext4_inode_info, i_fc_dilist);
1065 inode = &ei->vfs_inode;
1066 WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
1067
1068 spin_unlock(&sbi->s_fc_lock);
1069
1070 /*
1071 * We first write the inode and then the create dirent. This
1072 * allows the recovery code to create an unnamed inode first
1073 * and then link it to a directory entry. This allows us
1074 * to use namei.c routines almost as is and simplifies
1075 * the recovery code.
1076 */
1077 ret = ext4_fc_write_inode(inode, crc);
1078 if (ret)
1079 goto lock_and_exit;
1080
1081 ret = ext4_fc_write_inode_data(inode, crc);
1082 if (ret)
1083 goto lock_and_exit;
1084
1085 if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
1086 ret = -ENOSPC;
1087 goto lock_and_exit;
1088 }
1089
1090 spin_lock(&sbi->s_fc_lock);
1091 }
1092 return 0;
1093lock_and_exit:
1094 spin_lock(&sbi->s_fc_lock);
1095 return ret;
1096}
1097
1098static int ext4_fc_perform_commit(journal_t *journal)
1099{
1100 struct super_block *sb = journal->j_private;
1101 struct ext4_sb_info *sbi = EXT4_SB(sb);
1102 struct ext4_inode_info *iter;
1103 struct ext4_fc_head head;
1104 struct inode *inode;
1105 struct blk_plug plug;
1106 int ret = 0;
1107 u32 crc = 0;
1108
1109 ret = ext4_fc_submit_inode_data_all(journal);
1110 if (ret)
1111 return ret;
1112
1113 ret = ext4_fc_wait_inode_data_all(journal);
1114 if (ret)
1115 return ret;
1116
1117 /*
1118 * If file system device is different from journal device, issue a cache
1119 * flush before we start writing fast commit blocks.
1120 */
1121 if (journal->j_fs_dev != journal->j_dev)
1122 blkdev_issue_flush(journal->j_fs_dev);
1123
1124 blk_start_plug(&plug);
1125 if (sbi->s_fc_bytes == 0) {
1126 /*
1127 * Add a head tag only if this is the first fast commit
1128 * in this TID.
1129 */
1130 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1131 head.fc_tid = cpu_to_le32(
1132 sbi->s_journal->j_running_transaction->t_tid);
1133 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1134 (u8 *)&head, &crc)) {
1135 ret = -ENOSPC;
1136 goto out;
1137 }
1138 }
1139
1140 spin_lock(&sbi->s_fc_lock);
1141 ret = ext4_fc_commit_dentry_updates(journal, &crc);
1142 if (ret) {
1143 spin_unlock(&sbi->s_fc_lock);
1144 goto out;
1145 }
1146
1147 list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
1148 inode = &iter->vfs_inode;
1149 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1150 continue;
1151
1152 spin_unlock(&sbi->s_fc_lock);
1153 ret = ext4_fc_write_inode_data(inode, &crc);
1154 if (ret)
1155 goto out;
1156 ret = ext4_fc_write_inode(inode, &crc);
1157 if (ret)
1158 goto out;
1159 spin_lock(&sbi->s_fc_lock);
1160 }
1161 spin_unlock(&sbi->s_fc_lock);
1162
1163 ret = ext4_fc_write_tail(sb, crc);
1164
1165out:
1166 blk_finish_plug(&plug);
1167 return ret;
1168}
1169
1170static void ext4_fc_update_stats(struct super_block *sb, int status,
1171 u64 commit_time, int nblks, tid_t commit_tid)
1172{
1173 struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats;
1174
1175 ext4_debug("Fast commit ended with status = %d for tid %u",
1176 status, commit_tid);
1177 if (status == EXT4_FC_STATUS_OK) {
1178 stats->fc_num_commits++;
1179 stats->fc_numblks += nblks;
1180 if (likely(stats->s_fc_avg_commit_time))
1181 stats->s_fc_avg_commit_time =
1182 (commit_time +
1183 stats->s_fc_avg_commit_time * 3) / 4;
1184 else
1185 stats->s_fc_avg_commit_time = commit_time;
1186 } else if (status == EXT4_FC_STATUS_FAILED ||
1187 status == EXT4_FC_STATUS_INELIGIBLE) {
1188 if (status == EXT4_FC_STATUS_FAILED)
1189 stats->fc_failed_commits++;
1190 stats->fc_ineligible_commits++;
1191 } else {
1192 stats->fc_skipped_commits++;
1193 }
1194 trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid);
1195}
1196
1197/*
1198 * The main commit entry point. Performs a fast commit for transaction
1199 * commit_tid if needed. If it's not possible to perform a fast commit
1200 * due to various reasons, we fall back to full commit. Returns 0
1201 * on success, error otherwise.
1202 */
1203int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1204{
1205 struct super_block *sb = journal->j_private;
1206 struct ext4_sb_info *sbi = EXT4_SB(sb);
1207 int nblks = 0, ret, bsize = journal->j_blocksize;
1208 int subtid = atomic_read(&sbi->s_fc_subtid);
1209 int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
1210 ktime_t start_time, commit_time;
1211
1212 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
1213 return jbd2_complete_transaction(journal, commit_tid);
1214
1215 trace_ext4_fc_commit_start(sb, commit_tid);
1216
1217 start_time = ktime_get();
1218
1219restart_fc:
1220 ret = jbd2_fc_begin_commit(journal, commit_tid);
1221 if (ret == -EALREADY) {
1222 /* There was an ongoing commit, check if we need to restart */
1223 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1224 tid_gt(commit_tid, journal->j_commit_sequence))
1225 goto restart_fc;
1226 ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
1227 commit_tid);
1228 return 0;
1229 } else if (ret) {
1230 /*
1231 * Commit couldn't start. Just update stats and perform a
1232 * full commit.
1233 */
1234 ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0,
1235 commit_tid);
1236 return jbd2_complete_transaction(journal, commit_tid);
1237 }
1238
1239 /*
1240 * After establishing journal barrier via jbd2_fc_begin_commit(), check
1241 * if we are fast commit ineligible.
1242 */
1243 if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) {
1244 status = EXT4_FC_STATUS_INELIGIBLE;
1245 goto fallback;
1246 }
1247
1248 fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1249 ret = ext4_fc_perform_commit(journal);
1250 if (ret < 0) {
1251 status = EXT4_FC_STATUS_FAILED;
1252 goto fallback;
1253 }
1254 nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1255 ret = jbd2_fc_wait_bufs(journal, nblks);
1256 if (ret < 0) {
1257 status = EXT4_FC_STATUS_FAILED;
1258 goto fallback;
1259 }
1260 atomic_inc(&sbi->s_fc_subtid);
1261 ret = jbd2_fc_end_commit(journal);
1262 /*
1263 * weight the commit time higher than the average time so we
1264 * don't react too strongly to vast changes in the commit time
1265 */
1266 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1267 ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid);
1268 return ret;
1269
1270fallback:
1271 ret = jbd2_fc_end_commit_fallback(journal);
1272 ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
1273 return ret;
1274}
1275
1276/*
1277 * Fast commit cleanup routine. This is called after every fast commit and
1278 * full commit. full is true if we are called after a full commit.
1279 */
1280static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
1281{
1282 struct super_block *sb = journal->j_private;
1283 struct ext4_sb_info *sbi = EXT4_SB(sb);
1284 struct ext4_inode_info *iter, *iter_n;
1285 struct ext4_fc_dentry_update *fc_dentry;
1286
1287 if (full && sbi->s_fc_bh)
1288 sbi->s_fc_bh = NULL;
1289
1290 trace_ext4_fc_cleanup(journal, full, tid);
1291 jbd2_fc_release_bufs(journal);
1292
1293 spin_lock(&sbi->s_fc_lock);
1294 list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
1295 i_fc_list) {
1296 list_del_init(&iter->i_fc_list);
1297 ext4_clear_inode_state(&iter->vfs_inode,
1298 EXT4_STATE_FC_COMMITTING);
1299 if (tid_geq(tid, iter->i_sync_tid)) {
1300 ext4_fc_reset_inode(&iter->vfs_inode);
1301 } else if (full) {
1302 /*
1303 * We are called after a full commit, inode has been
1304 * modified while the commit was running. Re-enqueue
1305 * the inode into STAGING, which will then be splice
1306 * back into MAIN. This cannot happen during
1307 * fastcommit because the journal is locked all the
1308 * time in that case (and tid doesn't increase so
1309 * tid check above isn't reliable).
1310 */
1311 list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
1312 &sbi->s_fc_q[FC_Q_STAGING]);
1313 }
1314 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1315 smp_mb();
1316#if (BITS_PER_LONG < 64)
1317 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1318#else
1319 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1320#endif
1321 }
1322
1323 while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1324 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1325 struct ext4_fc_dentry_update,
1326 fcd_list);
1327 list_del_init(&fc_dentry->fcd_list);
1328 list_del_init(&fc_dentry->fcd_dilist);
1329 spin_unlock(&sbi->s_fc_lock);
1330
1331 if (fc_dentry->fcd_name.name &&
1332 fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1333 kfree(fc_dentry->fcd_name.name);
1334 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1335 spin_lock(&sbi->s_fc_lock);
1336 }
1337
1338 list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1339 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1340 list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1341 &sbi->s_fc_q[FC_Q_MAIN]);
1342
1343 if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
1344 sbi->s_fc_ineligible_tid = 0;
1345 ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1346 }
1347
1348 if (full)
1349 sbi->s_fc_bytes = 0;
1350 spin_unlock(&sbi->s_fc_lock);
1351 trace_ext4_fc_stats(sb);
1352}
1353
1354/* Ext4 Replay Path Routines */
1355
/* Decoded dentry TLV arguments shared by the dentry replay routines */
struct dentry_info_args {
	int parent_ino;		/* inode number of the parent directory */
	int dname_len;		/* length of @dname in bytes */
	int ino;		/* inode number the entry refers to */
	int inode_len;
	char *dname;		/* entry name, points into the TLV value */
};
1361
1362/* Same as struct ext4_fc_tl, but uses native endianness fields */
1363struct ext4_fc_tl_mem {
1364 u16 fc_tag;
1365 u16 fc_len;
1366};
1367
1368static inline void tl_to_darg(struct dentry_info_args *darg,
1369 struct ext4_fc_tl_mem *tl, u8 *val)
1370{
1371 struct ext4_fc_dentry_info fcd;
1372
1373 memcpy(&fcd, val, sizeof(fcd));
1374
1375 darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1376 darg->ino = le32_to_cpu(fcd.fc_ino);
1377 darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1378 darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info);
1379}
1380
1381static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val)
1382{
1383 struct ext4_fc_tl tl_disk;
1384
1385 memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN);
1386 tl->fc_len = le16_to_cpu(tl_disk.fc_len);
1387 tl->fc_tag = le16_to_cpu(tl_disk.fc_tag);
1388}
1389
1390/* Unlink replay function */
1391static int ext4_fc_replay_unlink(struct super_block *sb,
1392 struct ext4_fc_tl_mem *tl, u8 *val)
1393{
1394 struct inode *inode, *old_parent;
1395 struct qstr entry;
1396 struct dentry_info_args darg;
1397 int ret = 0;
1398
1399 tl_to_darg(&darg, tl, val);
1400
1401 trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1402 darg.parent_ino, darg.dname_len);
1403
1404 entry.name = darg.dname;
1405 entry.len = darg.dname_len;
1406 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1407
1408 if (IS_ERR(inode)) {
1409 ext4_debug("Inode %d not found", darg.ino);
1410 return 0;
1411 }
1412
1413 old_parent = ext4_iget(sb, darg.parent_ino,
1414 EXT4_IGET_NORMAL);
1415 if (IS_ERR(old_parent)) {
1416 ext4_debug("Dir with inode %d not found", darg.parent_ino);
1417 iput(inode);
1418 return 0;
1419 }
1420
1421 ret = __ext4_unlink(old_parent, &entry, inode, NULL);
1422 /* -ENOENT ok coz it might not exist anymore. */
1423 if (ret == -ENOENT)
1424 ret = 0;
1425 iput(old_parent);
1426 iput(inode);
1427 return ret;
1428}
1429
1430static int ext4_fc_replay_link_internal(struct super_block *sb,
1431 struct dentry_info_args *darg,
1432 struct inode *inode)
1433{
1434 struct inode *dir = NULL;
1435 struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1436 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1437 int ret = 0;
1438
1439 dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1440 if (IS_ERR(dir)) {
1441 ext4_debug("Dir with inode %d not found.", darg->parent_ino);
1442 dir = NULL;
1443 goto out;
1444 }
1445
1446 dentry_dir = d_obtain_alias(dir);
1447 if (IS_ERR(dentry_dir)) {
1448 ext4_debug("Failed to obtain dentry");
1449 dentry_dir = NULL;
1450 goto out;
1451 }
1452
1453 dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1454 if (!dentry_inode) {
1455 ext4_debug("Inode dentry not created.");
1456 ret = -ENOMEM;
1457 goto out;
1458 }
1459
1460 ret = __ext4_link(dir, inode, dentry_inode);
1461 /*
1462 * It's possible that link already existed since data blocks
1463 * for the dir in question got persisted before we crashed OR
1464 * we replayed this tag and crashed before the entire replay
1465 * could complete.
1466 */
1467 if (ret && ret != -EEXIST) {
1468 ext4_debug("Failed to link\n");
1469 goto out;
1470 }
1471
1472 ret = 0;
1473out:
1474 if (dentry_dir) {
1475 d_drop(dentry_dir);
1476 dput(dentry_dir);
1477 } else if (dir) {
1478 iput(dir);
1479 }
1480 if (dentry_inode) {
1481 d_drop(dentry_inode);
1482 dput(dentry_inode);
1483 }
1484
1485 return ret;
1486}
1487
1488/* Link replay function */
1489static int ext4_fc_replay_link(struct super_block *sb,
1490 struct ext4_fc_tl_mem *tl, u8 *val)
1491{
1492 struct inode *inode;
1493 struct dentry_info_args darg;
1494 int ret = 0;
1495
1496 tl_to_darg(&darg, tl, val);
1497 trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1498 darg.parent_ino, darg.dname_len);
1499
1500 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1501 if (IS_ERR(inode)) {
1502 ext4_debug("Inode not found.");
1503 return 0;
1504 }
1505
1506 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1507 iput(inode);
1508 return ret;
1509}
1510
1511/*
1512 * Record all the modified inodes during replay. We use this later to setup
1513 * block bitmaps correctly.
1514 */
1515static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1516{
1517 struct ext4_fc_replay_state *state;
1518 int i;
1519
1520 state = &EXT4_SB(sb)->s_fc_replay_state;
1521 for (i = 0; i < state->fc_modified_inodes_used; i++)
1522 if (state->fc_modified_inodes[i] == ino)
1523 return 0;
1524 if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1525 int *fc_modified_inodes;
1526
1527 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1528 sizeof(int) * (state->fc_modified_inodes_size +
1529 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1530 GFP_KERNEL);
1531 if (!fc_modified_inodes)
1532 return -ENOMEM;
1533 state->fc_modified_inodes = fc_modified_inodes;
1534 state->fc_modified_inodes_size +=
1535 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1536 }
1537 state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1538 return 0;
1539}
1540
1541/*
1542 * Inode replay function
1543 */
1544static int ext4_fc_replay_inode(struct super_block *sb,
1545 struct ext4_fc_tl_mem *tl, u8 *val)
1546{
1547 struct ext4_fc_inode fc_inode;
1548 struct ext4_inode *raw_inode;
1549 struct ext4_inode *raw_fc_inode;
1550 struct inode *inode = NULL;
1551 struct ext4_iloc iloc;
1552 int inode_len, ino, ret, tag = tl->fc_tag;
1553 struct ext4_extent_header *eh;
1554 size_t off_gen = offsetof(struct ext4_inode, i_generation);
1555
1556 memcpy(&fc_inode, val, sizeof(fc_inode));
1557
1558 ino = le32_to_cpu(fc_inode.fc_ino);
1559 trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1560
1561 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1562 if (!IS_ERR(inode)) {
1563 ext4_ext_clear_bb(inode);
1564 iput(inode);
1565 }
1566 inode = NULL;
1567
1568 ret = ext4_fc_record_modified_inode(sb, ino);
1569 if (ret)
1570 goto out;
1571
1572 raw_fc_inode = (struct ext4_inode *)
1573 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1574 ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1575 if (ret)
1576 goto out;
1577
1578 inode_len = tl->fc_len - sizeof(struct ext4_fc_inode);
1579 raw_inode = ext4_raw_inode(&iloc);
1580
1581 memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1582 memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen,
1583 inode_len - off_gen);
1584 if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1585 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1586 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1587 memset(eh, 0, sizeof(*eh));
1588 eh->eh_magic = EXT4_EXT_MAGIC;
1589 eh->eh_max = cpu_to_le16(
1590 (sizeof(raw_inode->i_block) -
1591 sizeof(struct ext4_extent_header))
1592 / sizeof(struct ext4_extent));
1593 }
1594 } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1595 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1596 sizeof(raw_inode->i_block));
1597 }
1598
1599 /* Immediately update the inode on disk. */
1600 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1601 if (ret)
1602 goto out;
1603 ret = sync_dirty_buffer(iloc.bh);
1604 if (ret)
1605 goto out;
1606 ret = ext4_mark_inode_used(sb, ino);
1607 if (ret)
1608 goto out;
1609
1610 /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1611 inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1612 if (IS_ERR(inode)) {
1613 ext4_debug("Inode not found.");
1614 return -EFSCORRUPTED;
1615 }
1616
1617 /*
1618 * Our allocator could have made different decisions than before
1619 * crashing. This should be fixed but until then, we calculate
1620 * the number of blocks the inode.
1621 */
1622 if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
1623 ext4_ext_replay_set_iblocks(inode);
1624
1625 inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1626 ext4_reset_inode_seed(inode);
1627
1628 ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1629 ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1630 sync_dirty_buffer(iloc.bh);
1631 brelse(iloc.bh);
1632out:
1633 iput(inode);
1634 if (!ret)
1635 blkdev_issue_flush(sb->s_bdev);
1636
1637 return 0;
1638}
1639
1640/*
1641 * Dentry create replay function.
1642 *
1643 * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the
1644 * inode for which we are trying to create a dentry here, should already have
1645 * been replayed before we start here.
1646 */
1647static int ext4_fc_replay_create(struct super_block *sb,
1648 struct ext4_fc_tl_mem *tl, u8 *val)
1649{
1650 int ret = 0;
1651 struct inode *inode = NULL;
1652 struct inode *dir = NULL;
1653 struct dentry_info_args darg;
1654
1655 tl_to_darg(&darg, tl, val);
1656
1657 trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1658 darg.parent_ino, darg.dname_len);
1659
1660 /* This takes care of update group descriptor and other metadata */
1661 ret = ext4_mark_inode_used(sb, darg.ino);
1662 if (ret)
1663 goto out;
1664
1665 inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1666 if (IS_ERR(inode)) {
1667 ext4_debug("inode %d not found.", darg.ino);
1668 inode = NULL;
1669 ret = -EINVAL;
1670 goto out;
1671 }
1672
1673 if (S_ISDIR(inode->i_mode)) {
1674 /*
1675 * If we are creating a directory, we need to make sure that the
1676 * dot and dot dot dirents are setup properly.
1677 */
1678 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1679 if (IS_ERR(dir)) {
1680 ext4_debug("Dir %d not found.", darg.ino);
1681 goto out;
1682 }
1683 ret = ext4_init_new_dir(NULL, dir, inode);
1684 iput(dir);
1685 if (ret) {
1686 ret = 0;
1687 goto out;
1688 }
1689 }
1690 ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1691 if (ret)
1692 goto out;
1693 set_nlink(inode, 1);
1694 ext4_mark_inode_dirty(NULL, inode);
1695out:
1696 iput(inode);
1697 return ret;
1698}
1699
1700/*
1701 * Record physical disk regions which are in use as per fast commit area,
1702 * and used by inodes during replay phase. Our simple replay phase
1703 * allocator excludes these regions from allocation.
1704 */
1705int ext4_fc_record_regions(struct super_block *sb, int ino,
1706 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1707{
1708 struct ext4_fc_replay_state *state;
1709 struct ext4_fc_alloc_region *region;
1710
1711 state = &EXT4_SB(sb)->s_fc_replay_state;
1712 /*
1713 * during replay phase, the fc_regions_valid may not same as
1714 * fc_regions_used, update it when do new additions.
1715 */
1716 if (replay && state->fc_regions_used != state->fc_regions_valid)
1717 state->fc_regions_used = state->fc_regions_valid;
1718 if (state->fc_regions_used == state->fc_regions_size) {
1719 struct ext4_fc_alloc_region *fc_regions;
1720
1721 fc_regions = krealloc(state->fc_regions,
1722 sizeof(struct ext4_fc_alloc_region) *
1723 (state->fc_regions_size +
1724 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1725 GFP_KERNEL);
1726 if (!fc_regions)
1727 return -ENOMEM;
1728 state->fc_regions_size +=
1729 EXT4_FC_REPLAY_REALLOC_INCREMENT;
1730 state->fc_regions = fc_regions;
1731 }
1732 region = &state->fc_regions[state->fc_regions_used++];
1733 region->ino = ino;
1734 region->lblk = lblk;
1735 region->pblk = pblk;
1736 region->len = len;
1737
1738 if (replay)
1739 state->fc_regions_valid++;
1740
1741 return 0;
1742}
1743
1744/* Replay add range tag */
1745static int ext4_fc_replay_add_range(struct super_block *sb,
1746 struct ext4_fc_tl_mem *tl, u8 *val)
1747{
1748 struct ext4_fc_add_range fc_add_ex;
1749 struct ext4_extent newex, *ex;
1750 struct inode *inode;
1751 ext4_lblk_t start, cur;
1752 int remaining, len;
1753 ext4_fsblk_t start_pblk;
1754 struct ext4_map_blocks map;
1755 struct ext4_ext_path *path = NULL;
1756 int ret;
1757
1758 memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1759 ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1760
1761 trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1762 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1763 ext4_ext_get_actual_len(ex));
1764
1765 inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1766 if (IS_ERR(inode)) {
1767 ext4_debug("Inode not found.");
1768 return 0;
1769 }
1770
1771 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1772 if (ret)
1773 goto out;
1774
1775 start = le32_to_cpu(ex->ee_block);
1776 start_pblk = ext4_ext_pblock(ex);
1777 len = ext4_ext_get_actual_len(ex);
1778
1779 cur = start;
1780 remaining = len;
1781 ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1782 start, start_pblk, len, ext4_ext_is_unwritten(ex),
1783 inode->i_ino);
1784
1785 while (remaining > 0) {
1786 map.m_lblk = cur;
1787 map.m_len = remaining;
1788 map.m_pblk = 0;
1789 ret = ext4_map_blocks(NULL, inode, &map, 0);
1790
1791 if (ret < 0)
1792 goto out;
1793
1794 if (ret == 0) {
1795 /* Range is not mapped */
1796 path = ext4_find_extent(inode, cur, path, 0);
1797 if (IS_ERR(path))
1798 goto out;
1799 memset(&newex, 0, sizeof(newex));
1800 newex.ee_block = cpu_to_le32(cur);
1801 ext4_ext_store_pblock(
1802 &newex, start_pblk + cur - start);
1803 newex.ee_len = cpu_to_le16(map.m_len);
1804 if (ext4_ext_is_unwritten(ex))
1805 ext4_ext_mark_unwritten(&newex);
1806 down_write(&EXT4_I(inode)->i_data_sem);
1807 path = ext4_ext_insert_extent(NULL, inode,
1808 path, &newex, 0);
1809 up_write((&EXT4_I(inode)->i_data_sem));
1810 if (IS_ERR(path))
1811 goto out;
1812 goto next;
1813 }
1814
1815 if (start_pblk + cur - start != map.m_pblk) {
1816 /*
1817 * Logical to physical mapping changed. This can happen
1818 * if this range was removed and then reallocated to
1819 * map to new physical blocks during a fast commit.
1820 */
1821 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1822 ext4_ext_is_unwritten(ex),
1823 start_pblk + cur - start);
1824 if (ret)
1825 goto out;
1826 /*
1827 * Mark the old blocks as free since they aren't used
1828 * anymore. We maintain an array of all the modified
1829 * inodes. In case these blocks are still used at either
1830 * a different logical range in the same inode or in
1831 * some different inode, we will mark them as allocated
1832 * at the end of the FC replay using our array of
1833 * modified inodes.
1834 */
1835 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1836 goto next;
1837 }
1838
1839 /* Range is mapped and needs a state change */
1840 ext4_debug("Converting from %ld to %d %lld",
1841 map.m_flags & EXT4_MAP_UNWRITTEN,
1842 ext4_ext_is_unwritten(ex), map.m_pblk);
1843 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1844 ext4_ext_is_unwritten(ex), map.m_pblk);
1845 if (ret)
1846 goto out;
1847 /*
1848 * We may have split the extent tree while toggling the state.
1849 * Try to shrink the extent tree now.
1850 */
1851 ext4_ext_replay_shrink_inode(inode, start + len);
1852next:
1853 cur += map.m_len;
1854 remaining -= map.m_len;
1855 }
1856 ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1857 sb->s_blocksize_bits);
1858out:
1859 ext4_free_ext_path(path);
1860 iput(inode);
1861 return 0;
1862}
1863
1864/* Replay DEL_RANGE tag */
1865static int
1866ext4_fc_replay_del_range(struct super_block *sb,
1867 struct ext4_fc_tl_mem *tl, u8 *val)
1868{
1869 struct inode *inode;
1870 struct ext4_fc_del_range lrange;
1871 struct ext4_map_blocks map;
1872 ext4_lblk_t cur, remaining;
1873 int ret;
1874
1875 memcpy(&lrange, val, sizeof(lrange));
1876 cur = le32_to_cpu(lrange.fc_lblk);
1877 remaining = le32_to_cpu(lrange.fc_len);
1878
1879 trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1880 le32_to_cpu(lrange.fc_ino), cur, remaining);
1881
1882 inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1883 if (IS_ERR(inode)) {
1884 ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino));
1885 return 0;
1886 }
1887
1888 ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1889 if (ret)
1890 goto out;
1891
1892 ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n",
1893 inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1894 le32_to_cpu(lrange.fc_len));
1895 while (remaining > 0) {
1896 map.m_lblk = cur;
1897 map.m_len = remaining;
1898
1899 ret = ext4_map_blocks(NULL, inode, &map, 0);
1900 if (ret < 0)
1901 goto out;
1902 if (ret > 0) {
1903 remaining -= ret;
1904 cur += ret;
1905 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false);
1906 } else {
1907 remaining -= map.m_len;
1908 cur += map.m_len;
1909 }
1910 }
1911
1912 down_write(&EXT4_I(inode)->i_data_sem);
1913 ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1914 le32_to_cpu(lrange.fc_lblk) +
1915 le32_to_cpu(lrange.fc_len) - 1);
1916 up_write(&EXT4_I(inode)->i_data_sem);
1917 if (ret)
1918 goto out;
1919 ext4_ext_replay_shrink_inode(inode,
1920 i_size_read(inode) >> sb->s_blocksize_bits);
1921 ext4_mark_inode_dirty(NULL, inode);
1922out:
1923 iput(inode);
1924 return 0;
1925}
1926
1927static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1928{
1929 struct ext4_fc_replay_state *state;
1930 struct inode *inode;
1931 struct ext4_ext_path *path = NULL;
1932 struct ext4_map_blocks map;
1933 int i, ret, j;
1934 ext4_lblk_t cur, end;
1935
1936 state = &EXT4_SB(sb)->s_fc_replay_state;
1937 for (i = 0; i < state->fc_modified_inodes_used; i++) {
1938 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1939 EXT4_IGET_NORMAL);
1940 if (IS_ERR(inode)) {
1941 ext4_debug("Inode %d not found.",
1942 state->fc_modified_inodes[i]);
1943 continue;
1944 }
1945 cur = 0;
1946 end = EXT_MAX_BLOCKS;
1947 if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) {
1948 iput(inode);
1949 continue;
1950 }
1951 while (cur < end) {
1952 map.m_lblk = cur;
1953 map.m_len = end - cur;
1954
1955 ret = ext4_map_blocks(NULL, inode, &map, 0);
1956 if (ret < 0)
1957 break;
1958
1959 if (ret > 0) {
1960 path = ext4_find_extent(inode, map.m_lblk, path, 0);
1961 if (!IS_ERR(path)) {
1962 for (j = 0; j < path->p_depth; j++)
1963 ext4_mb_mark_bb(inode->i_sb,
1964 path[j].p_block, 1, true);
1965 } else {
1966 path = NULL;
1967 }
1968 cur += ret;
1969 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1970 map.m_len, true);
1971 } else {
1972 cur = cur + (map.m_len ? map.m_len : 1);
1973 }
1974 }
1975 iput(inode);
1976 }
1977
1978 ext4_free_ext_path(path);
1979}
1980
1981/*
1982 * Check if block is in excluded regions for block allocation. The simple
1983 * allocator that runs during replay phase is calls this function to see
1984 * if it is okay to use a block.
1985 */
1986bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1987{
1988 int i;
1989 struct ext4_fc_replay_state *state;
1990
1991 state = &EXT4_SB(sb)->s_fc_replay_state;
1992 for (i = 0; i < state->fc_regions_valid; i++) {
1993 if (state->fc_regions[i].ino == 0 ||
1994 state->fc_regions[i].len == 0)
1995 continue;
1996 if (in_range(blk, state->fc_regions[i].pblk,
1997 state->fc_regions[i].len))
1998 return true;
1999 }
2000 return false;
2001}
2002
2003/* Cleanup function called after replay */
2004void ext4_fc_replay_cleanup(struct super_block *sb)
2005{
2006 struct ext4_sb_info *sbi = EXT4_SB(sb);
2007
2008 sbi->s_mount_state &= ~EXT4_FC_REPLAY;
2009 kfree(sbi->s_fc_replay_state.fc_regions);
2010 kfree(sbi->s_fc_replay_state.fc_modified_inodes);
2011}
2012
2013static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi,
2014 int tag, int len)
2015{
2016 switch (tag) {
2017 case EXT4_FC_TAG_ADD_RANGE:
2018 return len == sizeof(struct ext4_fc_add_range);
2019 case EXT4_FC_TAG_DEL_RANGE:
2020 return len == sizeof(struct ext4_fc_del_range);
2021 case EXT4_FC_TAG_CREAT:
2022 case EXT4_FC_TAG_LINK:
2023 case EXT4_FC_TAG_UNLINK:
2024 len -= sizeof(struct ext4_fc_dentry_info);
2025 return len >= 1 && len <= EXT4_NAME_LEN;
2026 case EXT4_FC_TAG_INODE:
2027 len -= sizeof(struct ext4_fc_inode);
2028 return len >= EXT4_GOOD_OLD_INODE_SIZE &&
2029 len <= sbi->s_inode_size;
2030 case EXT4_FC_TAG_PAD:
2031 return true; /* padding can have any length */
2032 case EXT4_FC_TAG_TAIL:
2033 return len >= sizeof(struct ext4_fc_tail);
2034 case EXT4_FC_TAG_HEAD:
2035 return len == sizeof(struct ext4_fc_head);
2036 }
2037 return false;
2038}
2039
2040/*
2041 * Recovery Scan phase handler
2042 *
2043 * This function is called during the scan phase and is responsible
2044 * for doing following things:
2045 * - Make sure the fast commit area has valid tags for replay
2046 * - Count number of tags that need to be replayed by the replay handler
2047 * - Verify CRC
2048 * - Create a list of excluded blocks for allocation during replay phase
2049 *
2050 * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
2051 * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
2052 * to indicate that scan has finished and JBD2 can now start replay phase.
2053 * It returns a negative error to indicate that there was an error. At the end
2054 * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
2055 * to indicate the number of tags that need to replayed during the replay phase.
2056 */
2057static int ext4_fc_replay_scan(journal_t *journal,
2058 struct buffer_head *bh, int off,
2059 tid_t expected_tid)
2060{
2061 struct super_block *sb = journal->j_private;
2062 struct ext4_sb_info *sbi = EXT4_SB(sb);
2063 struct ext4_fc_replay_state *state;
2064 int ret = JBD2_FC_REPLAY_CONTINUE;
2065 struct ext4_fc_add_range ext;
2066 struct ext4_fc_tl_mem tl;
2067 struct ext4_fc_tail tail;
2068 __u8 *start, *end, *cur, *val;
2069 struct ext4_fc_head head;
2070 struct ext4_extent *ex;
2071
2072 state = &sbi->s_fc_replay_state;
2073
2074 start = (u8 *)bh->b_data;
2075 end = start + journal->j_blocksize;
2076
2077 if (state->fc_replay_expected_off == 0) {
2078 state->fc_cur_tag = 0;
2079 state->fc_replay_num_tags = 0;
2080 state->fc_crc = 0;
2081 state->fc_regions = NULL;
2082 state->fc_regions_valid = state->fc_regions_used =
2083 state->fc_regions_size = 0;
2084 /* Check if we can stop early */
2085 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
2086 != EXT4_FC_TAG_HEAD)
2087 return 0;
2088 }
2089
2090 if (off != state->fc_replay_expected_off) {
2091 ret = -EFSCORRUPTED;
2092 goto out_err;
2093 }
2094
2095 state->fc_replay_expected_off++;
2096 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2097 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2098 ext4_fc_get_tl(&tl, cur);
2099 val = cur + EXT4_FC_TAG_BASE_LEN;
2100 if (tl.fc_len > end - val ||
2101 !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) {
2102 ret = state->fc_replay_num_tags ?
2103 JBD2_FC_REPLAY_STOP : -ECANCELED;
2104 goto out_err;
2105 }
2106 ext4_debug("Scan phase, tag:%s, blk %lld\n",
2107 tag2str(tl.fc_tag), bh->b_blocknr);
2108 switch (tl.fc_tag) {
2109 case EXT4_FC_TAG_ADD_RANGE:
2110 memcpy(&ext, val, sizeof(ext));
2111 ex = (struct ext4_extent *)&ext.fc_ex;
2112 ret = ext4_fc_record_regions(sb,
2113 le32_to_cpu(ext.fc_ino),
2114 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
2115 ext4_ext_get_actual_len(ex), 0);
2116 if (ret < 0)
2117 break;
2118 ret = JBD2_FC_REPLAY_CONTINUE;
2119 fallthrough;
2120 case EXT4_FC_TAG_DEL_RANGE:
2121 case EXT4_FC_TAG_LINK:
2122 case EXT4_FC_TAG_UNLINK:
2123 case EXT4_FC_TAG_CREAT:
2124 case EXT4_FC_TAG_INODE:
2125 case EXT4_FC_TAG_PAD:
2126 state->fc_cur_tag++;
2127 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2128 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2129 break;
2130 case EXT4_FC_TAG_TAIL:
2131 state->fc_cur_tag++;
2132 memcpy(&tail, val, sizeof(tail));
2133 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2134 EXT4_FC_TAG_BASE_LEN +
2135 offsetof(struct ext4_fc_tail,
2136 fc_crc));
2137 if (le32_to_cpu(tail.fc_tid) == expected_tid &&
2138 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
2139 state->fc_replay_num_tags = state->fc_cur_tag;
2140 state->fc_regions_valid =
2141 state->fc_regions_used;
2142 } else {
2143 ret = state->fc_replay_num_tags ?
2144 JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2145 }
2146 state->fc_crc = 0;
2147 break;
2148 case EXT4_FC_TAG_HEAD:
2149 memcpy(&head, val, sizeof(head));
2150 if (le32_to_cpu(head.fc_features) &
2151 ~EXT4_FC_SUPPORTED_FEATURES) {
2152 ret = -EOPNOTSUPP;
2153 break;
2154 }
2155 if (le32_to_cpu(head.fc_tid) != expected_tid) {
2156 ret = JBD2_FC_REPLAY_STOP;
2157 break;
2158 }
2159 state->fc_cur_tag++;
2160 state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2161 EXT4_FC_TAG_BASE_LEN + tl.fc_len);
2162 break;
2163 default:
2164 ret = state->fc_replay_num_tags ?
2165 JBD2_FC_REPLAY_STOP : -ECANCELED;
2166 }
2167 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2168 break;
2169 }
2170
2171out_err:
2172 trace_ext4_fc_replay_scan(sb, ret, off);
2173 return ret;
2174}
2175
2176/*
2177 * Main recovery path entry point.
2178 * The meaning of return codes is similar as above.
2179 */
2180static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2181 enum passtype pass, int off, tid_t expected_tid)
2182{
2183 struct super_block *sb = journal->j_private;
2184 struct ext4_sb_info *sbi = EXT4_SB(sb);
2185 struct ext4_fc_tl_mem tl;
2186 __u8 *start, *end, *cur, *val;
2187 int ret = JBD2_FC_REPLAY_CONTINUE;
2188 struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2189 struct ext4_fc_tail tail;
2190
2191 if (pass == PASS_SCAN) {
2192 state->fc_current_pass = PASS_SCAN;
2193 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2194 }
2195
2196 if (state->fc_current_pass != pass) {
2197 state->fc_current_pass = pass;
2198 sbi->s_mount_state |= EXT4_FC_REPLAY;
2199 }
2200 if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2201 ext4_debug("Replay stops\n");
2202 ext4_fc_set_bitmaps_and_counters(sb);
2203 return 0;
2204 }
2205
2206#ifdef CONFIG_EXT4_DEBUG
2207 if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2208 pr_warn("Dropping fc block %d because max_replay set\n", off);
2209 return JBD2_FC_REPLAY_STOP;
2210 }
2211#endif
2212
2213 start = (u8 *)bh->b_data;
2214 end = start + journal->j_blocksize;
2215
2216 for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN;
2217 cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) {
2218 ext4_fc_get_tl(&tl, cur);
2219 val = cur + EXT4_FC_TAG_BASE_LEN;
2220
2221 if (state->fc_replay_num_tags == 0) {
2222 ret = JBD2_FC_REPLAY_STOP;
2223 ext4_fc_set_bitmaps_and_counters(sb);
2224 break;
2225 }
2226
2227 ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag));
2228 state->fc_replay_num_tags--;
2229 switch (tl.fc_tag) {
2230 case EXT4_FC_TAG_LINK:
2231 ret = ext4_fc_replay_link(sb, &tl, val);
2232 break;
2233 case EXT4_FC_TAG_UNLINK:
2234 ret = ext4_fc_replay_unlink(sb, &tl, val);
2235 break;
2236 case EXT4_FC_TAG_ADD_RANGE:
2237 ret = ext4_fc_replay_add_range(sb, &tl, val);
2238 break;
2239 case EXT4_FC_TAG_CREAT:
2240 ret = ext4_fc_replay_create(sb, &tl, val);
2241 break;
2242 case EXT4_FC_TAG_DEL_RANGE:
2243 ret = ext4_fc_replay_del_range(sb, &tl, val);
2244 break;
2245 case EXT4_FC_TAG_INODE:
2246 ret = ext4_fc_replay_inode(sb, &tl, val);
2247 break;
2248 case EXT4_FC_TAG_PAD:
2249 trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2250 tl.fc_len, 0);
2251 break;
2252 case EXT4_FC_TAG_TAIL:
2253 trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL,
2254 0, tl.fc_len, 0);
2255 memcpy(&tail, val, sizeof(tail));
2256 WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2257 break;
2258 case EXT4_FC_TAG_HEAD:
2259 break;
2260 default:
2261 trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0);
2262 ret = -ECANCELED;
2263 break;
2264 }
2265 if (ret < 0)
2266 break;
2267 ret = JBD2_FC_REPLAY_CONTINUE;
2268 }
2269 return ret;
2270}
2271
2272void ext4_fc_init(struct super_block *sb, journal_t *journal)
2273{
2274 /*
2275 * We set replay callback even if fast commit disabled because we may
2276 * could still have fast commit blocks that need to be replayed even if
2277 * fast commit has now been turned off.
2278 */
2279 journal->j_fc_replay_callback = ext4_fc_replay;
2280 if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2281 return;
2282 journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2283}
2284
2285static const char * const fc_ineligible_reasons[] = {
2286 [EXT4_FC_REASON_XATTR] = "Extended attributes changed",
2287 [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
2288 [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
2289 [EXT4_FC_REASON_NOMEM] = "Insufficient memory",
2290 [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
2291 [EXT4_FC_REASON_RESIZE] = "Resize",
2292 [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
2293 [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
2294 [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
2295 [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
2296};
2297
2298int ext4_fc_info_show(struct seq_file *seq, void *v)
2299{
2300 struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2301 struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2302 int i;
2303
2304 if (v != SEQ_START_TOKEN)
2305 return 0;
2306
2307 seq_printf(seq,
2308 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2309 stats->fc_num_commits, stats->fc_ineligible_commits,
2310 stats->fc_numblks,
2311 div_u64(stats->s_fc_avg_commit_time, 1000));
2312 seq_puts(seq, "Ineligible reasons:\n");
2313 for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2314 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2315 stats->fc_ineligible_reason_count[i]);
2316
2317 return 0;
2318}
2319
2320int __init ext4_fc_init_dentry_cache(void)
2321{
2322 ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2323 SLAB_RECLAIM_ACCOUNT);
2324
2325 if (ext4_fc_dentry_cachep == NULL)
2326 return -ENOMEM;
2327
2328 return 0;
2329}
2330
2331void ext4_fc_destroy_dentry_cache(void)
2332{
2333 kmem_cache_destroy(ext4_fc_dentry_cachep);
2334}