// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap. In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk. The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked. Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space. Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block. If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be an rmap record and everything is ok. For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal. If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking. We must also invalidate any buffers
 * associated with @bitmap.
 */
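
/*
 * Hedged sketch (added for illustration, not part of the original file) of
 * the collect-and-subtract step described above. The collector helpers
 * named here are hypothetical stand-ins for each repair's rmapbt walk; the
 * bitmap helpers are assumed from scrub/agb_bitmap.h.
 *
 *	struct xagb_bitmap	bitmap;		// candidate old btree blocks
 *	struct xagb_bitmap	sublist;	// same owner, still in use
 *
 *	xagb_bitmap_init(&bitmap);
 *	xagb_bitmap_init(&sublist);
 *	// collect_owner_extents(sc, &bitmap);		(hypothetical)
 *	// collect_in_use_extents(sc, &sublist);	(hypothetical)
 *	// error = xagb_bitmap_disunion(&bitmap, &sublist);
 *	// ...whatever remains in bitmap is handed to the reap functions
 *	// below, then both bitmaps are destroyed.
 */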

/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	/* Reverse mapping owner and metadata reservation type. */
	const struct xfs_owner_info	*oinfo;
	enum xfs_ag_resv_type		resv;

	/* If true, roll the transaction before reaping the next extent. */
	bool				force_roll;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			deferred;

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			invalidated;

	/* Number of deferred reaps queued during the whole reap sequence. */
	unsigned long long		total_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred)
		return true;
	if (rs->invalidated)
		return true;
	if (rs->total_deferred)
		return true;
	return false;
}

#define XREAP_MAX_BINVAL	(2048)

/*
 * Decide if we want to roll the transaction after reaping an extent. We don't
 * want to overrun the transaction reservation, so we prohibit more than
 * 128 EFIs per transaction. For the same reason, we limit the number
 * of buffer invalidations to 2048.
 */
static inline bool xreap_want_roll(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
		return true;
	if (rs->invalidated > XREAP_MAX_BINVAL)
		return true;
	return false;
}

static inline void xreap_reset(struct xreap_state *rs)
{
	rs->total_deferred += rs->deferred;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

#define XREAP_MAX_DEFER_CHAIN	(2048)

/*
 * Decide if we want to finish the deferred ops that are attached to the scrub
 * transaction. We don't want to queue huge chains of deferred ops because
 * that can consume a lot of log space and kernel memory. Hence we trigger an
 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
 * caller did some real work.
 */
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
		return true;
	return false;
}

static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->total_deferred = 0;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	int			max_fsbs;

	/* Remote xattr values are the largest buffers that we support. */
	max_fsbs = xfs_attr3_max_rmt_blocks(mp);

	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}

/*
 * Return an incore buffer from a sector scan, or NULL if there are no buffers
 * left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}
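
/*
 * Minimal usage sketch for the bufscan helpers (added for illustration, not
 * part of the original file). Here pag, agbno, and nr_blocks stand in for
 * the caller's AG and extent; the same pattern appears for real in
 * xreap_agextent_binval() below.
 *
 *	struct xrep_bufscan	scan = {
 *		.daddr		= xfs_agbno_to_daddr(pag, agbno),
 *		.max_sectors	= xrep_bufscan_max_sectors(mp, nr_blocks),
 *		.daddr_step	= XFS_FSB_TO_BB(mp, 1),
 *	};
 *	struct xfs_buf		*bp;
 *
 *	while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
 *		// stale or invalidate bp, then release it
 *	}
 */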

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them. We
	 * assume that the lack of any other known owners means that the buffer
	 * can be locked without risk of deadlocking. The buffer cache cannot
	 * detect aliasing, so employ nested loops to scan for incore buffers
	 * of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
			rs->invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (rs->invalidated > XREAP_MAX_BINVAL) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call. Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed. AGFL blocks can only be put back one at
 * a time.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent. If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this AG extent as possible. The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed. Remove the reverse mapping and move on. Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet). The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

		rs->force_roll = true;

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * If we're unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
			return 0;
		}

		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
				*aglenp, rs->oinfo);
	}

	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno. If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_roll(rs));
		return 0;
	}

	/*
	 * If we're getting rid of CoW staging extents, use deferred work items
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent. We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/* Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/*
	 * Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old blocks.
	 * Add a defer ops barrier every other extent to avoid stressing the
	 * system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	rs->deferred++;
	if (rs->deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub		*sc,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= oinfo,
		.resv		= type,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
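
/*
 * Hedged usage sketch (added for illustration, not part of the original
 * file): a btree repair that has collected the old blocks into an
 * xagb_bitmap might dispose of them like this. The owner info and
 * reservation type are placeholders; a real caller passes whatever matches
 * the btree it just rebuilt.
 *
 *	error = xrep_reap_agblocks(sc, &old_btree_blocks,
 *			&XFS_RMAP_OINFO_AG, XFS_AG_RESV_NONE);
 *	if (error)
 *		return error;
 */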

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately. The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * xrep_defer_finish() holds the AGF buffer across the
			 * deferred chain processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub		*sc,
	struct xfsb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= oinfo,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
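
/*
 * Hedged usage sketch (added for illustration, not part of the original
 * file): a file fork repair might queue the old bmbt blocks in an
 * xfsb_bitmap and reap them with the inode as the rmap owner.
 * xfs_rmap_ino_bmbt_owner() fills out the owner info for bmbt blocks; the
 * bitmap is assumed to have been filled in by the repair code.
 *
 *	struct xfs_owner_info	oinfo;
 *
 *	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
 *	error = xrep_reap_fsblocks(sc, &old_bmbt_blocks, &oinfo);
 *	if (error)
 *		return error;
 */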

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction. This is true for most
 * buffers, but there are two cases that we want to catch: large remote xattr
 * value buffers are not logged and can overflow the buffer log item dirty
 * bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}
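
/*
 * Worked example (added for illustration, assuming the usual constants:
 * XFS_BLF_CHUNK is 128 bytes, NBWORD is 32 bits, and XFS_BLF_DATAMAP_SIZE
 * is sized for a 64k buffer): a remote xattr value buffer of 64k plus one
 * 4k fsblock covers 69632 bytes, which is 544 chunks and therefore needs a
 * 17-word dirty bitmap. A 64k buffer needs only 16 words, so the oversized
 * buffer fails the check above and must be staled instead of logged.
 */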

/*
 * Invalidate any buffers for this file mapping. The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	unsigned int		invalidated = 0;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings. This
	 * means that for each block in the mapping, there could exist an
	 * xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the location
	 * of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them. If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone. The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}
			invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * much of the mapping we've seen so far.
			 */
			if (invalidated > XREAP_MAX_BINVAL) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
	return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap,
	bool				crosslinked)
{
	int				error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked and
		 * must not be freed. Remove the reverse mapping, leave the
		 * buffer cache in its possibly confused state, and move on.
		 * We don't want to risk discarding valid data buffers from
		 * anybody else who thinks they own the block, even though that
		 * runs the risk of stale buffer warnings in the future.
		 */
		trace_xreap_dispose_unmap_extent(sc->sa.pag,
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * Schedule removal of the mapping from the fork. We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent. This is a bit of
	 * a mess since we don't detect discontiguous buffers that are indexed
	 * by a block starting before the first block of the extent but overlap
	 * anyway.
	 */
	trace_xreap_dispose_free_extent(sc->sa.pag,
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping. If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(sc, ip, whichfork, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * Schedule removal of the mapping from the fork. We use deferred log
	 * intents in this function to control the exact sequence of metadata
	 * updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
			XFS_FREE_EXTENT_SKIP_DISCARD);
}

/*
 * Dispose of as much of this file extent as we can. Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap)
{
	xfs_agnumber_t			agno;
	bool				crosslinked;
	int				error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, ip, whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * trim the mapping so that the unmap calls below only operate on
	 * those blocks.
	 */
	error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file. Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
 * must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(sc, ip, whichfork, &imap);
			if (error)
				return error;

			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}
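
/*
 * Hedged usage sketch (added for illustration, not part of the original
 * file): after a repair exchanges good data into the file being repaired,
 * the leftover mappings in the temporary file can be discarded, e.g.:
 *
 *	error = xrep_reap_ifork(sc, sc->tempip, XFS_ATTR_FORK);
 *	if (error)
 *		return error;
 *
 * The caller must already hold ILOCK_EXCL on sc->tempip, per the comment
 * above xrep_reap_ifork().
 */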