   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <djwong@kernel.org>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_trans_resv.h"
  11#include "xfs_mount.h"
  12#include "xfs_defer.h"
  13#include "xfs_btree.h"
  14#include "xfs_btree_staging.h"
  15#include "xfs_buf_mem.h"
  16#include "xfs_btree_mem.h"
  17#include "xfs_bit.h"
  18#include "xfs_log_format.h"
  19#include "xfs_trans.h"
  20#include "xfs_sb.h"
  21#include "xfs_alloc.h"
  22#include "xfs_alloc_btree.h"
  23#include "xfs_ialloc.h"
  24#include "xfs_ialloc_btree.h"
  25#include "xfs_rmap.h"
  26#include "xfs_rmap_btree.h"
  27#include "xfs_inode.h"
  28#include "xfs_icache.h"
  29#include "xfs_bmap.h"
  30#include "xfs_bmap_btree.h"
  31#include "xfs_refcount.h"
  32#include "xfs_refcount_btree.h"
  33#include "xfs_ag.h"
  34#include "scrub/xfs_scrub.h"
  35#include "scrub/scrub.h"
  36#include "scrub/common.h"
  37#include "scrub/btree.h"
  38#include "scrub/trace.h"
  39#include "scrub/repair.h"
  40#include "scrub/bitmap.h"
  41#include "scrub/agb_bitmap.h"
  42#include "scrub/xfile.h"
  43#include "scrub/xfarray.h"
  44#include "scrub/iscan.h"
  45#include "scrub/newbt.h"
  46#include "scrub/reap.h"
  47
  48/*
  49 * Reverse Mapping Btree Repair
  50 * ============================
  51 *
  52 * This is the most involved of all the AG space btree rebuilds.  Everywhere
  53 * else in XFS we lock inodes and then AG data structures, but generating the
  54 * list of rmap records requires that we be able to scan both block mapping
  55 * btrees of every inode in the filesystem to see if it owns any extents in
  56 * this AG.  We can't tolerate any inode updates while we do this, so we
  57 * freeze the filesystem to lock everyone else out, and grant ourselves
  58 * special privileges to run transactions with regular background reclamation
  59 * turned off.
  60 *
  61 * We also have to be very careful not to allow inode reclaim to start a
  62 * transaction because all transactions (other than our own) will block.
  63 * Deferred inode inactivation helps us out there.
  64 *
  65 * I) Reverse mappings for all non-space metadata and file data are collected
  66 * according to the following algorithm:
  67 *
  68 * 1. For each fork of each inode:
  69 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
  70 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
  71 *      bmaps into rmap records (see 1.4).  Set bits in BMBIT for each btree
  72 *      block.
  73 * 1.3. If the incore extent map is loaded but the fork is in btree format,
  74 *      just visit the bmbt blocks to set the corresponding BMBIT areas.
  75 * 1.4. From the incore extent map, accumulate each bmap that falls into our
  76 *      target AG.  Remember, multiple bmap records can map to a single rmap
  77 *      record, so we cannot simply emit rmap records 1:1.
  78 * 1.5. Emit rmap records for each extent in BMBIT and free it.
  79 * 2. Create bitmaps INOBIT and ICHUNKBIT.
  80 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
  81 *    and set bits in INOBIT for each btree block.  If the inobt has no records
  82 *    at all, we must be careful to record its root in INOBIT.
  83 * 4. For each block in the finobt, set the corresponding INOBIT area.
  84 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
  85 * 6. Create bitmaps REFCBIT and COWBIT.
  86 * 7. For each CoW staging extent in the refcountbt, set the corresponding
  87 *    areas in COWBIT.
  88 * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
  89 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
  90 * A. Emit rmap for the AG headers.
  91 * B. Emit rmap for the log, if there is one.
  92 *
  93 * II) The rmapbt shape and space metadata rmaps are computed as follows (see the C sketch after this comment):
  94 *
  95 * 1. Count the rmaps collected in the previous step. (= NR)
  96 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
  97 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
  98 * 4. Create bitmap AGBIT.
  99 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
 100 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
 101 * 7. Count the extents in AGBIT. (= AGNR)
 102 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
 103 * 9. If RMB' > RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
 104 *    and clear AGBIT.  Go to step 5.
 105 * A. Emit rmaps for each extent in AGBIT.
 106 *
 107 * III) The rmapbt is constructed and set in place as follows:
 108 *
 109 * 1. Sort the rmap records.
 110 * 2. Bulk load the rmaps.
 111 *
 112 * IV) Reap the old btree blocks.
 113 *
 114 * 1. Create a bitmap OLDRMBIT.
 115 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
 116 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
 117 * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
 118 *    the parts of the AG that the rmap didn't find during its scan of the
 119 *    primary metadata and aren't known to be in the free space, which implies
 120 *    that they were the old rmapbt blocks.
 121 * 5. Commit.
 122 *
 123 * We use the 'xrep_rmap' prefix for all the rmap functions.
 124 */
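/*
 * A minimal standalone sketch (illustrative only, not from the kernel
 * sources) of the section (II) loop above, assuming only <stdint.h>.  The
 * hypothetical estimate_blocks() stands in for
 * xfs_btree_bload_compute_geometry(), and AGNR is passed in as a fixed
 * count, whereas the real loop rescans the free space metadata on every
 * pass because each reservation can reshape the bnobt and cntbt.
 */
#include <stdint.h>

static inline uint64_t
estimate_blocks(
	uint64_t	nr_records)
{
	/* Toy geometry: assume 100 records fit in each btree block. */
	return (nr_records + 99) / 100;
}

static uint64_t
reserve_until_stable(
	uint64_t	nr_records,
	uint64_t	agnr)
{
	/* Step II.2: blocks needed for the records collected so far. */
	uint64_t	rmb = estimate_blocks(nr_records);

	for (;;) {
		/* Steps II.5-II.8: count OWN_AG rmaps and re-estimate. */
		uint64_t	rmb_new = estimate_blocks(nr_records + agnr);

		/* Step II.9: stop when no more blocks are needed... */
		if (rmb_new <= rmb)
			return rmb;

		/* ...otherwise reserve rmb_new - rmb more blocks and retry. */
		rmb = rmb_new;
	}
}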
 125
 126/* Context for collecting rmaps */
 127struct xrep_rmap {
 128	/* new rmapbt information */
 129	struct xrep_newbt	new_btree;
 130
 131	/* lock for the xfbtree and xfile */
 132	struct mutex		lock;
 133
 134	/* rmap records generated from primary metadata */
 135	struct xfbtree		rmap_btree;
 136
 137	struct xfs_scrub	*sc;
 138
 139	/* in-memory btree cursor for the xfs_btree_bload iteration */
 140	struct xfs_btree_cur	*mcur;
 141
 142	/* Hooks into rmap update code. */
 143	struct xfs_rmap_hook	rhook;
 144
 145	/* inode scan cursor */
 146	struct xchk_iscan	iscan;
 147
 148	/* Number of non-freespace records found. */
 149	unsigned long long	nr_records;
 150
 151	/* bnobt/cntbt contribution to btreeblks */
 152	xfs_agblock_t		freesp_btblocks;
 153
 154	/* old agf_rmap_blocks counter */
 155	unsigned int		old_rmapbt_fsbcount;
 156};
 157
 158/* Set us up to repair reverse mapping btrees. */
 159int
 160xrep_setup_ag_rmapbt(
 161	struct xfs_scrub	*sc)
 162{
 163	struct xrep_rmap	*rr;
 164	char			*descr;
 165	int			error;
 166
 167	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
 168
 169	descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
 170	error = xrep_setup_xfbtree(sc, descr);
 171	kfree(descr);
 172	if (error)
 173		return error;
 174
 175	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
 176	if (!rr)
 177		return -ENOMEM;
 178
 179	rr->sc = sc;
 180	sc->buf = rr;
 181	return 0;
 182}
 183
 184/* Make sure there's nothing funny about this mapping. */
 185STATIC int
 186xrep_rmap_check_mapping(
 187	struct xfs_scrub	*sc,
 188	const struct xfs_rmap_irec *rec)
 189{
 190	enum xbtree_recpacking	outcome;
 191	int			error;
 192
 193	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
 194		return -EFSCORRUPTED;
 195
 196	/* Make sure this isn't free space. */
 197	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
 198			rec->rm_blockcount, &outcome);
 199	if (error)
 200		return error;
 201	if (outcome != XBTREE_RECPACKING_EMPTY)
 202		return -EFSCORRUPTED;
 203
 204	return 0;
 205}
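/*
 * For example (illustrative): a stashed rmap covering agbnos 40-49 fails
 * this check if the bnobt says that any block in 40-49 is free, since a
 * reverse mapping must describe only allocated space.
 */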
 206
 207/* Store a reverse-mapping record. */
 208static inline int
 209xrep_rmap_stash(
 210	struct xrep_rmap	*rr,
 211	xfs_agblock_t		startblock,
 212	xfs_extlen_t		blockcount,
 213	uint64_t		owner,
 214	uint64_t		offset,
 215	unsigned int		flags)
 216{
 217	struct xfs_rmap_irec	rmap = {
 218		.rm_startblock	= startblock,
 219		.rm_blockcount	= blockcount,
 220		.rm_owner	= owner,
 221		.rm_offset	= offset,
 222		.rm_flags	= flags,
 223	};
 224	struct xfs_scrub	*sc = rr->sc;
 225	struct xfs_btree_cur	*mcur;
 226	int			error = 0;
 227
 228	if (xchk_should_terminate(sc, &error))
 229		return error;
 230
 231	if (xchk_iscan_aborted(&rr->iscan))
 232		return -EFSCORRUPTED;
 233
 234	trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
 235
 236	mutex_lock(&rr->lock);
 237	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
 238	error = xfs_rmap_map_raw(mcur, &rmap);
 239	xfs_btree_del_cursor(mcur, error);
 240	if (error)
 241		goto out_cancel;
 242
 243	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
 244	if (error)
 245		goto out_abort;
 246
 247	mutex_unlock(&rr->lock);
 248	return 0;
 249
 250out_cancel:
 251	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
 252out_abort:
 253	xchk_iscan_abort(&rr->iscan);
 254	mutex_unlock(&rr->lock);
 255	return error;
 256}
 257
 258struct xrep_rmap_stash_run {
 259	struct xrep_rmap	*rr;
 260	uint64_t		owner;
 261	unsigned int		rmap_flags;
 262};
 263
 264static int
 265xrep_rmap_stash_run(
 266	uint32_t			start,
 267	uint32_t			len,
 268	void				*priv)
 269{
 270	struct xrep_rmap_stash_run	*rsr = priv;
 271	struct xrep_rmap		*rr = rsr->rr;
 272
 273	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
 274}
 275
 276/*
 277 * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
 278 * that the ranges are in units of FS blocks.
 279 */
 280STATIC int
 281xrep_rmap_stash_bitmap(
 282	struct xrep_rmap		*rr,
 283	struct xagb_bitmap		*bitmap,
 284	const struct xfs_owner_info	*oinfo)
 285{
 286	struct xrep_rmap_stash_run	rsr = {
 287		.rr			= rr,
 288		.owner			= oinfo->oi_owner,
 289		.rmap_flags		= 0,
 290	};
 291
 292	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
 293		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
 294	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
 295		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
 296
 297	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
 298}
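/*
 * For example (illustrative): stashing an OWN_INOBT bitmap with extents at
 * agbnos 5-6 and 12 emits two rmap records, (5, 2) and (12, 1), both owned
 * by OWN_INOBT with no attr fork or bmbt flags set.
 */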
 299
 300/* Section (I): Finding all file and bmbt extents. */
 301
 302/* Context for accumulating rmaps for an inode fork. */
 303struct xrep_rmap_ifork {
 304	/*
 305	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
 306	 * single rmap.
 307	 */
 308	struct xfs_rmap_irec	accum;
 309
 310	/* Bitmap of bmbt blocks in this AG. */
 311	struct xagb_bitmap	bmbt_blocks;
 312
 313	struct xrep_rmap	*rr;
 314
 315	/* Which inode fork? */
 316	int			whichfork;
 317};
 318
 319/* Stash an rmap that we accumulated while walking an inode fork. */
 320STATIC int
 321xrep_rmap_stash_accumulated(
 322	struct xrep_rmap_ifork	*rf)
 323{
 324	if (rf->accum.rm_blockcount == 0)
 325		return 0;
 326
 327	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
 328			rf->accum.rm_blockcount, rf->accum.rm_owner,
 329			rf->accum.rm_offset, rf->accum.rm_flags);
 330}
 331
 332/* Accumulate a bmbt record. */
 333STATIC int
 334xrep_rmap_visit_bmbt(
 335	struct xfs_btree_cur	*cur,
 336	struct xfs_bmbt_irec	*rec,
 337	void			*priv)
 338{
 339	struct xrep_rmap_ifork	*rf = priv;
 340	struct xfs_mount	*mp = rf->rr->sc->mp;
 341	struct xfs_rmap_irec	*accum = &rf->accum;
 342	xfs_agblock_t		agbno;
 343	unsigned int		rmap_flags = 0;
 344	int			error;
 345
 346	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
 347			rf->rr->sc->sa.pag->pag_agno)
 348		return 0;
 349
 350	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
 351	if (rf->whichfork == XFS_ATTR_FORK)
 352		rmap_flags |= XFS_RMAP_ATTR_FORK;
 353	if (rec->br_state == XFS_EXT_UNWRITTEN)
 354		rmap_flags |= XFS_RMAP_UNWRITTEN;
 355
 356	/* If this bmap is adjacent to the previous one, just add it. */
 357	if (accum->rm_blockcount > 0 &&
 358	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
 359	    agbno == accum->rm_startblock + accum->rm_blockcount &&
 360	    rmap_flags == accum->rm_flags) {
 361		accum->rm_blockcount += rec->br_blockcount;
 362		return 0;
 363	}
 364
 365	/* Otherwise stash the old rmap and start accumulating a new one. */
 366	error = xrep_rmap_stash_accumulated(rf);
 367	if (error)
 368		return error;
 369
 370	accum->rm_startblock = agbno;
 371	accum->rm_blockcount = rec->br_blockcount;
 372	accum->rm_offset = rec->br_startoff;
 373	accum->rm_flags = rmap_flags;
 374	return 0;
 375}
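/*
 * Worked example (illustrative): suppose the data fork maps file offset 0
 * to a 5-block extent at agbno 100 and file offset 5 to a 3-block extent
 * at agbno 105, both written and both in our AG.  The second mapping is
 * logically and physically contiguous with the first and carries the same
 * flags, so the accumulator simply grows to cover agbnos 100-107 and only
 * one 8-block rmap record is stashed.
 */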
 376
 377/* Add a btree block to the bitmap. */
 378STATIC int
 379xrep_rmap_visit_iroot_btree_block(
 380	struct xfs_btree_cur	*cur,
 381	int			level,
 382	void			*priv)
 383{
 384	struct xrep_rmap_ifork	*rf = priv;
 385	struct xfs_buf		*bp;
 386	xfs_fsblock_t		fsbno;
 387	xfs_agblock_t		agbno;
 388
 389	xfs_btree_get_block(cur, level, &bp);
 390	if (!bp)
 391		return 0;
 392
 393	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
 394	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
 395		return 0;
 396
 397	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
 398	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
 399}
 400
 401/*
 402 * Iterate a metadata btree rooted in an inode to collect rmap records for
 403 * anything in this fork that matches the AG.
 404 */
 405STATIC int
 406xrep_rmap_scan_iroot_btree(
 407	struct xrep_rmap_ifork	*rf,
 408	struct xfs_btree_cur	*cur)
 409{
 410	struct xfs_owner_info	oinfo;
 411	struct xrep_rmap	*rr = rf->rr;
 412	int			error;
 413
 414	xagb_bitmap_init(&rf->bmbt_blocks);
 415
 416	/* Record all the blocks in the btree itself. */
 417	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
 418			XFS_BTREE_VISIT_ALL, rf);
 419	if (error)
 420		goto out;
 421
 422	/* Emit rmaps for the btree blocks. */
 423	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
 424	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
 425	if (error)
 426		goto out;
 427
 428	/* Stash any remaining accumulated rmaps. */
 429	error = xrep_rmap_stash_accumulated(rf);
 430out:
 431	xagb_bitmap_destroy(&rf->bmbt_blocks);
 432	return error;
 433}
 434
 435static inline bool
 436is_rt_data_fork(
 437	struct xfs_inode	*ip,
 438	int			whichfork)
 439{
 440	return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
 441}
 442
 443/*
 444 * Iterate the block mapping btree to collect rmap records for anything in this
 445 * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
 446 * block mappings in this fork.
 447 */
 448STATIC int
 449xrep_rmap_scan_bmbt(
 450	struct xrep_rmap_ifork	*rf,
 451	struct xfs_inode	*ip,
 452	bool			*mappings_done)
 453{
 454	struct xrep_rmap	*rr = rf->rr;
 455	struct xfs_btree_cur	*cur;
 456	struct xfs_ifork	*ifp;
 457	int			error;
 458
 459	*mappings_done = false;
 460	ifp = xfs_ifork_ptr(ip, rf->whichfork);
 461	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
 462
 463	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
 464	    xfs_need_iread_extents(ifp)) {
 465		/*
 466		 * If the incore extent cache isn't loaded, scan the bmbt for
 467		 * mapping records.  This avoids loading the incore extent
 468		 * tree, which will increase memory pressure at a time when
 469		 * we're trying to run as quickly as we possibly can.  Ignore
 470		 * realtime extents.
 471		 */
 472		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
 473		if (error)
 474			goto out_cur;
 475
 476		*mappings_done = true;
 477	}
 478
 479	/* Scan for the bmbt blocks, which always live on the data device. */
 480	error = xrep_rmap_scan_iroot_btree(rf, cur);
 481out_cur:
 482	xfs_btree_del_cursor(cur, error);
 483	return error;
 484}
 485
 486/*
 487 * Iterate the in-core extent cache to collect rmap records for anything in
 488 * this fork that matches the AG.
 489 */
 490STATIC int
 491xrep_rmap_scan_iext(
 492	struct xrep_rmap_ifork	*rf,
 493	struct xfs_ifork	*ifp)
 494{
 495	struct xfs_bmbt_irec	rec;
 496	struct xfs_iext_cursor	icur;
 497	int			error;
 498
 499	for_each_xfs_iext(ifp, &icur, &rec) {
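		/* Skip delalloc reservations; they have no physical blocks yet. */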
 500		if (isnullstartblock(rec.br_startblock))
 501			continue;
 502		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
 503		if (error)
 504			return error;
 505	}
 506
 507	return xrep_rmap_stash_accumulated(rf);
 508}
 509
 510/* Find all the extents from a given AG in an inode fork. */
 511STATIC int
 512xrep_rmap_scan_ifork(
 513	struct xrep_rmap	*rr,
 514	struct xfs_inode	*ip,
 515	int			whichfork)
 516{
 517	struct xrep_rmap_ifork	rf = {
 518		.accum		= { .rm_owner = ip->i_ino, },
 519		.rr		= rr,
 520		.whichfork	= whichfork,
 521	};
 522	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 523	int			error = 0;
 524
 525	if (!ifp)
 526		return 0;
 527
 528	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
 529		bool		mappings_done;
 530
 531		/*
 532		 * Scan the bmap btree for data device mappings.  This includes
 533		 * the btree blocks themselves, even if this is a realtime
 534		 * file.
 535		 */
 536		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
 537		if (error || mappings_done)
 538			return error;
 539	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
 540		return 0;
 541	}
 542
 543	/* Scan incore extent cache if this isn't a realtime file. */
 544	if (xfs_ifork_is_realtime(ip, whichfork))
 545		return 0;
 546
 547	return xrep_rmap_scan_iext(&rf, ifp);
 548}
 549
 550/*
 551 * Take ILOCK on a file that we want to scan.
 552 *
 553 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
 554 * attr bmbt.  Otherwise, take ILOCK_SHARED.
 555 */
 556static inline unsigned int
 557xrep_rmap_scan_ilock(
 558	struct xfs_inode	*ip)
 559{
 560	uint			lock_mode = XFS_ILOCK_SHARED;
 561
 562	if (xfs_need_iread_extents(&ip->i_df)) {
 563		lock_mode = XFS_ILOCK_EXCL;
 564		goto lock;
 565	}
 566
 567	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
 568		lock_mode = XFS_ILOCK_EXCL;
 569
 570lock:
 571	xfs_ilock(ip, lock_mode);
 572	return lock_mode;
 573}
 574
 575/* Record reverse mappings for a file. */
 576STATIC int
 577xrep_rmap_scan_inode(
 578	struct xrep_rmap	*rr,
 579	struct xfs_inode	*ip)
 580{
 581	unsigned int		lock_mode = 0;
 582	int			error;
 583
 584	/*
 585	 * Directory updates (create/link/unlink/rename) drop the directory's
 586	 * ILOCK before finishing any rmapbt updates associated with directory
 587	 * shape changes.  For this scan to coordinate correctly with the live
 588	 * update hook, we must take the only lock (i_rwsem) that is held all
 589	 * the way to dir op completion.  This will get fixed by the parent
 590	 * pointer patchset.
 591	 */
 592	if (S_ISDIR(VFS_I(ip)->i_mode)) {
 593		lock_mode = XFS_IOLOCK_SHARED;
 594		xfs_ilock(ip, lock_mode);
 595	}
 596	lock_mode |= xrep_rmap_scan_ilock(ip);
 597
 598	/* Check the data fork. */
 599	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
 600	if (error)
 601		goto out_unlock;
 602
 603	/* Check the attr fork. */
 604	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
 605	if (error)
 606		goto out_unlock;
 607
 608	/* COW fork extents are "owned" by the refcount btree. */
 609
 610	xchk_iscan_mark_visited(&rr->iscan, ip);
 611out_unlock:
 612	xfs_iunlock(ip, lock_mode);
 613	return error;
 614}
 615
 616/* Section (I): Find all AG metadata extents except for free space metadata. */
 617
 618struct xrep_rmap_inodes {
 619	struct xrep_rmap	*rr;
 620	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
 621	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
 622};
 623
 624/* Record inode btree rmaps. */
 625STATIC int
 626xrep_rmap_walk_inobt(
 627	struct xfs_btree_cur		*cur,
 628	const union xfs_btree_rec	*rec,
 629	void				*priv)
 630{
 631	struct xfs_inobt_rec_incore	irec;
 632	struct xrep_rmap_inodes		*ri = priv;
 633	struct xfs_mount		*mp = cur->bc_mp;
 634	xfs_agblock_t			agbno;
 635	xfs_extlen_t			aglen;
 636	xfs_agino_t			agino;
 637	xfs_agino_t			iperhole;
 638	unsigned int			i;
 639	int				error;
 640
 641	/* Record the inobt blocks. */
 642	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
 643	if (error)
 644		return error;
 645
 646	xfs_inobt_btrec_to_irec(mp, rec, &irec);
 647	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
 648		return -EFSCORRUPTED;
 649
 650	agino = irec.ir_startino;
 651
 652	/* Record a non-sparse inode chunk. */
 653	if (!xfs_inobt_issparse(irec.ir_holemask)) {
 654		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
 655		aglen = max_t(xfs_extlen_t, 1,
 656				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
 657
 658		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
 659	}
 660
 661	/* Iterate each chunk. */
 662	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
 663			XFS_INODES_PER_HOLEMASK_BIT);
 664	aglen = iperhole / mp->m_sb.sb_inopblock;
 665	for (i = 0, agino = irec.ir_startino;
 666	     i < XFS_INOBT_HOLEMASK_BITS;
 667	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
 668		/* Skip holes. */
 669		if (irec.ir_holemask & (1 << i))
 670			continue;
 671
 672		/* Record the inode chunk otherwise. */
 673		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
 674		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
 675		if (error)
 676			return error;
 677	}
 678
 679	return 0;
 680}
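/*
 * Worked example (illustrative): with 4096-byte blocks and 512-byte inodes,
 * sb_inopblock = 8 and XFS_INODES_PER_HOLEMASK_BIT = 64/16 = 4, so
 * iperhole = max(8, 4) = 8 inodes, aglen = 1 block, and i advances by two
 * holemask bits per pass.  A sparse chunk at agino 64 with ir_holemask
 * 0x000F (inodes 64-79 absent) skips the passes for bits 0 and 2 and sets
 * ICHUNKBIT only for the six blocks backing inodes 80-127.
 */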
 681
 682/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
 683STATIC int
 684xrep_rmap_find_inode_rmaps(
 685	struct xrep_rmap	*rr)
 686{
 687	struct xrep_rmap_inodes	ri = {
 688		.rr		= rr,
 689	};
 690	struct xfs_scrub	*sc = rr->sc;
 691	int			error;
 692
 693	xagb_bitmap_init(&ri.inobt_blocks);
 694	xagb_bitmap_init(&ri.ichunk_blocks);
 695
 696	/*
 697	 * Iterate every record in the inobt so we can capture all the inode
 698	 * chunks and the blocks in the inobt itself.
 699	 */
 700	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
 701	if (error)
 702		goto out_bitmap;
 703
 704	/*
 705	 * Note that if there are zero records in the inobt then query_all does
 706	 * nothing and we have to account the empty inobt root manually.
 707	 */
 708	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
 709		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;
 710
 711		error = xagb_bitmap_set(&ri.inobt_blocks,
 712				be32_to_cpu(agi->agi_root), 1);
 713		if (error)
 714			goto out_bitmap;
 715	}
 716
 717	/* Scan the finobt too. */
 718	if (xfs_has_finobt(sc->mp)) {
 719		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
 720				sc->sa.fino_cur);
 721		if (error)
 722			goto out_bitmap;
 723	}
 724
 725	/* Generate rmaps for everything. */
 726	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
 727			&XFS_RMAP_OINFO_INOBT);
 728	if (error)
 729		goto out_bitmap;
 730	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
 731			&XFS_RMAP_OINFO_INODES);
 732
 733out_bitmap:
 734	xagb_bitmap_destroy(&ri.inobt_blocks);
 735	xagb_bitmap_destroy(&ri.ichunk_blocks);
 736	return error;
 737}
 738
 739/* Record a CoW staging extent. */
 740STATIC int
 741xrep_rmap_walk_cowblocks(
 742	struct xfs_btree_cur		*cur,
 743	const struct xfs_refcount_irec	*irec,
 744	void				*priv)
 745{
 746	struct xagb_bitmap		*bitmap = priv;
 747
 748	if (!xfs_refcount_check_domain(irec) ||
 749	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
 750		return -EFSCORRUPTED;
 751
 752	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
 753}
 754
 755/*
 756 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 757 * staging extents.
 758 */
 759STATIC int
 760xrep_rmap_find_refcount_rmaps(
 761	struct xrep_rmap	*rr)
 762{
 763	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
 764	struct xagb_bitmap	cow_blocks;		/* COWBIT */
 765	struct xfs_refcount_irec low = {
 766		.rc_startblock	= 0,
 767		.rc_domain	= XFS_REFC_DOMAIN_COW,
 768	};
 769	struct xfs_refcount_irec high = {
 770		.rc_startblock	= -1U,
 771		.rc_domain	= XFS_REFC_DOMAIN_COW,
 772	};
 773	struct xfs_scrub	*sc = rr->sc;
 774	int			error;
 775
 776	if (!xfs_has_reflink(sc->mp))
 777		return 0;
 778
 779	xagb_bitmap_init(&refcountbt_blocks);
 780	xagb_bitmap_init(&cow_blocks);
 781
 782	/* refcountbt */
 783	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
 784	if (error)
 785		goto out_bitmap;
 786
 787	/* Collect rmaps for CoW staging extents. */
 788	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
 789			xrep_rmap_walk_cowblocks, &cow_blocks);
 790	if (error)
 791		goto out_bitmap;
 792
 793	/* Generate rmaps for everything. */
 794	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
 795	if (error)
 796		goto out_bitmap;
 797	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
 798			&XFS_RMAP_OINFO_REFC);
 799
 800out_bitmap:
 801	xagb_bitmap_destroy(&cow_blocks);
 802	xagb_bitmap_destroy(&refcountbt_blocks);
 803	return error;
 804}
 805
 806/* Generate rmaps for the AG headers (SB/AGF/AGI/AGFL). */
 807STATIC int
 808xrep_rmap_find_agheader_rmaps(
 809	struct xrep_rmap	*rr)
 810{
 811	struct xfs_scrub	*sc = rr->sc;
 812
 813	/* Create one record covering the AG headers, from the sb to the agfl. */
 814	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
 815			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
 816			XFS_RMAP_OWN_FS, 0, 0);
 817}
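/*
 * For example (illustrative): the emitted OWN_FS record runs from the block
 * containing the superblock (agbno 0) through the block containing the
 * AGFL, so it covers agbnos 0-3 when each header occupies its own block
 * (sector size equal to block size), or just agbno 0 when all four
 * sector-sized headers share the first block.
 */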
 818
 819/* Generate rmaps for the log, if it's in this AG. */
 820STATIC int
 821xrep_rmap_find_log_rmaps(
 822	struct xrep_rmap	*rr)
 823{
 824	struct xfs_scrub	*sc = rr->sc;
 825
 826	if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
 827		return 0;
 828
 829	return xrep_rmap_stash(rr,
 830			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
 831			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
 832}
 833
 834/* Check and count all the records that we gathered. */
 835STATIC int
 836xrep_rmap_check_record(
 837	struct xfs_btree_cur		*cur,
 838	const struct xfs_rmap_irec	*rec,
 839	void				*priv)
 840{
 841	struct xrep_rmap		*rr = priv;
 842	int				error;
 843
 844	error = xrep_rmap_check_mapping(rr->sc, rec);
 845	if (error)
 846		return error;
 847
 848	rr->nr_records++;
 849	return 0;
 850}
 851
 852/*
 853 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 854 * blocks, and the new btreeblks count.  Figure out if we have enough free
 855 * space to reconstruct the rmap btree.  The caller must clean up the lists
 856 * if anything goes wrong.  This implements section (I) above.
 857 */
 858STATIC int
 859xrep_rmap_find_rmaps(
 860	struct xrep_rmap	*rr)
 861{
 862	struct xfs_scrub	*sc = rr->sc;
 863	struct xchk_ag		*sa = &sc->sa;
 864	struct xfs_inode	*ip;
 865	struct xfs_btree_cur	*mcur;
 866	int			error;
 867
 868	/* Find all the per-AG metadata. */
 869	xrep_ag_btcur_init(sc, &sc->sa);
 870
 871	error = xrep_rmap_find_inode_rmaps(rr);
 872	if (error)
 873		goto end_agscan;
 874
 875	error = xrep_rmap_find_refcount_rmaps(rr);
 876	if (error)
 877		goto end_agscan;
 878
 879	error = xrep_rmap_find_agheader_rmaps(rr);
 880	if (error)
 881		goto end_agscan;
 882
 883	error = xrep_rmap_find_log_rmaps(rr);
 884end_agscan:
 885	xchk_ag_btcur_free(&sc->sa);
 886	if (error)
 887		return error;
 888
 889	/*
 890	 * Set up for a potentially lengthy filesystem scan by reducing our
 891	 * transaction resource usage for the duration.  Specifically:
 892	 *
 893	 * Unlock the AG header buffers and cancel the transaction to release
 894	 * the log grant space while we scan the filesystem.
 895	 *
 896	 * Create a new empty transaction to eliminate the possibility of the
 897	 * inode scan deadlocking on cyclical metadata.
 898	 *
 899	 * We pass the empty transaction to the file scanning function to avoid
 900	 * repeatedly cycling empty transactions.  This can be done even though
 901	 * we take the IOLOCK to quiesce the file because empty transactions
 902	 * do not take sb_internal.
 903	 */
 904	sa->agf_bp = NULL;
 905	sa->agi_bp = NULL;
 906	xchk_trans_cancel(sc);
 907	error = xchk_trans_alloc_empty(sc);
 908	if (error)
 909		return error;
 910
 911	/* Scan every inode in the filesystem for rmap records. */
 912	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
 913		error = xrep_rmap_scan_inode(rr, ip);
 914		xchk_irele(sc, ip);
 915		if (error)
 916			break;
 917
 918		if (xchk_should_terminate(sc, &error))
 919			break;
 920	}
 921	xchk_iscan_iter_finish(&rr->iscan);
 922	if (error)
 923		return error;
 924
 925	/*
 926	 * Switch out for a real transaction and lock the AG headers in
 927	 * preparation for building a new tree.
 928	 */
 929	xchk_trans_cancel(sc);
 930	error = xchk_setup_fs(sc);
 931	if (error)
 932		return error;
 933	error = xchk_perag_drain_and_lock(sc);
 934	if (error)
 935		return error;
 936
 937	/*
 938	 * If a hook failed to update the in-memory btree, we lack the data to
 939	 * continue the repair.
 940	 */
 941	if (xchk_iscan_aborted(&rr->iscan))
 942		return -EFSCORRUPTED;
 943
 944	/*
 945	 * Now that we have everything locked again, we need to count the
 946	 * number of rmap records stashed in the btree.  This should reflect
 947	 * all actively-owned space in the filesystem.  At the same time, check
 948	 * all our records before we start building a new btree, which requires
 949	 * a bnobt cursor.
 950	 */
 951	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
 952	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
 953			sc->sa.pag);
 954
 955	rr->nr_records = 0;
 956	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
 957
 958	xfs_btree_del_cursor(sc->sa.bno_cur, error);
 959	sc->sa.bno_cur = NULL;
 960	xfs_btree_del_cursor(mcur, error);
 961
 962	return error;
 963}
 964
 965/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
 966
 967struct xrep_rmap_agfl {
 968	struct xagb_bitmap	*bitmap;
 969	xfs_agnumber_t		agno;
 970};
 971
 972/* Add an AGFL block to the rmap list. */
 973STATIC int
 974xrep_rmap_walk_agfl(
 975	struct xfs_mount	*mp,
 976	xfs_agblock_t		agbno,
 977	void			*priv)
 978{
 979	struct xrep_rmap_agfl	*ra = priv;
 980
 981	return xagb_bitmap_set(ra->bitmap, agbno, 1);
 982}
 983
 984/*
 985 * Run one round of reserving space for the new rmapbt and recomputing the
 986 * number of blocks needed to store the previously observed rmapbt records and
 987 * the ones we'll create for the free space metadata.  When we don't need more
 988 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
 989 * true.
 990 */
 991STATIC int
 992xrep_rmap_try_reserve(
 993	struct xrep_rmap	*rr,
 994	struct xfs_btree_cur	*rmap_cur,
 995	struct xagb_bitmap	*freesp_blocks,
 996	uint64_t		*blocks_reserved,
 997	bool			*done)
 998{
 999	struct xrep_rmap_agfl	ra = {
1000		.bitmap		= freesp_blocks,
1001		.agno		= rr->sc->sa.pag->pag_agno,
1002	};
1003	struct xfs_scrub	*sc = rr->sc;
1004	struct xrep_newbt_resv	*resv, *n;
1005	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1006	struct xfs_buf		*agfl_bp;
1007	uint64_t		nr_blocks;	/* RMB */
1008	uint64_t		freesp_records;
1009	int			error;
1010
1011	/*
1012	 * We're going to recompute new_btree.bload.nr_blocks at the end of
1013	 * this function to reflect however many btree blocks we need to store
1014	 * all the rmap records (including the ones that reflect the changes we
1015	 * made to support the new rmapbt blocks), so we save the old value
1016	 * here so we can decide if we've reserved enough blocks.
1017	 */
1018	nr_blocks = rr->new_btree.bload.nr_blocks;
1019
1020	/*
1021	 * Make sure we've reserved enough space for the new btree.  This can
1022	 * change the shape of the free space btrees, which can cause secondary
1023	 * interactions with the rmap records because all three space btrees
1024	 * have the same rmap owner.  We'll account for all that below.
1025	 */
1026	error = xrep_newbt_alloc_blocks(&rr->new_btree,
1027			nr_blocks - *blocks_reserved);
1028	if (error)
1029		return error;
1030
1031	*blocks_reserved = rr->new_btree.bload.nr_blocks;
1032
1033	/* Clear everything in the bitmap. */
1034	xagb_bitmap_destroy(freesp_blocks);
1035
1036	/* Set all the bnobt blocks in the bitmap. */
1037	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1038			sc->sa.pag);
1039	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
1040	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1041	sc->sa.bno_cur = NULL;
1042	if (error)
1043		return error;
1044
1045	/* Set all the cntbt blocks in the bitmap. */
1046	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1047			sc->sa.pag);
1048	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
1049	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
1050	sc->sa.cnt_cur = NULL;
1051	if (error)
1052		return error;
1053
1054	/* Record our new btreeblks value, not counting the two btree roots. */
1055	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
1056
1057	/* Set all the new rmapbt blocks in the bitmap. */
1058	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
1059		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
1060		if (error)
1061			return error;
1062	}
1063
1064	/* Set all the AGFL blocks in the bitmap. */
1065	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
1066	if (error)
1067		return error;
1068
1069	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
1070	if (error)
1071		return error;
1072
1073	/* Count the extents in the bitmap. */
1074	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
1075
1076	/* Compute how many blocks we'll need for all the rmaps. */
1077	error = xfs_btree_bload_compute_geometry(rmap_cur,
1078			&rr->new_btree.bload, rr->nr_records + freesp_records);
1079	if (error)
1080		return error;
1081
1082	/* We're done when we don't need more blocks. */
1083	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
1084	return 0;
1085}
1086
1087/*
1088 * Iteratively reserve space for the rmap btree while recording OWN_AG rmaps for
1089 * the free space metadata.  This implements section (II) above.
1090 */
1091STATIC int
1092xrep_rmap_reserve_space(
1093	struct xrep_rmap	*rr,
1094	struct xfs_btree_cur	*rmap_cur)
1095{
1096	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
1097	uint64_t		blocks_reserved = 0;
1098	bool			done = false;
1099	int			error;
1100
1101	/* Compute how many blocks we'll need for the rmaps collected so far. */
1102	error = xfs_btree_bload_compute_geometry(rmap_cur,
1103			&rr->new_btree.bload, rr->nr_records);
1104	if (error)
1105		return error;
1106
1107	/* Last chance to abort before we start committing fixes. */
1108	if (xchk_should_terminate(rr->sc, &error))
1109		return error;
1110
1111	xagb_bitmap_init(&freesp_blocks);
1112
1113	/*
1114	 * Iteratively reserve space for the new rmapbt and recompute the
1115	 * number of blocks needed to store the previously observed rmapbt
1116	 * records and the ones we'll create for the free space metadata.
1117	 * Finish when we don't need more blocks.
1118	 */
1119	do {
1120		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1121				&blocks_reserved, &done);
1122		if (error)
1123			goto out_bitmap;
1124	} while (!done);
1125
1126	/* Emit rmaps for everything in the free space bitmap. */
1127	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1128	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1129	xchk_ag_btcur_free(&rr->sc->sa);
1130
1131out_bitmap:
1132	xagb_bitmap_destroy(&freesp_blocks);
1133	return error;
1134}
1135
1136/* Section (III): Building the new rmap btree. */
1137
1138/* Update the AGF counters. */
1139STATIC int
1140xrep_rmap_reset_counters(
1141	struct xrep_rmap	*rr)
1142{
1143	struct xfs_scrub	*sc = rr->sc;
1144	struct xfs_perag	*pag = sc->sa.pag;
1145	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1146	xfs_agblock_t		rmap_btblocks;
1147
1148	/*
1149	 * The AGF header contains extra information related to the reverse
1150	 * mapping btree, so we must update those fields here.
1151	 */
1152	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
1153	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
1154	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
1155
1156	/*
1157	 * After we commit the new btree to disk, it is possible that the
1158	 * process to reap the old btree blocks will race with the AIL trying
1159	 * to checkpoint the old btree blocks into the filesystem.  If the new
1160	 * tree is shorter than the old one, the rmapbt write verifier will
1161	 * fail and the AIL will shut down the filesystem.
1162	 *
1163	 * To avoid this, save the old incore btree height values as the alt
1164	 * height values before re-initializing the perag info from the updated
1165	 * AGF to capture all the new values.
1166	 */
1167	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
1168
1169	/* Reinitialize with the values we just logged. */
1170	return xrep_reinit_pagf(sc);
1171}
1172
1173/* Retrieve rmapbt data for bulk load. */
1174STATIC int
1175xrep_rmap_get_records(
1176	struct xfs_btree_cur	*cur,
1177	unsigned int		idx,
1178	struct xfs_btree_block	*block,
1179	unsigned int		nr_wanted,
1180	void			*priv)
1181{
1182	struct xrep_rmap	*rr = priv;
1183	union xfs_btree_rec	*block_rec;
1184	unsigned int		loaded;
1185	int			error;
1186
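	/*
	 * Fill the new btree block: advance the in-memory record cursor once
	 * per slot wanted and copy that record into the block.
	 */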
1187	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1188		int		stat = 0;
1189
1190		error = xfs_btree_increment(rr->mcur, 0, &stat);
1191		if (error)
1192			return error;
1193		if (!stat)
1194			return -EFSCORRUPTED;
1195
1196		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1197		if (error)
1198			return error;
1199		if (!stat)
1200			return -EFSCORRUPTED;
1201
1202		block_rec = xfs_btree_rec_addr(cur, idx, block);
1203		cur->bc_ops->init_rec_from_cur(cur, block_rec);
1204	}
1205
1206	return loaded;
1207}
1208
1209/* Feed one of the new btree blocks to the bulk loader. */
1210STATIC int
1211xrep_rmap_claim_block(
1212	struct xfs_btree_cur	*cur,
1213	union xfs_btree_ptr	*ptr,
1214	void			*priv)
1215{
1216	struct xrep_rmap	*rr = priv;
1217
1218	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1219}
1220
1221/* Custom allocation function for new rmap btrees. */
1222STATIC int
1223xrep_rmap_alloc_vextent(
1224	struct xfs_scrub	*sc,
1225	struct xfs_alloc_arg	*args,
1226	xfs_fsblock_t		alloc_hint)
1227{
1228	int			error;
1229
1230	/*
1231	 * We don't want an rmap update on the allocation, since we iteratively
1232	 * compute the OWN_AG records /after/ allocating blocks for the records
1233	 * that we already know we need to store.  Therefore, fix the freelist
1234	 * with the NORMAP flag set so that we don't also try to create an rmap
1235	 * for new AGFL blocks.
1236	 */
1237	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
1238	if (error)
1239		return error;
1240
1241	/*
1242	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
1243	 * free space btrees or by removing blocks from the AGFL and queueing
1244	 * an EFI to free the block, the transaction will be dirty.  This
1245	 * second case is of interest to us.
1246	 *
1247	 * Later on, we will need to compare gaps in the new recordset against
1248	 * the block usage of all OWN_AG owners in order to free the old
1249	 * btree's blocks, which means that we can't have EFIs for former AGFL
1250	 * blocks attached to the repair transaction when we commit the new
1251	 * btree.
1252	 *
1253	 * xrep_newbt_alloc_blocks guarantees this for us by calling
1254	 * xrep_defer_finish to commit anything that fix_freelist may have
1255	 * added to the transaction.
1256	 */
1257	return xfs_alloc_vextent_near_bno(args, alloc_hint);
1258}
1259
1261/* Count the records in this btree. */
1262STATIC int
1263xrep_rmap_count_records(
1264	struct xfs_btree_cur	*cur,
1265	unsigned long long	*nr)
1266{
1267	int			running = 1;
1268	int			error;
1269
1270	*nr = 0;
1271
1272	error = xfs_btree_goto_left_edge(cur);
1273	if (error)
1274		return error;
1275
1276	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1277		if (running)
1278			(*nr)++;
1279	}
1280
1281	return error;
1282}

1283/*
1284 * Use the collected rmap information to stage a new rmap btree.  If this is
1285 * successful we'll return with the new btree root information logged to the
1286 * repair transaction but not yet committed.  This implements section (III)
1287 * above.
1288 */
1289STATIC int
1290xrep_rmap_build_new_tree(
1291	struct xrep_rmap	*rr)
1292{
1293	struct xfs_scrub	*sc = rr->sc;
1294	struct xfs_perag	*pag = sc->sa.pag;
1295	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1296	struct xfs_btree_cur	*rmap_cur;
1297	xfs_fsblock_t		fsbno;
1298	int			error;
1299
1300	/*
1301	 * Preserve the old rmapbt block count so that we can adjust the
1302	 * per-AG rmapbt reservation after we commit the new btree root and
1303	 * want to dispose of the old btree blocks.
1304	 */
1305	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1306
1307	/*
1308	 * Prepare to construct the new btree by reserving disk space for the
1309	 * new btree and setting up all the accounting information we'll need
1310	 * to root the new btree while it's under construction and before we
1311	 * attach it to the AG header.  The new blocks are accounted to the
1312	 * rmapbt per-AG reservation, which we will adjust further after
1313	 * committing the new btree.
1314	 */
1315	fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
1316	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1317			fsbno, XFS_AG_RESV_RMAPBT);
1318	rr->new_btree.bload.get_records = xrep_rmap_get_records;
1319	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1320	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1321	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1322	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1323
1324	/*
1325	 * Reserve space for the new rmapbt and compute the OWN_AG rmaps for
1326	 * the free space metadata.
1327	 */
1328	error = xrep_rmap_reserve_space(rr, rmap_cur);
1329	if (error)
1330		goto err_cur;
1331
1332	/*
1333	 * Count the rmapbt records again, because the space reservation
1334	 * for the rmapbt itself probably added more records to the btree.
1335	 */
1336	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1337			&rr->rmap_btree);
1338
1339	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1340	if (error)
1341		goto err_mcur;
1342
1343	/*
1344	 * Due to btree slack factors, it's possible for a new btree to be one
1345	 * level taller than the old btree.  Update the incore btree height so
1346	 * that we don't trip the verifiers when writing the new btree blocks
1347	 * to disk.
1348	 */
1349	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1350
1351	/*
1352	 * Move the cursor to the left edge of the tree so that the first
1353	 * increment in ->get_records positions us at the first record.
1354	 */
1355	error = xfs_btree_goto_left_edge(rr->mcur);
1356	if (error)
1357		goto err_level;
1358
1359	/* Add all observed rmap records. */
1360	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1361	if (error)
1362		goto err_level;
1363
1364	/*
1365	 * Install the new btree in the AG header.  After this point the old
1366	 * btree is no longer accessible and the new tree is live.
1367	 */
1368	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1369	xfs_btree_del_cursor(rmap_cur, 0);
1370	xfs_btree_del_cursor(rr->mcur, 0);
1371	rr->mcur = NULL;
1372
1373	/*
1374	 * Now that we've written the new btree to disk, we don't need to keep
1375	 * updating the in-memory btree.  Abort the scan to stop live updates.
1376	 */
1377	xchk_iscan_abort(&rr->iscan);
1378
1379	/*
1380	 * The newly committed rmap recordset includes mappings for the blocks
1381	 * that we reserved to build the new btree.  If there is excess space
1382	 * reservation to be freed, the corresponding rmap records must also be
1383	 * removed.
1384	 */
1385	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1386
1387	/* Reset the AGF counters now that we've changed the btree shape. */
1388	error = xrep_rmap_reset_counters(rr);
1389	if (error)
1390		goto err_newbt;
1391
1392	/* Dispose of any unused blocks and the accounting information. */
1393	error = xrep_newbt_commit(&rr->new_btree);
1394	if (error)
1395		return error;
1396
1397	return xrep_roll_ag_trans(sc);
1398
1399err_level:
1400	pag->pagf_repair_rmap_level = 0;
1401err_mcur:
1402	xfs_btree_del_cursor(rr->mcur, error);
1403err_cur:
1404	xfs_btree_del_cursor(rmap_cur, error);
1405err_newbt:
1406	xrep_newbt_cancel(&rr->new_btree);
1407	return error;
1408}
1409
1410/* Section (IV): Reaping the old btree. */
1411
1412struct xrep_rmap_find_gaps {
1413	struct xagb_bitmap	rmap_gaps;
1414	xfs_agblock_t		next_agbno;
1415};
1416
1417/* Subtract each free extent in the bnobt from the rmap gaps. */
1418STATIC int
1419xrep_rmap_find_freesp(
1420	struct xfs_btree_cur		*cur,
1421	const struct xfs_alloc_rec_incore *rec,
1422	void				*priv)
1423{
1424	struct xrep_rmap_find_gaps	*rfg = priv;
1425
1426	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1427			rec->ar_blockcount);
1428}
1429
1430/* Record the free space we find, as part of cleaning out the btree. */
1431STATIC int
1432xrep_rmap_find_gaps(
1433	struct xfs_btree_cur		*cur,
1434	const struct xfs_rmap_irec	*rec,
1435	void				*priv)
1436{
1437	struct xrep_rmap_find_gaps	*rfg = priv;
1438	int				error;
1439
1440	if (rec->rm_startblock > rfg->next_agbno) {
1441		error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1442				rec->rm_startblock - rfg->next_agbno);
1443		if (error)
1444			return error;
1445	}
1446
1447	rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1448				rec->rm_startblock + rec->rm_blockcount);
1449	return 0;
1450}
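/*
 * Worked example (illustrative): if the new rmapbt holds records covering
 * agbnos [0, 10) and [15, 20) in a 25-block AG, this walk records the gap
 * [10, 15), and the caller below adds the tail gap [20, 25).  If the bnobt
 * then shows [12, 15) and [20, 25) as free space, the bits left set,
 * [10, 12), can only be the old rmapbt's blocks, and they get reaped.
 */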
1451
1452/*
1453 * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
1454 * a list of gaps in the rmap records and a list of the extents mentioned in
1455 * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
1456 * in the bnobt is a block from the old rmapbt and can be removed.
1457 */
1458STATIC int
1459xrep_rmap_remove_old_tree(
1460	struct xrep_rmap	*rr)
1461{
1462	struct xrep_rmap_find_gaps rfg = {
1463		.next_agbno	= 0,
1464	};
1465	struct xfs_scrub	*sc = rr->sc;
1466	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
1467	struct xfs_perag	*pag = sc->sa.pag;
1468	struct xfs_btree_cur	*mcur;
1469	xfs_agblock_t		agend;
1470	int			error;
1471
1472	xagb_bitmap_init(&rfg.rmap_gaps);
1473
1474	/* Compute free space from the new rmapbt. */
1475	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1476
1477	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1478	xfs_btree_del_cursor(mcur, error);
1479	if (error)
1480		goto out_bitmap;
1481
1482	/* Insert a record for space between the last rmap and EOAG. */
1483	agend = be32_to_cpu(agf->agf_length);
1484	if (rfg.next_agbno < agend) {
1485		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1486				agend - rfg.next_agbno);
1487		if (error)
1488			goto out_bitmap;
1489	}
1490
1491	/* Compute free space from the existing bnobt. */
1492	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1493			sc->sa.pag);
1494	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1495			&rfg);
1496	xfs_btree_del_cursor(sc->sa.bno_cur, error);
1497	sc->sa.bno_cur = NULL;
1498	if (error)
1499		goto out_bitmap;
1500
1501	/*
1502	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
1503	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
1504	 * block usage count back to the per-AG rmapbt reservation (and not
1505	 * fdblocks, since the rmap btree lives in free space) to keep the
1506	 * reservation and free space accounting correct.
1507	 */
1508	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1509			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1510	if (error)
1511		goto out_bitmap;
1512
1513	/*
1514	 * Now that we've zapped all the old rmapbt blocks we can turn off
1515	 * the alternate height mechanism and reset the per-AG space
1516	 * reservation.
1517	 */
1518	pag->pagf_repair_rmap_level = 0;
1519	sc->flags |= XREP_RESET_PERAG_RESV;
1520out_bitmap:
1521	xagb_bitmap_destroy(&rfg.rmap_gaps);
1522	return error;
1523}
1524
1525static inline bool
1526xrep_rmapbt_want_live_update(
1527	struct xchk_iscan		*iscan,
1528	const struct xfs_owner_info	*oi)
1529{
1530	if (xchk_iscan_aborted(iscan))
1531		return false;
1532
1533	/*
1534	 * Before unlocking the AG header to perform the inode scan, we
1535	 * recorded reverse mappings for all AG metadata except for the OWN_AG
1536	 * metadata.  IOWs, the in-memory btree knows about the AG headers, the
1537 * two inode btrees, the CoW staging extents, and the refcount btree.
1538	 * For these types of metadata, we need to record the live updates in
1539	 * the in-memory rmap btree.
1540	 *
1541	 * However, we do not scan the free space btrees or the AGFL until we
1542	 * have re-locked the AGF and are ready to reserve space for the new
1543	 * rmap btree, so we do not want live updates for OWN_AG metadata.
1544	 */
1545	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1546		return oi->oi_owner != XFS_RMAP_OWN_AG;
1547
1548	/* Ignore updates to files that the scanner hasn't visited yet. */
1549	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1550}
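/*
 * For example (illustrative): a live OWN_AG update is ignored because the
 * OWN_AG rmaps are regenerated from the free space metadata after the AGF
 * is relocked; an OWN_INOBT update is always recorded; and an update for
 * inode 131 is recorded only if the inode scan has already visited 131.
 */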
1551
1552/*
1553 * Apply a rmapbt update from the regular filesystem into our shadow btree.
1554 * We're running from the thread that owns the AGF buffer and is generating
1555 * the update, so we must be careful about which parts of the struct xrep_rmap
1556 * that we change.
1557 */
1558static int
1559xrep_rmapbt_live_update(
1560	struct notifier_block		*nb,
1561	unsigned long			action,
1562	void				*data)
1563{
1564	struct xfs_rmap_update_params	*p = data;
1565	struct xrep_rmap		*rr;
1566	struct xfs_mount		*mp;
1567	struct xfs_btree_cur		*mcur;
1568	struct xfs_trans		*tp;
1569	void				*txcookie;
1570	int				error;
1571
1572	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1573	mp = rr->sc->mp;
1574
1575	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1576		goto out_unlock;
1577
1578	trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
1579
1580	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
1581	if (error)
1582		goto out_abort;
1583
1584	mutex_lock(&rr->lock);
1585	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1586	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1587			p->blockcount, &p->oinfo, p->unwritten);
1588	xfs_btree_del_cursor(mcur, error);
1589	if (error)
1590		goto out_cancel;
1591
1592	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1593	if (error)
1594		goto out_cancel;
1595
1596	xrep_trans_cancel_hook_dummy(&txcookie, tp);
1597	mutex_unlock(&rr->lock);
1598	return NOTIFY_DONE;
1599
1600out_cancel:
1601	xfbtree_trans_cancel(&rr->rmap_btree, tp);
1602	xrep_trans_cancel_hook_dummy(&txcookie, tp);
1603out_abort:
1604	mutex_unlock(&rr->lock);
1605	xchk_iscan_abort(&rr->iscan);
1606out_unlock:
1607	return NOTIFY_DONE;
1608}
1609
1610/* Set up the filesystem scan components. */
1611STATIC int
1612xrep_rmap_setup_scan(
1613	struct xrep_rmap	*rr)
1614{
1615	struct xfs_scrub	*sc = rr->sc;
1616	int			error;
1617
1618	mutex_init(&rr->lock);
1619
1620	/* Set up in-memory rmap btree */
1621	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
1622			sc->sa.pag->pag_agno);
1623	if (error)
1624		goto out_mutex;
1625
1626	/* Retry iget every tenth of a second for up to 30 seconds. */
1627	xchk_iscan_start(sc, 30000, 100, &rr->iscan);
1628
1629	/*
1630	 * Hook into live rmap operations so that we can update our in-memory
1631	 * btree to reflect live changes on the filesystem.  Since we drop the
1632	 * AGF buffer to scan all the inodes, we need this piece to avoid
1633	 * installing a stale btree.
1634	 */
1635	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
1636	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
1637	error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
1638	if (error)
1639		goto out_iscan;
1640	return 0;
1641
1642out_iscan:
1643	xchk_iscan_teardown(&rr->iscan);
1644	xfbtree_destroy(&rr->rmap_btree);
1645out_mutex:
1646	mutex_destroy(&rr->lock);
1647	return error;
1648}
1649
1650/* Tear down scan components. */
1651STATIC void
1652xrep_rmap_teardown(
1653	struct xrep_rmap	*rr)
1654{
1655	struct xfs_scrub	*sc = rr->sc;
1656
1657	xchk_iscan_abort(&rr->iscan);
1658	xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
1659	xchk_iscan_teardown(&rr->iscan);
1660	xfbtree_destroy(&rr->rmap_btree);
1661	mutex_destroy(&rr->lock);
1662}
1663
1664/* Repair the rmap btree for some AG. */
1665int
1666xrep_rmapbt(
1667	struct xfs_scrub	*sc)
1668{
1669	struct xrep_rmap	*rr = sc->buf;
1670	int			error;
1671
1672	error = xrep_rmap_setup_scan(rr);
1673	if (error)
1674		return error;
1675
1676	/*
1677	 * Collect rmaps for everything in this AG that isn't space metadata.
1678	 * These rmaps won't change even as we try to allocate blocks.
1679	 */
1680	error = xrep_rmap_find_rmaps(rr);
1681	if (error)
1682		goto out_records;
1683
1684	/* Rebuild the rmap information. */
1685	error = xrep_rmap_build_new_tree(rr);
1686	if (error)
1687		goto out_records;
1688
1689	/* Kill the old tree. */
1690	error = xrep_rmap_remove_old_tree(rr);
1691	if (error)
1692		goto out_records;
1693
1694out_records:
1695	xrep_rmap_teardown(rr);
1696	return error;
1697}