v6.9.4
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <djwong@kernel.org>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_trans_resv.h"
  11#include "xfs_mount.h"
  12#include "xfs_btree.h"
  13#include "xfs_log_format.h"
  14#include "xfs_trans.h"
  15#include "xfs_sb.h"
  16#include "xfs_inode.h"
  17#include "xfs_alloc.h"
  18#include "xfs_alloc_btree.h"
  19#include "xfs_ialloc.h"
  20#include "xfs_ialloc_btree.h"
  21#include "xfs_rmap.h"
  22#include "xfs_rmap_btree.h"
  23#include "xfs_refcount_btree.h"
  24#include "xfs_extent_busy.h"
  25#include "xfs_ag.h"
  26#include "xfs_ag_resv.h"
  27#include "xfs_quota.h"
  28#include "xfs_qm.h"
  29#include "xfs_defer.h"
  30#include "xfs_errortag.h"
  31#include "xfs_error.h"
  32#include "xfs_reflink.h"
  33#include "xfs_health.h"
  34#include "xfs_buf_mem.h"
  35#include "scrub/scrub.h"
  36#include "scrub/common.h"
  37#include "scrub/trace.h"
  38#include "scrub/repair.h"
  39#include "scrub/bitmap.h"
  40#include "scrub/stats.h"
  41#include "scrub/xfile.h"
  42
  43/*
  44 * Attempt to repair some metadata, if the metadata is corrupt and userspace
  45 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
   46 * and will update the repair statistics if it thinks it repaired anything.
  47 */
  48int
  49xrep_attempt(
  50	struct xfs_scrub	*sc,
  51	struct xchk_stats_run	*run)
  52{
  53	u64			repair_start;
  54	int			error = 0;
  55
  56	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
  57
  58	xchk_ag_btcur_free(&sc->sa);
  59
  60	/* Repair whatever's broken. */
  61	ASSERT(sc->ops->repair);
  62	run->repair_attempted = true;
  63	repair_start = xchk_stats_now();
  64	error = sc->ops->repair(sc);
  65	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
  66	run->repair_ns += xchk_stats_elapsed_ns(repair_start);
  67	switch (error) {
  68	case 0:
  69		/*
  70		 * Repair succeeded.  Commit the fixes and perform a second
  71		 * scrub so that we can tell userspace if we fixed the problem.
  72		 */
  73		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
  74		sc->flags |= XREP_ALREADY_FIXED;
  75		run->repair_succeeded = true;
  76		return -EAGAIN;
  77	case -ECHRNG:
  78		sc->flags |= XCHK_NEED_DRAIN;
  79		run->retries++;
  80		return -EAGAIN;
  81	case -EDEADLOCK:
  82		/* Tell the caller to try again having grabbed all the locks. */
  83		if (!(sc->flags & XCHK_TRY_HARDER)) {
  84			sc->flags |= XCHK_TRY_HARDER;
  85			run->retries++;
  86			return -EAGAIN;
  87		}
  88		/*
  89		 * We tried harder but still couldn't grab all the resources
  90		 * we needed to fix it.  The corruption has not been fixed,
  91		 * so exit to userspace with the scan's output flags unchanged.
  92		 */
  93		return 0;
  94	default:
  95		/*
  96		 * EAGAIN tells the caller to re-scrub, so we cannot return
  97		 * that here.
  98		 */
  99		ASSERT(error != -EAGAIN);
 100		return error;
 101	}
 102}
 103
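/*
 * Illustrative sketch (editor's addition, not from the kernel sources): the
 * -EAGAIN contract above implies a retry loop in the scrub front end, roughly
 * of the following shape.  The real driver in scrub.c also handles setup,
 * teardown, and retry limits around these calls; those details are omitted.
 */
static inline int
xrep_attempt_retry_sketch(
	struct xfs_scrub	*sc,
	struct xchk_stats_run	*run)
{
	int			error;

	do {
		error = sc->ops->scrub(sc);
		if (error || !xrep_will_attempt(sc) ||
		    (sc->flags & XREP_ALREADY_FIXED))
			break;
		error = xrep_attempt(sc, run);
		/* -EAGAIN: fixes were committed; scrub again to verify. */
	} while (error == -EAGAIN);

	return error;
}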
 104/*
 105 * Complain about unfixable problems in the filesystem.  We don't log
 106 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 107 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 108 * administrator isn't running xfs_scrub in no-repairs mode.
 109 *
 110 * Use this helper function because _ratelimited silently declares a static
 111 * structure to track rate limiting information.
 112 */
 113void
 114xrep_failure(
 115	struct xfs_mount	*mp)
 116{
 117	xfs_alert_ratelimited(mp,
 118"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
 119}
 120
 121/*
 122 * Repair probe -- userspace uses this to probe if we're willing to repair a
 123 * given mountpoint.
 124 */
 125int
 126xrep_probe(
 127	struct xfs_scrub	*sc)
 128{
 129	int			error = 0;
 130
 131	if (xchk_should_terminate(sc, &error))
 132		return error;
 133
 134	return 0;
 135}
 136
 137/*
 138 * Roll a transaction, keeping the AG headers locked and reinitializing
 139 * the btree cursors.
 140 */
 141int
 142xrep_roll_ag_trans(
 143	struct xfs_scrub	*sc)
 144{
 145	int			error;
 146
 147	/*
 148	 * Keep the AG header buffers locked while we roll the transaction.
 149	 * Ensure that both AG buffers are dirty and held when we roll the
 150	 * transaction so that they move forward in the log without losing the
 151	 * bli (and hence the bli type) when the transaction commits.
 152	 *
 153	 * Normal code would never hold clean buffers across a roll, but repair
 154	 * needs both buffers to maintain a total lock on the AG.
 155	 */
 156	if (sc->sa.agi_bp) {
 157		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
 158		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 159	}
 160
 161	if (sc->sa.agf_bp) {
 162		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
 163		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 164	}
 165
 166	/*
 167	 * Roll the transaction.  We still hold the AG header buffers locked
 168	 * regardless of whether or not that succeeds.  On failure, the buffers
 169	 * will be released during teardown on our way out of the kernel.  If
 170	 * successful, join the buffers to the new transaction and move on.
 171	 */
 172	error = xfs_trans_roll(&sc->tp);
 173	if (error)
 174		return error;
 175
 176	/* Join the AG headers to the new transaction. */
 177	if (sc->sa.agi_bp)
 178		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
 179	if (sc->sa.agf_bp)
 180		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
 181
 182	return 0;
 183}
 184
 185/* Roll the scrub transaction, holding the primary metadata locked. */
 186int
 187xrep_roll_trans(
 188	struct xfs_scrub	*sc)
 189{
 190	if (!sc->ip)
 191		return xrep_roll_ag_trans(sc);
 192	return xfs_trans_roll_inode(&sc->tp, sc->ip);
 193}
 194
 195/* Finish all deferred work attached to the repair transaction. */
 196int
 197xrep_defer_finish(
 198	struct xfs_scrub	*sc)
 199{
 200	int			error;
 201
 202	/*
 203	 * Keep the AG header buffers locked while we complete deferred work
 204	 * items.  Ensure that both AG buffers are dirty and held when we roll
 205	 * the transaction so that they move forward in the log without losing
 206	 * the bli (and hence the bli type) when the transaction commits.
 207	 *
 208	 * Normal code would never hold clean buffers across a roll, but repair
 209	 * needs both buffers to maintain a total lock on the AG.
 210	 */
 211	if (sc->sa.agi_bp) {
 212		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
 213		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 214	}
 215
 216	if (sc->sa.agf_bp) {
 217		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
 218		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 219	}
 220
 221	/*
 222	 * Finish all deferred work items.  We still hold the AG header buffers
 223	 * locked regardless of whether or not that succeeds.  On failure, the
 224	 * buffers will be released during teardown on our way out of the
 225	 * kernel.  If successful, join the buffers to the new transaction
 226	 * and move on.
 227	 */
 228	error = xfs_defer_finish(&sc->tp);
 229	if (error)
 230		return error;
 231
 232	/*
 233	 * Release the hold that we set above because defer_finish won't do
 234	 * that for us.  The defer roll code redirties held buffers after each
 235	 * roll, so the AG header buffers should be ready for logging.
 236	 */
 237	if (sc->sa.agi_bp)
 238		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
 239	if (sc->sa.agf_bp)
 240		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
 241
 242	return 0;
 243}
 244
 245/*
 246 * Does the given AG have enough space to rebuild a btree?  Neither AG
 247 * reservation can be critical, and we must have enough space (factoring
 248 * in AG reservations) to construct a whole btree.
 249 */
 250bool
 251xrep_ag_has_space(
 252	struct xfs_perag	*pag,
 253	xfs_extlen_t		nr_blocks,
 254	enum xfs_ag_resv_type	type)
 255{
 256	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
 257		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
 258		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
 259}
 260
 261/*
 262 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 263 * worst case estimate for the number of blocks we'd need to rebuild one of
 264 * any type of per-AG btree.
 265 */
 266xfs_extlen_t
 267xrep_calc_ag_resblks(
 268	struct xfs_scrub		*sc)
 269{
 270	struct xfs_mount		*mp = sc->mp;
 271	struct xfs_scrub_metadata	*sm = sc->sm;
 272	struct xfs_perag		*pag;
 273	struct xfs_buf			*bp;
 274	xfs_agino_t			icount = NULLAGINO;
 275	xfs_extlen_t			aglen = NULLAGBLOCK;
 276	xfs_extlen_t			usedlen;
 277	xfs_extlen_t			freelen;
 278	xfs_extlen_t			bnobt_sz;
 279	xfs_extlen_t			inobt_sz;
 280	xfs_extlen_t			rmapbt_sz;
 281	xfs_extlen_t			refcbt_sz;
 282	int				error;
 283
 284	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 285		return 0;
 286
 287	pag = xfs_perag_get(mp, sm->sm_agno);
 288	if (xfs_perag_initialised_agi(pag)) {
 289		/* Use in-core icount if possible. */
 290		icount = pag->pagi_count;
 291	} else {
 292		/* Try to get the actual counters from disk. */
 293		error = xfs_ialloc_read_agi(pag, NULL, &bp);
 294		if (!error) {
 295			icount = pag->pagi_count;
 296			xfs_buf_relse(bp);
 297		}
 298	}
 299
 300	/* Now grab the block counters from the AGF. */
 301	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
 302	if (error) {
 303		aglen = pag->block_count;
 304		freelen = aglen;
 305		usedlen = aglen;
 306	} else {
 307		struct xfs_agf	*agf = bp->b_addr;
 308
 309		aglen = be32_to_cpu(agf->agf_length);
 310		freelen = be32_to_cpu(agf->agf_freeblks);
 311		usedlen = aglen - freelen;
 312		xfs_buf_relse(bp);
 313	}
 314
 315	/* If the icount is impossible, make some worst-case assumptions. */
 316	if (icount == NULLAGINO ||
 317	    !xfs_verify_agino(pag, icount)) {
 318		icount = pag->agino_max - pag->agino_min + 1;
 319	}
 320
 321	/* If the block counts are impossible, make worst-case assumptions. */
 322	if (aglen == NULLAGBLOCK ||
 323	    aglen != pag->block_count ||
 324	    freelen >= aglen) {
 325		aglen = pag->block_count;
 326		freelen = aglen;
 327		usedlen = aglen;
 328	}
 329	xfs_perag_put(pag);
 330
 331	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
 332			freelen, usedlen);
 333
 334	/*
 335	 * Figure out how many blocks we'd need worst case to rebuild
 336	 * each type of btree.  Note that we can only rebuild the
 337	 * bnobt/cntbt or inobt/finobt as pairs.
 338	 */
 339	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
 340	if (xfs_has_sparseinodes(mp))
 341		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 342				XFS_INODES_PER_HOLEMASK_BIT);
 343	else
 344		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 345				XFS_INODES_PER_CHUNK);
 346	if (xfs_has_finobt(mp))
 347		inobt_sz *= 2;
 348	if (xfs_has_reflink(mp))
 349		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
 350	else
 351		refcbt_sz = 0;
 352	if (xfs_has_rmapbt(mp)) {
 353		/*
 354		 * Guess how many blocks we need to rebuild the rmapbt.
 355		 * For non-reflink filesystems we can't have more records than
 356		 * used blocks.  However, with reflink it's possible to have
 357		 * more than one rmap record per AG block.  We don't know how
 358		 * many rmaps there could be in the AG, so we start off with
  359		 * what we hope is a generous over-estimation.
 360		 */
 361		if (xfs_has_reflink(mp))
 362			rmapbt_sz = xfs_rmapbt_calc_size(mp,
 363					(unsigned long long)aglen * 2);
 364		else
 365			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
 366	} else {
 367		rmapbt_sz = 0;
 368	}
 369
 370	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
 371			inobt_sz, rmapbt_sz, refcbt_sz);
 372
 373	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 374}
 375
 376/*
 377 * Reconstructing per-AG Btrees
 378 *
 379 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 380 * we scan secondary space metadata to derive the records that should be in
 381 * the damaged btree, initialize a fresh btree root, and insert the records.
 382 * Note that for rebuilding the rmapbt we scan all the primary data to
 383 * generate the new records.
 384 *
 385 * However, that leaves the matter of removing all the metadata describing the
 386 * old broken structure.  For primary metadata we use the rmap data to collect
 387 * every extent with a matching rmap owner (bitmap); we then iterate all other
 388 * metadata structures with the same rmap owner to collect the extents that
 389 * cannot be removed (sublist).  We then subtract sublist from bitmap to
 390 * derive the blocks that were used by the old btree.  These blocks can be
 391 * reaped.
 392 *
 393 * For rmapbt reconstructions we must use different tactics for extent
 394 * collection.  First we iterate all primary metadata (this excludes the old
 395 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 396 * records are collected as bitmap.  The bnobt records are collected as
 397 * sublist.  As with the other btrees we subtract sublist from bitmap, and the
  398 * result (since the rmapbt lives in the free space) is the set of blocks from the
 399 * old rmapbt.
 400 */
 401
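/*
 * Illustrative sketch (editor's addition, not from the kernel sources): the
 * "subtract sublist from bitmap" computation described above, in schematic
 * form.  The helpers collect_rmaps_by_owner(), collect_inuse_by_owner(),
 * bitmap_subtract(), and reap_extents() are hypothetical stand-ins; the real
 * repair code uses the bitmap helpers from scrub/bitmap.h and the reaping
 * code in scrub/reap.c.
 */
static inline int
xrep_reap_old_btree_sketch(
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo)
{
	struct xagb_bitmap	old_blocks;	/* "bitmap" in the text above */
	struct xagb_bitmap	keep_blocks;	/* "sublist" in the text above */
	int			error;

	/* Collect every extent whose rmap owner matches the broken btree. */
	error = collect_rmaps_by_owner(sc, oinfo, &old_blocks);
	if (error)
		return error;

	/* Collect the same-owner extents still referenced by other metadata. */
	error = collect_inuse_by_owner(sc, oinfo, &keep_blocks);
	if (error)
		return error;

	/* bitmap - sublist = blocks that belonged only to the old btree. */
	bitmap_subtract(&old_blocks, &keep_blocks);
	return reap_extents(sc, &old_blocks, oinfo);
}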
 402/* Ensure the freelist is the correct size. */
 403int
 404xrep_fix_freelist(
 405	struct xfs_scrub	*sc,
 406	int			alloc_flags)
 407{
 408	struct xfs_alloc_arg	args = {0};
 409
 410	args.mp = sc->mp;
 411	args.tp = sc->tp;
 412	args.agno = sc->sa.pag->pag_agno;
 413	args.alignment = 1;
 414	args.pag = sc->sa.pag;
 415
 416	return xfs_alloc_fix_freelist(&args, alloc_flags);
 417}
 418
 419/*
 420 * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 421 *
 422 * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 423 * the AG headers by using the rmap data to rummage through the AG looking for
 424 * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 425 * or the rmap data are corrupt.
 426 *
 427 * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 428 * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 429 * AGI is being rebuilt.  It must maintain these locks until it's safe for
 430 * other threads to change the btrees' shapes.  The caller provides
 431 * information about the btrees to look for by passing in an array of
 432 * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 433 * The (root, height) fields will be set on return if anything is found.  The
 434 * last element of the array should have a NULL buf_ops to mark the end of the
 435 * array.
 436 *
 437 * For every rmapbt record matching any of the rmap owners in btree_info,
 438 * read each block referenced by the rmap record.  If the block is a btree
 439 * block from this filesystem matching any of the magic numbers and has a
 440 * level higher than what we've already seen, remember the block and the
 441 * height of the tree required to have such a block.  When the call completes,
 442 * we return the highest block we've found for each btree description; those
 443 * should be the roots.
 444 */
 445
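/*
 * Illustrative sketch (editor's addition, not from the kernel sources): a
 * caller hunting for the free space btree roots might fill out the lookup
 * table roughly like this, per the contract described above.  The table is
 * illustrative only; see the AGF repair code in scrub/agheader_repair.c for
 * the real one.
 */
static inline int
xrep_find_allocbt_roots_sketch(
	struct xfs_scrub	*sc,
	struct xfs_buf		*agf_bp,
	struct xfs_buf		*agfl_bp)
{
	struct xrep_find_ag_btree	fab[] = {
		{
			.rmap_owner	= XFS_RMAP_OWN_AG,
			.buf_ops	= &xfs_bnobt_buf_ops,
		},
		{
			.rmap_owner	= XFS_RMAP_OWN_AG,
			.buf_ops	= &xfs_cntbt_buf_ops,
		},
		{
			/* A NULL buf_ops terminates the array. */
			.buf_ops	= NULL,
		},
	};
	int				error;

	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
	if (error)
		return error;

	/* On success, fab[i].root and fab[i].height hold the candidates. */
	return 0;
}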
 446struct xrep_findroot {
 447	struct xfs_scrub		*sc;
 448	struct xfs_buf			*agfl_bp;
 449	struct xfs_agf			*agf;
 450	struct xrep_find_ag_btree	*btree_info;
 451};
 452
 453/* See if our block is in the AGFL. */
 454STATIC int
 455xrep_findroot_agfl_walk(
 456	struct xfs_mount	*mp,
 457	xfs_agblock_t		bno,
 458	void			*priv)
 459{
 460	xfs_agblock_t		*agbno = priv;
 461
 462	return (*agbno == bno) ? -ECANCELED : 0;
 463}
 464
 465/* Does this block match the btree information passed in? */
 466STATIC int
 467xrep_findroot_block(
 468	struct xrep_findroot		*ri,
 469	struct xrep_find_ag_btree	*fab,
 470	uint64_t			owner,
 471	xfs_agblock_t			agbno,
 472	bool				*done_with_block)
 473{
 474	struct xfs_mount		*mp = ri->sc->mp;
 475	struct xfs_buf			*bp;
 476	struct xfs_btree_block		*btblock;
 477	xfs_daddr_t			daddr;
 478	int				block_level;
 479	int				error = 0;
 480
 481	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
 482
 483	/*
 484	 * Blocks in the AGFL have stale contents that might just happen to
 485	 * have a matching magic and uuid.  We don't want to pull these blocks
 486	 * in as part of a tree root, so we have to filter out the AGFL stuff
 487	 * here.  If the AGFL looks insane we'll just refuse to repair.
 488	 */
 489	if (owner == XFS_RMAP_OWN_AG) {
 490		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 491				xrep_findroot_agfl_walk, &agbno);
 492		if (error == -ECANCELED)
 493			return 0;
 494		if (error)
 495			return error;
 496	}
 497
 498	/*
 499	 * Read the buffer into memory so that we can see if it's a match for
 500	 * our btree type.  We have no clue if it is beforehand, and we want to
 501	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
 502	 * will cause needless disk reads in subsequent calls to this function)
 503	 * and logging metadata verifier failures.
 504	 *
 505	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
 506	 * memory from some other caller it will already have b_ops assigned.
 507	 * If it was in memory from a previous unsuccessful findroot_block
 508	 * call, the buffer won't have b_ops but it should be clean and ready
 509	 * for us to try to verify if the read call succeeds.  The same applies
 510	 * if the buffer wasn't in memory at all.
 511	 *
 512	 * Note: If we never match a btree type with this buffer, it will be
 513	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
 514	 * the buffer gets written.
 515	 */
 516	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
 517			mp->m_bsize, 0, &bp, NULL);
 518	if (error)
 519		return error;
 520
 521	/* Ensure the block magic matches the btree type we're looking for. */
 522	btblock = XFS_BUF_TO_BLOCK(bp);
 523	ASSERT(fab->buf_ops->magic[1] != 0);
 524	if (btblock->bb_magic != fab->buf_ops->magic[1])
 525		goto out;
 526
 527	/*
 528	 * If the buffer already has ops applied and they're not the ones for
 529	 * this btree type, we know this block doesn't match the btree and we
 530	 * can bail out.
 531	 *
 532	 * If the buffer ops match ours, someone else has already validated
 533	 * the block for us, so we can move on to checking if this is a root
 534	 * block candidate.
 535	 *
 536	 * If the buffer does not have ops, nobody has successfully validated
 537	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
 538	 * and structure match this btree type then we'll move on to checking
 539	 * if it's a root block candidate.  If there is no match, bail out.
 540	 */
 541	if (bp->b_ops) {
 542		if (bp->b_ops != fab->buf_ops)
 543			goto out;
 544	} else {
 545		ASSERT(!xfs_trans_buf_is_dirty(bp));
 546		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
 547				&mp->m_sb.sb_meta_uuid))
 548			goto out;
 549		/*
 550		 * Read verifiers can reference b_ops, so we set the pointer
 551		 * here.  If the verifier fails we'll reset the buffer state
 552		 * to what it was before we touched the buffer.
 553		 */
 554		bp->b_ops = fab->buf_ops;
 555		fab->buf_ops->verify_read(bp);
 556		if (bp->b_error) {
 557			bp->b_ops = NULL;
 558			bp->b_error = 0;
 559			goto out;
 560		}
 561
 562		/*
 563		 * Some read verifiers will (re)set b_ops, so we must be
 564		 * careful not to change b_ops after running the verifier.
 565		 */
 566	}
 567
 568	/*
 569	 * This block passes the magic/uuid and verifier tests for this btree
 570	 * type.  We don't need the caller to try the other tree types.
 571	 */
 572	*done_with_block = true;
 573
 574	/*
 575	 * Compare this btree block's level to the height of the current
 576	 * candidate root block.
 577	 *
 578	 * If the level matches the root we found previously, throw away both
 579	 * blocks because there can't be two candidate roots.
 580	 *
 581	 * If level is lower in the tree than the root we found previously,
 582	 * ignore this block.
 583	 */
 584	block_level = xfs_btree_get_level(btblock);
 585	if (block_level + 1 == fab->height) {
 586		fab->root = NULLAGBLOCK;
 587		goto out;
 588	} else if (block_level < fab->height) {
 589		goto out;
 590	}
 591
 592	/*
 593	 * This is the highest block in the tree that we've found so far.
 594	 * Update the btree height to reflect what we've learned from this
 595	 * block.
 596	 */
 597	fab->height = block_level + 1;
 598
 599	/*
 600	 * If this block doesn't have sibling pointers, then it's the new root
 601	 * block candidate.  Otherwise, the root will be found farther up the
 602	 * tree.
 603	 */
 604	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
 605	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
 606		fab->root = agbno;
 607	else
 608		fab->root = NULLAGBLOCK;
 609
 610	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
 611			be32_to_cpu(btblock->bb_magic), fab->height - 1);
 612out:
 613	xfs_trans_brelse(ri->sc->tp, bp);
 614	return error;
 615}
 616
 617/*
 618 * Do any of the blocks in this rmap record match one of the btrees we're
 619 * looking for?
 620 */
 621STATIC int
 622xrep_findroot_rmap(
 623	struct xfs_btree_cur		*cur,
 624	const struct xfs_rmap_irec	*rec,
 625	void				*priv)
 626{
 627	struct xrep_findroot		*ri = priv;
 628	struct xrep_find_ag_btree	*fab;
 629	xfs_agblock_t			b;
 630	bool				done;
 631	int				error = 0;
 632
 633	/* Ignore anything that isn't AG metadata. */
 634	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
 635		return 0;
 636
 637	/* Otherwise scan each block + btree type. */
 638	for (b = 0; b < rec->rm_blockcount; b++) {
 639		done = false;
 640		for (fab = ri->btree_info; fab->buf_ops; fab++) {
 641			if (rec->rm_owner != fab->rmap_owner)
 642				continue;
 643			error = xrep_findroot_block(ri, fab,
 644					rec->rm_owner, rec->rm_startblock + b,
 645					&done);
 646			if (error)
 647				return error;
 648			if (done)
 649				break;
 650		}
 651	}
 652
 653	return 0;
 654}
 655
 656/* Find the roots of the per-AG btrees described in btree_info. */
 657int
 658xrep_find_ag_btree_roots(
 659	struct xfs_scrub		*sc,
 660	struct xfs_buf			*agf_bp,
 661	struct xrep_find_ag_btree	*btree_info,
 662	struct xfs_buf			*agfl_bp)
 663{
 664	struct xfs_mount		*mp = sc->mp;
 665	struct xrep_findroot		ri;
 666	struct xrep_find_ag_btree	*fab;
 667	struct xfs_btree_cur		*cur;
 668	int				error;
 669
 670	ASSERT(xfs_buf_islocked(agf_bp));
 671	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
 672
 673	ri.sc = sc;
 674	ri.btree_info = btree_info;
 675	ri.agf = agf_bp->b_addr;
 676	ri.agfl_bp = agfl_bp;
 677	for (fab = btree_info; fab->buf_ops; fab++) {
 678		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
 679		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
 680		fab->root = NULLAGBLOCK;
 681		fab->height = 0;
 682	}
 683
 684	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 685	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
 686	xfs_btree_del_cursor(cur, error);
 687
 688	return error;
 689}
 690
 691#ifdef CONFIG_XFS_QUOTA
 692/* Update some quota flags in the superblock. */
 693void
 694xrep_update_qflags(
 695	struct xfs_scrub	*sc,
 696	unsigned int		clear_flags,
 697	unsigned int		set_flags)
 698{
 699	struct xfs_mount	*mp = sc->mp;
 700	struct xfs_buf		*bp;
 701
 702	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
 703	if ((mp->m_qflags & clear_flags) == 0 &&
 704	    (mp->m_qflags & set_flags) == set_flags)
 705		goto no_update;
 706
 707	mp->m_qflags &= ~clear_flags;
 708	mp->m_qflags |= set_flags;
 709
 710	spin_lock(&mp->m_sb_lock);
 711	mp->m_sb.sb_qflags &= ~clear_flags;
 712	mp->m_sb.sb_qflags |= set_flags;
 713	spin_unlock(&mp->m_sb_lock);
 714
 715	/*
 716	 * Update the quota flags in the ondisk superblock without touching
 717	 * the summary counters.  We have not quiesced inode chunk allocation,
 718	 * so we cannot coordinate with updates to the icount and ifree percpu
 719	 * counters.
 720	 */
 721	bp = xfs_trans_getsb(sc->tp);
 722	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
 723	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
 724	xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
 725
 726no_update:
 727	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
 728}
 729
 730/* Force a quotacheck the next time we mount. */
 731void
 732xrep_force_quotacheck(
 733	struct xfs_scrub	*sc,
 734	xfs_dqtype_t		type)
 735{
 736	uint			flag;
 737
 738	flag = xfs_quota_chkd_flag(type);
 739	if (!(flag & sc->mp->m_qflags))
 740		return;
 741
 742	xrep_update_qflags(sc, flag, 0);
 743}
 744
 745/*
 746 * Attach dquots to this inode, or schedule quotacheck to fix them.
 747 *
 748 * This function ensures that the appropriate dquots are attached to an inode.
 749 * We cannot allow the dquot code to allocate an on-disk dquot block here
 750 * because we're already in transaction context.  The on-disk dquot should
 751 * already exist anyway.  If the quota code signals corruption or missing quota
 752 * information, schedule quotacheck, which will repair corruptions in the quota
 753 * metadata.
 754 */
 755int
 756xrep_ino_dqattach(
 757	struct xfs_scrub	*sc)
 758{
 759	int			error;
 760
 761	ASSERT(sc->tp != NULL);
 762	ASSERT(sc->ip != NULL);
 763
 764	error = xfs_qm_dqattach(sc->ip);
 765	switch (error) {
 766	case -EFSBADCRC:
 767	case -EFSCORRUPTED:
 768	case -ENOENT:
 769		xfs_err_ratelimited(sc->mp,
 770"inode %llu repair encountered quota error %d, quotacheck forced.",
 771				(unsigned long long)sc->ip->i_ino, error);
 772		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
 773			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
 774		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
 775			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
 776		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
 777			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
 778		fallthrough;
 779	case -ESRCH:
 780		error = 0;
 781		break;
 782	default:
 783		break;
 784	}
 785
 786	return error;
 787}
 788#endif /* CONFIG_XFS_QUOTA */
 789
 790/*
 791 * Ensure that the inode being repaired is ready to handle a certain number of
 792 * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
 793 * being repaired and have joined it to the scrub transaction.
 794 */
 795int
 796xrep_ino_ensure_extent_count(
 797	struct xfs_scrub	*sc,
 798	int			whichfork,
 799	xfs_extnum_t		nextents)
 800{
 801	xfs_extnum_t		max_extents;
 802	bool			inode_has_nrext64;
 803
 804	inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
 805	max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
 806	if (nextents <= max_extents)
 807		return 0;
 808	if (inode_has_nrext64)
 809		return -EFSCORRUPTED;
 810	if (!xfs_has_large_extent_counts(sc->mp))
 811		return -EFSCORRUPTED;
 812
 813	max_extents = xfs_iext_max_nextents(true, whichfork);
 814	if (nextents > max_extents)
 815		return -EFSCORRUPTED;
 816
 817	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
 818	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
 819	return 0;
 820}
 821
 822/*
 823 * Initialize all the btree cursors for an AG repair except for the btree that
 824 * we're rebuilding.
 825 */
 826void
 827xrep_ag_btcur_init(
 828	struct xfs_scrub	*sc,
 829	struct xchk_ag		*sa)
 830{
 831	struct xfs_mount	*mp = sc->mp;
 832
 833	/* Set up a bnobt cursor for cross-referencing. */
 834	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
 835	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
 836		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
 837				sc->sa.pag);
 838		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
 839				sc->sa.pag);
 840	}
 841
  842	/* Set up an inobt cursor for cross-referencing. */
 843	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
 844	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
 845		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
 846				sa->agi_bp);
 847		if (xfs_has_finobt(mp))
 848			sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
 849					sc->tp, sa->agi_bp);
 850	}
 851
 852	/* Set up a rmapbt cursor for cross-referencing. */
 853	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
 854	    xfs_has_rmapbt(mp))
 855		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
 856				sc->sa.pag);
 857
 858	/* Set up a refcountbt cursor for cross-referencing. */
 859	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
 860	    xfs_has_reflink(mp))
 861		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 862				sa->agf_bp, sc->sa.pag);
 863}
 864
 865/*
 866 * Reinitialize the in-core AG state after a repair by rereading the AGF
 867 * buffer.  We had better get the same AGF buffer as the one that's attached
 868 * to the scrub context.
 869 */
 870int
 871xrep_reinit_pagf(
 872	struct xfs_scrub	*sc)
 873{
 874	struct xfs_perag	*pag = sc->sa.pag;
 875	struct xfs_buf		*bp;
 876	int			error;
 877
 878	ASSERT(pag);
 879	ASSERT(xfs_perag_initialised_agf(pag));
 880
 881	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
 882	error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
 883	if (error)
 884		return error;
 885
 886	if (bp != sc->sa.agf_bp) {
 887		ASSERT(bp == sc->sa.agf_bp);
 888		return -EFSCORRUPTED;
 889	}
 890
 891	return 0;
 892}
 893
 894/*
 895 * Reinitialize the in-core AG state after a repair by rereading the AGI
 896 * buffer.  We had better get the same AGI buffer as the one that's attached
 897 * to the scrub context.
 898 */
 899int
 900xrep_reinit_pagi(
 901	struct xfs_scrub	*sc)
 902{
 903	struct xfs_perag	*pag = sc->sa.pag;
 904	struct xfs_buf		*bp;
 905	int			error;
 906
 907	ASSERT(pag);
 908	ASSERT(xfs_perag_initialised_agi(pag));
 909
 910	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
 911	error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
 912	if (error)
 913		return error;
 914
 915	if (bp != sc->sa.agi_bp) {
 916		ASSERT(bp == sc->sa.agi_bp);
 917		return -EFSCORRUPTED;
 918	}
 919
 920	return 0;
 921}
 922
 923/*
 924 * Given an active reference to a perag structure, load AG headers and cursors.
 925 * This should only be called to scan an AG while repairing file-based metadata.
 926 */
 927int
 928xrep_ag_init(
 929	struct xfs_scrub	*sc,
 930	struct xfs_perag	*pag,
 931	struct xchk_ag		*sa)
 932{
 933	int			error;
 934
 935	ASSERT(!sa->pag);
 936
 937	error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
 938	if (error)
 939		return error;
 940
 941	error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
 942	if (error)
 943		return error;
 944
 945	/* Grab our own passive reference from the caller's ref. */
 946	sa->pag = xfs_perag_hold(pag);
 947	xrep_ag_btcur_init(sc, sa);
 948	return 0;
 949}
 950
 951/* Reinitialize the per-AG block reservation for the AG we just fixed. */
 952int
 953xrep_reset_perag_resv(
 954	struct xfs_scrub	*sc)
 955{
 956	int			error;
 957
 958	if (!(sc->flags & XREP_RESET_PERAG_RESV))
 959		return 0;
 960
 961	ASSERT(sc->sa.pag != NULL);
 962	ASSERT(sc->ops->type == ST_PERAG);
 963	ASSERT(sc->tp);
 964
 965	sc->flags &= ~XREP_RESET_PERAG_RESV;
 966	error = xfs_ag_resv_free(sc->sa.pag);
 967	if (error)
 968		goto out;
 969	error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
 970	if (error == -ENOSPC) {
 971		xfs_err(sc->mp,
 972"Insufficient free space to reset per-AG reservation for AG %u after repair.",
 973				sc->sa.pag->pag_agno);
 974		error = 0;
 975	}
 976
 977out:
 978	return error;
 979}
 980
 981/* Decide if we are going to call the repair function for a scrub type. */
 982bool
 983xrep_will_attempt(
 984	struct xfs_scrub	*sc)
 985{
 986	/* Userspace asked us to rebuild the structure regardless. */
 987	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
 988		return true;
 989
 990	/* Let debug users force us into the repair routines. */
 991	if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
 992		return true;
 993
 994	/* Metadata is corrupt or failed cross-referencing. */
 995	if (xchk_needs_repair(sc->sm))
 996		return true;
 997
 998	return false;
 999}
1000
1001/* Try to fix some part of a metadata inode by calling another scrubber. */
1002STATIC int
1003xrep_metadata_inode_subtype(
1004	struct xfs_scrub	*sc,
1005	unsigned int		scrub_type)
1006{
1007	__u32			smtype = sc->sm->sm_type;
1008	__u32			smflags = sc->sm->sm_flags;
1009	unsigned int		sick_mask = sc->sick_mask;
1010	int			error;
1011
1012	/*
1013	 * Let's see if the inode needs repair.  We're going to open-code calls
1014	 * to the scrub and repair functions so that we can hang on to the
1015	 * resources that we already acquired instead of using the standard
1016	 * setup/teardown routines.
1017	 */
1018	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
1019	sc->sm->sm_type = scrub_type;
1020
1021	switch (scrub_type) {
1022	case XFS_SCRUB_TYPE_INODE:
1023		error = xchk_inode(sc);
1024		break;
1025	case XFS_SCRUB_TYPE_BMBTD:
1026		error = xchk_bmap_data(sc);
1027		break;
1028	case XFS_SCRUB_TYPE_BMBTA:
1029		error = xchk_bmap_attr(sc);
1030		break;
1031	default:
1032		ASSERT(0);
1033		error = -EFSCORRUPTED;
1034	}
1035	if (error)
1036		goto out;
1037
1038	if (!xrep_will_attempt(sc))
1039		goto out;
1040
1041	/*
1042	 * Repair some part of the inode.  This will potentially join the inode
1043	 * to the transaction.
1044	 */
1045	switch (scrub_type) {
1046	case XFS_SCRUB_TYPE_INODE:
1047		error = xrep_inode(sc);
1048		break;
1049	case XFS_SCRUB_TYPE_BMBTD:
1050		error = xrep_bmap(sc, XFS_DATA_FORK, false);
1051		break;
1052	case XFS_SCRUB_TYPE_BMBTA:
1053		error = xrep_bmap(sc, XFS_ATTR_FORK, false);
1054		break;
1055	}
1056	if (error)
1057		goto out;
1058
1059	/*
1060	 * Finish all deferred intent items and then roll the transaction so
1061	 * that the inode will not be joined to the transaction when we exit
1062	 * the function.
1063	 */
1064	error = xfs_defer_finish(&sc->tp);
1065	if (error)
1066		goto out;
1067	error = xfs_trans_roll(&sc->tp);
1068	if (error)
1069		goto out;
1070
1071	/*
1072	 * Clear the corruption flags and re-check the metadata that we just
1073	 * repaired.
1074	 */
1075	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
1076
1077	switch (scrub_type) {
1078	case XFS_SCRUB_TYPE_INODE:
1079		error = xchk_inode(sc);
1080		break;
1081	case XFS_SCRUB_TYPE_BMBTD:
1082		error = xchk_bmap_data(sc);
1083		break;
1084	case XFS_SCRUB_TYPE_BMBTA:
1085		error = xchk_bmap_attr(sc);
1086		break;
1087	}
1088	if (error)
1089		goto out;
1090
1091	/* If corruption persists, the repair has failed. */
1092	if (xchk_needs_repair(sc->sm)) {
1093		error = -EFSCORRUPTED;
1094		goto out;
1095	}
1096out:
1097	sc->sick_mask = sick_mask;
1098	sc->sm->sm_type = smtype;
1099	sc->sm->sm_flags = smflags;
1100	return error;
1101}
1102
1103/*
1104 * Repair the ondisk forks of a metadata inode.  The caller must ensure that
1105 * sc->ip points to the metadata inode and the ILOCK is held on that inode.
1106 * The inode must not be joined to the transaction before the call, and will
1107 * not be afterwards.
1108 */
1109int
1110xrep_metadata_inode_forks(
1111	struct xfs_scrub	*sc)
1112{
1113	bool			dirty = false;
1114	int			error;
1115
1116	/* Repair the inode record and the data fork. */
1117	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1118	if (error)
1119		return error;
1120
1121	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1122	if (error)
1123		return error;
1124
1125	/* Make sure the attr fork looks ok before we delete it. */
1126	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
1127	if (error)
1128		return error;
1129
1130	/* Clear the reflink flag since metadata never shares. */
1131	if (xfs_is_reflink_inode(sc->ip)) {
1132		dirty = true;
1133		xfs_trans_ijoin(sc->tp, sc->ip, 0);
1134		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1135		if (error)
1136			return error;
1137	}
1138
1139	/*
1140	 * If we modified the inode, roll the transaction but don't rejoin the
1141	 * inode to the new transaction because xrep_bmap_data can do that.
1142	 */
1143	if (dirty) {
1144		error = xfs_trans_roll(&sc->tp);
1145		if (error)
1146			return error;
1147		dirty = false;
1148	}
1149
1150	return 0;
1151}
1152
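/*
 * Illustrative sketch (editor's addition, not from the kernel sources): a
 * repair function for a metadata file (for example one of the realtime
 * inodes) would typically start by fixing the inode core and forks with the
 * helper above, then rebuild the file contents.  The rebuild step is left
 * out of this sketch.
 */
static inline int
xrep_metadata_file_sketch(
	struct xfs_scrub	*sc)
{
	int			error;

	/* sc->ip is the metadata inode, ILOCKed but not joined to sc->tp. */
	error = xrep_metadata_inode_forks(sc);
	if (error)
		return error;

	/* ...now rebuild the file's contents (omitted here). */
	return 0;
}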
1153/*
1154 * Set up an in-memory buffer cache so that we can use the xfbtree.  Allocating
 1155 * a shmem file might take locks, so we cannot be in transaction context.  Park
1156 * our resources in the scrub context and let the teardown function take care
1157 * of them at the right time.
1158 */
1159int
1160xrep_setup_xfbtree(
1161	struct xfs_scrub	*sc,
1162	const char		*descr)
1163{
1164	ASSERT(sc->tp == NULL);
1165
1166	return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
1167}
1168
1169/*
1170 * Create a dummy transaction for use in a live update hook function.  This
1171 * function MUST NOT be called from regular repair code because the current
1172 * process' transaction is saved via the cookie.
1173 */
1174int
1175xrep_trans_alloc_hook_dummy(
1176	struct xfs_mount	*mp,
1177	void			**cookiep,
1178	struct xfs_trans	**tpp)
1179{
1180	int			error;
1181
1182	*cookiep = current->journal_info;
1183	current->journal_info = NULL;
1184
1185	error = xfs_trans_alloc_empty(mp, tpp);
1186	if (!error)
1187		return 0;
1188
1189	current->journal_info = *cookiep;
1190	*cookiep = NULL;
1191	return error;
1192}
1193
1194/* Cancel a dummy transaction used by a live update hook function. */
1195void
1196xrep_trans_cancel_hook_dummy(
1197	void			**cookiep,
1198	struct xfs_trans	*tp)
1199{
1200	xfs_trans_cancel(tp);
1201	current->journal_info = *cookiep;
1202	*cookiep = NULL;
1203}
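/*
 * Illustrative sketch (editor's addition, not from the kernel sources): a
 * live update hook that needs empty transaction context would pair the two
 * helpers above roughly like this.  The hook's actual work is omitted.
 */
static inline int
xrep_hook_with_dummy_trans_sketch(
	struct xfs_mount	*mp)
{
	struct xfs_trans	*tp;
	void			*cookie;
	int			error;

	error = xrep_trans_alloc_hook_dummy(mp, &cookie, &tp);
	if (error)
		return error;

	/* ...do the hook's bookkeeping with tp here... */

	xrep_trans_cancel_hook_dummy(&cookie, tp);
	return 0;
}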
v6.8
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
   4 * Author: Darrick J. Wong <djwong@kernel.org>
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_trans_resv.h"
  11#include "xfs_mount.h"
  12#include "xfs_btree.h"
  13#include "xfs_log_format.h"
  14#include "xfs_trans.h"
  15#include "xfs_sb.h"
  16#include "xfs_inode.h"
  17#include "xfs_alloc.h"
  18#include "xfs_alloc_btree.h"
  19#include "xfs_ialloc.h"
  20#include "xfs_ialloc_btree.h"
  21#include "xfs_rmap.h"
  22#include "xfs_rmap_btree.h"
  23#include "xfs_refcount_btree.h"
  24#include "xfs_extent_busy.h"
  25#include "xfs_ag.h"
  26#include "xfs_ag_resv.h"
  27#include "xfs_quota.h"
  28#include "xfs_qm.h"
  29#include "xfs_defer.h"
  30#include "xfs_errortag.h"
  31#include "xfs_error.h"
  32#include "xfs_reflink.h"
 
 
  33#include "scrub/scrub.h"
  34#include "scrub/common.h"
  35#include "scrub/trace.h"
  36#include "scrub/repair.h"
  37#include "scrub/bitmap.h"
  38#include "scrub/stats.h"
 
  39
  40/*
  41 * Attempt to repair some metadata, if the metadata is corrupt and userspace
  42 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
  43 * and will set *fixed to true if it thinks it repaired anything.
  44 */
  45int
  46xrep_attempt(
  47	struct xfs_scrub	*sc,
  48	struct xchk_stats_run	*run)
  49{
  50	u64			repair_start;
  51	int			error = 0;
  52
  53	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);
  54
  55	xchk_ag_btcur_free(&sc->sa);
  56
  57	/* Repair whatever's broken. */
  58	ASSERT(sc->ops->repair);
  59	run->repair_attempted = true;
  60	repair_start = xchk_stats_now();
  61	error = sc->ops->repair(sc);
  62	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
  63	run->repair_ns += xchk_stats_elapsed_ns(repair_start);
  64	switch (error) {
  65	case 0:
  66		/*
  67		 * Repair succeeded.  Commit the fixes and perform a second
  68		 * scrub so that we can tell userspace if we fixed the problem.
  69		 */
  70		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
  71		sc->flags |= XREP_ALREADY_FIXED;
  72		run->repair_succeeded = true;
  73		return -EAGAIN;
  74	case -ECHRNG:
  75		sc->flags |= XCHK_NEED_DRAIN;
  76		run->retries++;
  77		return -EAGAIN;
  78	case -EDEADLOCK:
  79		/* Tell the caller to try again having grabbed all the locks. */
  80		if (!(sc->flags & XCHK_TRY_HARDER)) {
  81			sc->flags |= XCHK_TRY_HARDER;
  82			run->retries++;
  83			return -EAGAIN;
  84		}
  85		/*
  86		 * We tried harder but still couldn't grab all the resources
  87		 * we needed to fix it.  The corruption has not been fixed,
  88		 * so exit to userspace with the scan's output flags unchanged.
  89		 */
  90		return 0;
  91	default:
  92		/*
  93		 * EAGAIN tells the caller to re-scrub, so we cannot return
  94		 * that here.
  95		 */
  96		ASSERT(error != -EAGAIN);
  97		return error;
  98	}
  99}
 100
 101/*
 102 * Complain about unfixable problems in the filesystem.  We don't log
 103 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 104 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 105 * administrator isn't running xfs_scrub in no-repairs mode.
 106 *
 107 * Use this helper function because _ratelimited silently declares a static
 108 * structure to track rate limiting information.
 109 */
 110void
 111xrep_failure(
 112	struct xfs_mount	*mp)
 113{
 114	xfs_alert_ratelimited(mp,
 115"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
 116}
 117
 118/*
 119 * Repair probe -- userspace uses this to probe if we're willing to repair a
 120 * given mountpoint.
 121 */
 122int
 123xrep_probe(
 124	struct xfs_scrub	*sc)
 125{
 126	int			error = 0;
 127
 128	if (xchk_should_terminate(sc, &error))
 129		return error;
 130
 131	return 0;
 132}
 133
 134/*
 135 * Roll a transaction, keeping the AG headers locked and reinitializing
 136 * the btree cursors.
 137 */
 138int
 139xrep_roll_ag_trans(
 140	struct xfs_scrub	*sc)
 141{
 142	int			error;
 143
 144	/*
 145	 * Keep the AG header buffers locked while we roll the transaction.
 146	 * Ensure that both AG buffers are dirty and held when we roll the
 147	 * transaction so that they move forward in the log without losing the
 148	 * bli (and hence the bli type) when the transaction commits.
 149	 *
 150	 * Normal code would never hold clean buffers across a roll, but repair
 151	 * needs both buffers to maintain a total lock on the AG.
 152	 */
 153	if (sc->sa.agi_bp) {
 154		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
 155		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 156	}
 157
 158	if (sc->sa.agf_bp) {
 159		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
 160		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 161	}
 162
 163	/*
 164	 * Roll the transaction.  We still hold the AG header buffers locked
 165	 * regardless of whether or not that succeeds.  On failure, the buffers
 166	 * will be released during teardown on our way out of the kernel.  If
 167	 * successful, join the buffers to the new transaction and move on.
 168	 */
 169	error = xfs_trans_roll(&sc->tp);
 170	if (error)
 171		return error;
 172
 173	/* Join the AG headers to the new transaction. */
 174	if (sc->sa.agi_bp)
 175		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
 176	if (sc->sa.agf_bp)
 177		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
 178
 179	return 0;
 180}
 181
 182/* Roll the scrub transaction, holding the primary metadata locked. */
 183int
 184xrep_roll_trans(
 185	struct xfs_scrub	*sc)
 186{
 187	if (!sc->ip)
 188		return xrep_roll_ag_trans(sc);
 189	return xfs_trans_roll_inode(&sc->tp, sc->ip);
 190}
 191
 192/* Finish all deferred work attached to the repair transaction. */
 193int
 194xrep_defer_finish(
 195	struct xfs_scrub	*sc)
 196{
 197	int			error;
 198
 199	/*
 200	 * Keep the AG header buffers locked while we complete deferred work
 201	 * items.  Ensure that both AG buffers are dirty and held when we roll
 202	 * the transaction so that they move forward in the log without losing
 203	 * the bli (and hence the bli type) when the transaction commits.
 204	 *
 205	 * Normal code would never hold clean buffers across a roll, but repair
 206	 * needs both buffers to maintain a total lock on the AG.
 207	 */
 208	if (sc->sa.agi_bp) {
 209		xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp, XFS_AGI_MAGICNUM);
 210		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 211	}
 212
 213	if (sc->sa.agf_bp) {
 214		xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_MAGICNUM);
 215		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 216	}
 217
 218	/*
 219	 * Finish all deferred work items.  We still hold the AG header buffers
 220	 * locked regardless of whether or not that succeeds.  On failure, the
 221	 * buffers will be released during teardown on our way out of the
 222	 * kernel.  If successful, join the buffers to the new transaction
 223	 * and move on.
 224	 */
 225	error = xfs_defer_finish(&sc->tp);
 226	if (error)
 227		return error;
 228
 229	/*
 230	 * Release the hold that we set above because defer_finish won't do
 231	 * that for us.  The defer roll code redirties held buffers after each
 232	 * roll, so the AG header buffers should be ready for logging.
 233	 */
 234	if (sc->sa.agi_bp)
 235		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
 236	if (sc->sa.agf_bp)
 237		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
 238
 239	return 0;
 240}
 241
 242/*
 243 * Does the given AG have enough space to rebuild a btree?  Neither AG
 244 * reservation can be critical, and we must have enough space (factoring
 245 * in AG reservations) to construct a whole btree.
 246 */
 247bool
 248xrep_ag_has_space(
 249	struct xfs_perag	*pag,
 250	xfs_extlen_t		nr_blocks,
 251	enum xfs_ag_resv_type	type)
 252{
 253	return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
 254		!xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
 255		pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
 256}
 257
 258/*
 259 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 260 * worst case estimate for the number of blocks we'd need to rebuild one of
 261 * any type of per-AG btree.
 262 */
 263xfs_extlen_t
 264xrep_calc_ag_resblks(
 265	struct xfs_scrub		*sc)
 266{
 267	struct xfs_mount		*mp = sc->mp;
 268	struct xfs_scrub_metadata	*sm = sc->sm;
 269	struct xfs_perag		*pag;
 270	struct xfs_buf			*bp;
 271	xfs_agino_t			icount = NULLAGINO;
 272	xfs_extlen_t			aglen = NULLAGBLOCK;
 273	xfs_extlen_t			usedlen;
 274	xfs_extlen_t			freelen;
 275	xfs_extlen_t			bnobt_sz;
 276	xfs_extlen_t			inobt_sz;
 277	xfs_extlen_t			rmapbt_sz;
 278	xfs_extlen_t			refcbt_sz;
 279	int				error;
 280
 281	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 282		return 0;
 283
 284	pag = xfs_perag_get(mp, sm->sm_agno);
 285	if (xfs_perag_initialised_agi(pag)) {
 286		/* Use in-core icount if possible. */
 287		icount = pag->pagi_count;
 288	} else {
 289		/* Try to get the actual counters from disk. */
 290		error = xfs_ialloc_read_agi(pag, NULL, &bp);
 291		if (!error) {
 292			icount = pag->pagi_count;
 293			xfs_buf_relse(bp);
 294		}
 295	}
 296
 297	/* Now grab the block counters from the AGF. */
 298	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
 299	if (error) {
 300		aglen = pag->block_count;
 301		freelen = aglen;
 302		usedlen = aglen;
 303	} else {
 304		struct xfs_agf	*agf = bp->b_addr;
 305
 306		aglen = be32_to_cpu(agf->agf_length);
 307		freelen = be32_to_cpu(agf->agf_freeblks);
 308		usedlen = aglen - freelen;
 309		xfs_buf_relse(bp);
 310	}
 311
 312	/* If the icount is impossible, make some worst-case assumptions. */
 313	if (icount == NULLAGINO ||
 314	    !xfs_verify_agino(pag, icount)) {
 315		icount = pag->agino_max - pag->agino_min + 1;
 316	}
 317
 318	/* If the block counts are impossible, make worst-case assumptions. */
 319	if (aglen == NULLAGBLOCK ||
 320	    aglen != pag->block_count ||
 321	    freelen >= aglen) {
 322		aglen = pag->block_count;
 323		freelen = aglen;
 324		usedlen = aglen;
 325	}
 326	xfs_perag_put(pag);
 327
 328	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
 329			freelen, usedlen);
 330
 331	/*
 332	 * Figure out how many blocks we'd need worst case to rebuild
 333	 * each type of btree.  Note that we can only rebuild the
 334	 * bnobt/cntbt or inobt/finobt as pairs.
 335	 */
 336	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
 337	if (xfs_has_sparseinodes(mp))
 338		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 339				XFS_INODES_PER_HOLEMASK_BIT);
 340	else
 341		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 342				XFS_INODES_PER_CHUNK);
 343	if (xfs_has_finobt(mp))
 344		inobt_sz *= 2;
 345	if (xfs_has_reflink(mp))
 346		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
 347	else
 348		refcbt_sz = 0;
 349	if (xfs_has_rmapbt(mp)) {
 350		/*
 351		 * Guess how many blocks we need to rebuild the rmapbt.
 352		 * For non-reflink filesystems we can't have more records than
 353		 * used blocks.  However, with reflink it's possible to have
 354		 * more than one rmap record per AG block.  We don't know how
 355		 * many rmaps there could be in the AG, so we start off with
 356		 * what we hope is an generous over-estimation.
 357		 */
 358		if (xfs_has_reflink(mp))
 359			rmapbt_sz = xfs_rmapbt_calc_size(mp,
 360					(unsigned long long)aglen * 2);
 361		else
 362			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
 363	} else {
 364		rmapbt_sz = 0;
 365	}
 366
 367	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
 368			inobt_sz, rmapbt_sz, refcbt_sz);
 369
 370	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 371}
 372
 373/*
 374 * Reconstructing per-AG Btrees
 375 *
 376 * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 377 * we scan secondary space metadata to derive the records that should be in
 378 * the damaged btree, initialize a fresh btree root, and insert the records.
 379 * Note that for rebuilding the rmapbt we scan all the primary data to
 380 * generate the new records.
 381 *
 382 * However, that leaves the matter of removing all the metadata describing the
 383 * old broken structure.  For primary metadata we use the rmap data to collect
 384 * every extent with a matching rmap owner (bitmap); we then iterate all other
 385 * metadata structures with the same rmap owner to collect the extents that
 386 * cannot be removed (sublist).  We then subtract sublist from bitmap to
 387 * derive the blocks that were used by the old btree.  These blocks can be
 388 * reaped.
 389 *
 390 * For rmapbt reconstructions we must use different tactics for extent
 391 * collection.  First we iterate all primary metadata (this excludes the old
 392 * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 393 * records are collected as bitmap.  The bnobt records are collected as
 394 * sublist.  As with the other btrees we subtract sublist from bitmap, and the
 395 * result (since the rmapbt lives in the free space) are the blocks from the
 396 * old rmapbt.
 397 */
 398
 399/* Ensure the freelist is the correct size. */
 400int
 401xrep_fix_freelist(
 402	struct xfs_scrub	*sc,
 403	bool			can_shrink)
 404{
 405	struct xfs_alloc_arg	args = {0};
 406
 407	args.mp = sc->mp;
 408	args.tp = sc->tp;
 409	args.agno = sc->sa.pag->pag_agno;
 410	args.alignment = 1;
 411	args.pag = sc->sa.pag;
 412
 413	return xfs_alloc_fix_freelist(&args,
 414			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
 415}
 416
 417/*
 418 * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 419 *
 420 * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 421 * the AG headers by using the rmap data to rummage through the AG looking for
 422 * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 423 * or the rmap data are corrupt.
 424 *
 425 * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
 426 * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 427 * AGI is being rebuilt.  It must maintain these locks until it's safe for
 428 * other threads to change the btrees' shapes.  The caller provides
 429 * information about the btrees to look for by passing in an array of
 430 * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 431 * The (root, height) fields will be set on return if anything is found.  The
 432 * last element of the array should have a NULL buf_ops to mark the end of the
 433 * array.
 434 *
 435 * For every rmapbt record matching any of the rmap owners in btree_info,
 436 * read each block referenced by the rmap record.  If the block is a btree
 437 * block from this filesystem matching any of the magic numbers and has a
 438 * level higher than what we've already seen, remember the block and the
 439 * height of the tree required to have such a block.  When the call completes,
 440 * we return the highest block we've found for each btree description; those
 441 * should be the roots.
 442 */
 443
 444struct xrep_findroot {
 445	struct xfs_scrub		*sc;
 446	struct xfs_buf			*agfl_bp;
 447	struct xfs_agf			*agf;
 448	struct xrep_find_ag_btree	*btree_info;
 449};
 450
 451/* See if our block is in the AGFL. */
 452STATIC int
 453xrep_findroot_agfl_walk(
 454	struct xfs_mount	*mp,
 455	xfs_agblock_t		bno,
 456	void			*priv)
 457{
 458	xfs_agblock_t		*agbno = priv;
 459
 460	return (*agbno == bno) ? -ECANCELED : 0;
 461}
 462
 463/* Does this block match the btree information passed in? */
 464STATIC int
 465xrep_findroot_block(
 466	struct xrep_findroot		*ri,
 467	struct xrep_find_ag_btree	*fab,
 468	uint64_t			owner,
 469	xfs_agblock_t			agbno,
 470	bool				*done_with_block)
 471{
 472	struct xfs_mount		*mp = ri->sc->mp;
 473	struct xfs_buf			*bp;
 474	struct xfs_btree_block		*btblock;
 475	xfs_daddr_t			daddr;
 476	int				block_level;
 477	int				error = 0;
 478
 479	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);
 480
 481	/*
 482	 * Blocks in the AGFL have stale contents that might just happen to
 483	 * have a matching magic and uuid.  We don't want to pull these blocks
 484	 * in as part of a tree root, so we have to filter out the AGFL stuff
 485	 * here.  If the AGFL looks insane we'll just refuse to repair.
 486	 */
 487	if (owner == XFS_RMAP_OWN_AG) {
 488		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 489				xrep_findroot_agfl_walk, &agbno);
 490		if (error == -ECANCELED)
 491			return 0;
 492		if (error)
 493			return error;
 494	}
 495
 496	/*
 497	 * Read the buffer into memory so that we can see if it's a match for
 498	 * our btree type.  We have no clue if it is beforehand, and we want to
 499	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
 500	 * will cause needless disk reads in subsequent calls to this function)
 501	 * and logging metadata verifier failures.
 502	 *
 503	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
 504	 * memory from some other caller it will already have b_ops assigned.
 505	 * If it was in memory from a previous unsuccessful findroot_block
 506	 * call, the buffer won't have b_ops but it should be clean and ready
 507	 * for us to try to verify if the read call succeeds.  The same applies
 508	 * if the buffer wasn't in memory at all.
 509	 *
 510	 * Note: If we never match a btree type with this buffer, it will be
 511	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
 512	 * the buffer gets written.
 513	 */
 514	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
 515			mp->m_bsize, 0, &bp, NULL);
 516	if (error)
 517		return error;
 518
 519	/* Ensure the block magic matches the btree type we're looking for. */
 520	btblock = XFS_BUF_TO_BLOCK(bp);
 521	ASSERT(fab->buf_ops->magic[1] != 0);
 522	if (btblock->bb_magic != fab->buf_ops->magic[1])
 523		goto out;
 524
 525	/*
 526	 * If the buffer already has ops applied and they're not the ones for
 527	 * this btree type, we know this block doesn't match the btree and we
 528	 * can bail out.
 529	 *
 530	 * If the buffer ops match ours, someone else has already validated
 531	 * the block for us, so we can move on to checking if this is a root
 532	 * block candidate.
 533	 *
 534	 * If the buffer does not have ops, nobody has successfully validated
 535	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
 536	 * and structure match this btree type then we'll move on to checking
 537	 * if it's a root block candidate.  If there is no match, bail out.
 538	 */
 539	if (bp->b_ops) {
 540		if (bp->b_ops != fab->buf_ops)
 541			goto out;
 542	} else {
 543		ASSERT(!xfs_trans_buf_is_dirty(bp));
 544		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
 545				&mp->m_sb.sb_meta_uuid))
 546			goto out;
 547		/*
 548		 * Read verifiers can reference b_ops, so we set the pointer
 549		 * here.  If the verifier fails we'll reset the buffer state
 550		 * to what it was before we touched the buffer.
 551		 */
 552		bp->b_ops = fab->buf_ops;
 553		fab->buf_ops->verify_read(bp);
 554		if (bp->b_error) {
 555			bp->b_ops = NULL;
 556			bp->b_error = 0;
 557			goto out;
 558		}
 559
 560		/*
 561		 * Some read verifiers will (re)set b_ops, so we must be
 562		 * careful not to change b_ops after running the verifier.
 563		 */
 564	}
 565
 566	/*
 567	 * This block passes the magic/uuid and verifier tests for this btree
 568	 * type.  We don't need the caller to try the other tree types.
 569	 */
 570	*done_with_block = true;
 571
 572	/*
 573	 * Compare this btree block's level to the height of the current
 574	 * candidate root block.
 575	 *
 576	 * If the level matches the root we found previously, throw away both
 577	 * blocks because there can't be two candidate roots.
 578	 *
 579	 * If level is lower in the tree than the root we found previously,
 580	 * ignore this block.
 581	 */
 582	block_level = xfs_btree_get_level(btblock);
 583	if (block_level + 1 == fab->height) {
 584		fab->root = NULLAGBLOCK;
 585		goto out;
 586	} else if (block_level < fab->height) {
 587		goto out;
 588	}
 589
 590	/*
 591	 * This is the highest block in the tree that we've found so far.
 592	 * Update the btree height to reflect what we've learned from this
 593	 * block.
 594	 */
 595	fab->height = block_level + 1;
 596
 597	/*
 598	 * If this block doesn't have sibling pointers, then it's the new root
 599	 * block candidate.  Otherwise, the root will be found farther up the
 600	 * tree.
 601	 */
 602	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
 603	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
 604		fab->root = agbno;
 605	else
 606		fab->root = NULLAGBLOCK;
 607
 608	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
 609			be32_to_cpu(btblock->bb_magic), fab->height - 1);
 610out:
 611	xfs_trans_brelse(ri->sc->tp, bp);
 612	return error;
 613}
 614
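/*
 * Editorial note (not part of the kernel source): as a worked example of the
 * level logic above, suppose the rmap walk hands us blocks of levels 0, 1,
 * and 2 in that order.  fab->height climbs 1 -> 2 -> 3, and only the level-2
 * block whose sibling pointers are both NULLAGBLOCK is kept as fab->root.
 * A second level-2 block would reset fab->root to NULLAGBLOCK, since two
 * candidate roots cannot both be correct.
 */
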
 615/*
 616 * Do any of the blocks in this rmap record match one of the btrees we're
 617 * looking for?
 618 */
 619STATIC int
 620xrep_findroot_rmap(
 621	struct xfs_btree_cur		*cur,
 622	const struct xfs_rmap_irec	*rec,
 623	void				*priv)
 624{
 625	struct xrep_findroot		*ri = priv;
 626	struct xrep_find_ag_btree	*fab;
 627	xfs_agblock_t			b;
 628	bool				done;
 629	int				error = 0;
 630
 631	/* Ignore anything that isn't AG metadata. */
 632	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
 633		return 0;
 634
 635	/* Otherwise scan each block + btree type. */
 636	for (b = 0; b < rec->rm_blockcount; b++) {
 637		done = false;
 638		for (fab = ri->btree_info; fab->buf_ops; fab++) {
 639			if (rec->rm_owner != fab->rmap_owner)
 640				continue;
 641			error = xrep_findroot_block(ri, fab,
 642					rec->rm_owner, rec->rm_startblock + b,
 643					&done);
 644			if (error)
 645				return error;
 646			if (done)
 647				break;
 648		}
 649	}
 650
 651	return 0;
 652}
 653
 654/* Find the roots of the per-AG btrees described in btree_info. */
 655int
 656xrep_find_ag_btree_roots(
 657	struct xfs_scrub		*sc,
 658	struct xfs_buf			*agf_bp,
 659	struct xrep_find_ag_btree	*btree_info,
 660	struct xfs_buf			*agfl_bp)
 661{
 662	struct xfs_mount		*mp = sc->mp;
 663	struct xrep_findroot		ri;
 664	struct xrep_find_ag_btree	*fab;
 665	struct xfs_btree_cur		*cur;
 666	int				error;
 667
 668	ASSERT(xfs_buf_islocked(agf_bp));
 669	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
 670
 671	ri.sc = sc;
 672	ri.btree_info = btree_info;
 673	ri.agf = agf_bp->b_addr;
 674	ri.agfl_bp = agfl_bp;
 675	for (fab = btree_info; fab->buf_ops; fab++) {
 676		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
 677		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
 678		fab->root = NULLAGBLOCK;
 679		fab->height = 0;
 680	}
 681
 682	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 683	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
 684	xfs_btree_del_cursor(cur, error);
 685
 686	return error;
 687}
 688
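/*
 * Editorial sketch (not part of the kernel source): a caller that wants to
 * locate the roots of the two free space btrees might describe them with a
 * xrep_find_ag_btree array like the one below.  The array is terminated by
 * an entry with NULL buf_ops, which is what the fab loops above key off.
 * The bnobt/cntbt buffer ops names are assumptions here, not taken from
 * this file.
 */
#if 0
static int
example_find_allocbt_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xfs_buf			*agfl_bp)
{
	struct xrep_find_ag_btree	fab[] = {
		{
			.rmap_owner	= XFS_RMAP_OWN_AG,
			.buf_ops	= &xfs_bnobt_buf_ops,
		},
		{
			.rmap_owner	= XFS_RMAP_OWN_AG,
			.buf_ops	= &xfs_cntbt_buf_ops,
		},
		{
			.buf_ops	= NULL,
		},
	};
	int				error;

	error = xrep_find_ag_btree_roots(sc, agf_bp, fab, agfl_bp);
	if (error)
		return error;

	/* Each fab[i].root is now a candidate root block or NULLAGBLOCK. */
	return 0;
}
#endif
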
 689#ifdef CONFIG_XFS_QUOTA
 690/* Force a quotacheck the next time we mount. */
 691void
 692xrep_force_quotacheck(
 693	struct xfs_scrub	*sc,
 694	xfs_dqtype_t		type)
 695{
 696	uint			flag;
 697
 698	flag = xfs_quota_chkd_flag(type);
 699	if (!(flag & sc->mp->m_qflags))
 700		return;
 701
 702	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
 703	sc->mp->m_qflags &= ~flag;
 704	spin_lock(&sc->mp->m_sb_lock);
 705	sc->mp->m_sb.sb_qflags &= ~flag;
 706	spin_unlock(&sc->mp->m_sb_lock);
 707	xfs_log_sb(sc->tp);
 708	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
 709}
 710
 711/*
 712 * Attach dquots to this inode, or schedule quotacheck to fix them.
 713 *
 714 * This function ensures that the appropriate dquots are attached to an inode.
 715 * We cannot allow the dquot code to allocate an on-disk dquot block here
 716 * because we're already in transaction context.  The on-disk dquot should
 717 * already exist anyway.  If the quota code signals corruption or missing quota
 718 * information, schedule quotacheck, which will repair corruptions in the quota
 719 * metadata.
 720 */
 721int
 722xrep_ino_dqattach(
 723	struct xfs_scrub	*sc)
 724{
 725	int			error;
 726
 727	ASSERT(sc->tp != NULL);
 728	ASSERT(sc->ip != NULL);
 729
 730	error = xfs_qm_dqattach(sc->ip);
 731	switch (error) {
 732	case -EFSBADCRC:
 733	case -EFSCORRUPTED:
 734	case -ENOENT:
 735		xfs_err_ratelimited(sc->mp,
 736"inode %llu repair encountered quota error %d, quotacheck forced.",
 737				(unsigned long long)sc->ip->i_ino, error);
 738		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
 739			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
 740		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
 741			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
 742		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
 743			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
 744		fallthrough;
 745	case -ESRCH:
 746		error = 0;
 747		break;
 748	default:
 749		break;
 750	}
 751
 752	return error;
 753}
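
/*
 * Editorial sketch (not part of the kernel source): an inode repair that
 * already holds a transaction and the ILOCK can (re)attach dquots like
 * this; quota problems degrade into a deferred quotacheck instead of
 * failing the repair.
 */
#if 0
	error = xrep_ino_dqattach(sc);
	if (error)
		return error;
#endif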
 754#endif /* CONFIG_XFS_QUOTA */
 755
 756/*
 757 * Ensure that the inode being repaired is ready to handle a certain number of
 758 * extents, or return EFSCORRUPTED.  Caller must hold the ILOCK of the inode
 759 * being repaired and have joined it to the scrub transaction.
 760 */
 761int
 762xrep_ino_ensure_extent_count(
 763	struct xfs_scrub	*sc,
 764	int			whichfork,
 765	xfs_extnum_t		nextents)
 766{
 767	xfs_extnum_t		max_extents;
 768	bool			inode_has_nrext64;
 769
 770	inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
 771	max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
 772	if (nextents <= max_extents)
 773		return 0;
 774	if (inode_has_nrext64)
 775		return -EFSCORRUPTED;
 776	if (!xfs_has_large_extent_counts(sc->mp))
 777		return -EFSCORRUPTED;
 778
 779	max_extents = xfs_iext_max_nextents(true, whichfork);
 780	if (nextents > max_extents)
 781		return -EFSCORRUPTED;
 782
 783	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
 784	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
 785	return 0;
 786}
 787
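/*
 * Editorial sketch (not part of the kernel source): a fork rebuilder might
 * bound the number of mappings it is about to commit like this before
 * writing out a new extent map.  "nr_new_mappings" is a hypothetical count
 * gathered by the caller.
 */
#if 0
	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK,
			nr_new_mappings);
	if (error)
		return error;
#endif
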
 788/*
 789 * Initialize all the btree cursors for an AG repair except for the btree that
 790 * we're rebuilding.
 791 */
 792void
 793xrep_ag_btcur_init(
 794	struct xfs_scrub	*sc,
 795	struct xchk_ag		*sa)
 796{
 797	struct xfs_mount	*mp = sc->mp;
 798
 799	/* Set up a bnobt cursor for cross-referencing. */
 800	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
 801	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
 802		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 803				sc->sa.pag, XFS_BTNUM_BNO);
 804		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
 805				sc->sa.pag, XFS_BTNUM_CNT);
 806	}
 807
 808	/* Set up an inobt cursor for cross-referencing. */
 809	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
 810	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
 811		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
 812				sa->agi_bp, XFS_BTNUM_INO);
 813		if (xfs_has_finobt(mp))
 814			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
 815					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
 816	}
 817
 818	/* Set up a rmapbt cursor for cross-referencing. */
 819	if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
 820	    xfs_has_rmapbt(mp))
 821		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
 822				sc->sa.pag);
 823
 824	/* Set up a refcountbt cursor for cross-referencing. */
 825	if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
 826	    xfs_has_reflink(mp))
 827		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
 828				sa->agf_bp, sc->sa.pag);
 829}
 830
 831/*
 832 * Reinitialize the in-core AG state after a repair by rereading the AGF
 833 * buffer.  We had better get the same AGF buffer as the one that's attached
 834 * to the scrub context.
 835 */
 836int
 837xrep_reinit_pagf(
 838	struct xfs_scrub	*sc)
 839{
 840	struct xfs_perag	*pag = sc->sa.pag;
 841	struct xfs_buf		*bp;
 842	int			error;
 843
 844	ASSERT(pag);
 845	ASSERT(xfs_perag_initialised_agf(pag));
 846
 847	clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
 848	error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
 849	if (error)
 850		return error;
 851
 852	if (bp != sc->sa.agf_bp) {
 853		ASSERT(bp == sc->sa.agf_bp);
 854		return -EFSCORRUPTED;
 855	}
 856
 857	return 0;
 858}
 859
 860/*
 861 * Reinitialize the in-core AG state after a repair by rereading the AGI
 862 * buffer.  We had better get the same AGI buffer as the one that's attached
 863 * to the scrub context.
 864 */
 865int
 866xrep_reinit_pagi(
 867	struct xfs_scrub	*sc)
 868{
 869	struct xfs_perag	*pag = sc->sa.pag;
 870	struct xfs_buf		*bp;
 871	int			error;
 872
 873	ASSERT(pag);
 874	ASSERT(xfs_perag_initialised_agi(pag));
 875
 876	clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
 877	error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
 878	if (error)
 879		return error;
 880
 881	if (bp != sc->sa.agi_bp) {
 882		ASSERT(bp == sc->sa.agi_bp);
 883		return -EFSCORRUPTED;
 884	}
 885
 886	return 0;
 887}
 888
 889/*
 890 * Given an active reference to a perag structure, load AG headers and cursors.
 891 * This should only be called to scan an AG while repairing file-based metadata.
 892 */
 893int
 894xrep_ag_init(
 895	struct xfs_scrub	*sc,
 896	struct xfs_perag	*pag,
 897	struct xchk_ag		*sa)
 898{
 899	int			error;
 900
 901	ASSERT(!sa->pag);
 902
 903	error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
 904	if (error)
 905		return error;
 906
 907	error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
 908	if (error)
 909		return error;
 910
 911	/* Grab our own passive reference from the caller's ref. */
 912	sa->pag = xfs_perag_hold(pag);
 913	xrep_ag_btcur_init(sc, sa);
 914	return 0;
 915}
 916
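/*
 * Editorial sketch (not part of the kernel source): a file-based repair that
 * must walk every AG might pair xrep_ag_init() with xchk_ag_free() roughly
 * like this.  The for_each_perag()/xfs_perag_rele() iterator pattern and
 * xchk_ag_free() are assumed from the rest of XFS; examine_ag() is a
 * hypothetical stand-in for the per-AG work the repair actually does.
 */
#if 0
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	int			error = 0;

	for_each_perag(sc->mp, agno, pag) {
		struct xchk_ag	sa = { };

		error = xrep_ag_init(sc, pag, &sa);
		if (!error) {
			error = examine_ag(sc, &sa);
			xchk_ag_free(sc, &sa);
		}
		if (error) {
			xfs_perag_rele(pag);
			break;
		}
	}
#endif
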
 917/* Reinitialize the per-AG block reservation for the AG we just fixed. */
 918int
 919xrep_reset_perag_resv(
 920	struct xfs_scrub	*sc)
 921{
 922	int			error;
 923
 924	if (!(sc->flags & XREP_RESET_PERAG_RESV))
 925		return 0;
 926
 927	ASSERT(sc->sa.pag != NULL);
 928	ASSERT(sc->ops->type == ST_PERAG);
 929	ASSERT(sc->tp);
 930
 931	sc->flags &= ~XREP_RESET_PERAG_RESV;
 932	error = xfs_ag_resv_free(sc->sa.pag);
 933	if (error)
 934		goto out;
 935	error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
 936	if (error == -ENOSPC) {
 937		xfs_err(sc->mp,
 938"Insufficient free space to reset per-AG reservation for AG %u after repair.",
 939				sc->sa.pag->pag_agno);
 940		error = 0;
 941	}
 942
 943out:
 944	return error;
 945}
 946
 947/* Decide if we are going to call the repair function for a scrub type. */
 948bool
 949xrep_will_attempt(
 950	struct xfs_scrub	*sc)
 951{
 952	/* Userspace asked us to rebuild the structure regardless. */
 953	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
 954		return true;
 955
 956	/* Let debug users force us into the repair routines. */
 957	if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
 958		return true;
 959
 960	/* Metadata is corrupt or failed cross-referencing. */
 961	if (xchk_needs_repair(sc->sm))
 962		return true;
 963
 964	return false;
 965}
 966
 967/* Try to fix some part of a metadata inode by calling another scrubber. */
 968STATIC int
 969xrep_metadata_inode_subtype(
 970	struct xfs_scrub	*sc,
 971	unsigned int		scrub_type)
 972{
 973	__u32			smtype = sc->sm->sm_type;
 974	__u32			smflags = sc->sm->sm_flags;
 975	unsigned int		sick_mask = sc->sick_mask;
 976	int			error;
 977
 978	/*
 979	 * Let's see if the inode needs repair.  We're going to open-code calls
 980	 * to the scrub and repair functions so that we can hang on to the
 981	 * resources that we already acquired instead of using the standard
 982	 * setup/teardown routines.
 983	 */
 984	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
 985	sc->sm->sm_type = scrub_type;
 986
 987	switch (scrub_type) {
 988	case XFS_SCRUB_TYPE_INODE:
 989		error = xchk_inode(sc);
 990		break;
 991	case XFS_SCRUB_TYPE_BMBTD:
 992		error = xchk_bmap_data(sc);
 993		break;
 994	case XFS_SCRUB_TYPE_BMBTA:
 995		error = xchk_bmap_attr(sc);
 996		break;
 997	default:
 998		ASSERT(0);
 999		error = -EFSCORRUPTED;
1000	}
1001	if (error)
1002		goto out;
1003
1004	if (!xrep_will_attempt(sc))
1005		goto out;
1006
1007	/*
1008	 * Repair some part of the inode.  This will potentially join the inode
1009	 * to the transaction.
1010	 */
1011	switch (scrub_type) {
1012	case XFS_SCRUB_TYPE_INODE:
1013		error = xrep_inode(sc);
1014		break;
1015	case XFS_SCRUB_TYPE_BMBTD:
1016		error = xrep_bmap(sc, XFS_DATA_FORK, false);
1017		break;
1018	case XFS_SCRUB_TYPE_BMBTA:
1019		error = xrep_bmap(sc, XFS_ATTR_FORK, false);
1020		break;
1021	}
1022	if (error)
1023		goto out;
1024
1025	/*
1026	 * Finish all deferred intent items and then roll the transaction so
1027	 * that the inode will not be joined to the transaction when we exit
1028	 * the function.
1029	 */
1030	error = xfs_defer_finish(&sc->tp);
1031	if (error)
1032		goto out;
1033	error = xfs_trans_roll(&sc->tp);
1034	if (error)
1035		goto out;
1036
1037	/*
1038	 * Clear the corruption flags and re-check the metadata that we just
1039	 * repaired.
1040	 */
1041	sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
1042
1043	switch (scrub_type) {
1044	case XFS_SCRUB_TYPE_INODE:
1045		error = xchk_inode(sc);
1046		break;
1047	case XFS_SCRUB_TYPE_BMBTD:
1048		error = xchk_bmap_data(sc);
1049		break;
1050	case XFS_SCRUB_TYPE_BMBTA:
1051		error = xchk_bmap_attr(sc);
1052		break;
1053	}
1054	if (error)
1055		goto out;
1056
1057	/* If corruption persists, the repair has failed. */
1058	if (xchk_needs_repair(sc->sm)) {
1059		error = -EFSCORRUPTED;
1060		goto out;
1061	}
1062out:
1063	sc->sick_mask = sick_mask;
1064	sc->sm->sm_type = smtype;
1065	sc->sm->sm_flags = smflags;
1066	return error;
1067}
1068
1069/*
1070 * Repair the ondisk forks of a metadata inode.  The caller must ensure that
1071 * sc->ip points to the metadata inode and the ILOCK is held on that inode.
1072 * The inode must not be joined to the transaction before the call, and will
1073 * not be afterwards.
1074 */
1075int
1076xrep_metadata_inode_forks(
1077	struct xfs_scrub	*sc)
1078{
1079	bool			dirty = false;
1080	int			error;
1081
1082	/* Repair the inode record and the data fork. */
1083	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
1084	if (error)
1085		return error;
1086
1087	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
1088	if (error)
1089		return error;
1090
1091	/* Make sure the attr fork looks ok before we delete it. */
1092	error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
1093	if (error)
1094		return error;
1095
1096	/* Clear the reflink flag since metadata never shares. */
1097	if (xfs_is_reflink_inode(sc->ip)) {
1098		dirty = true;
1099		xfs_trans_ijoin(sc->tp, sc->ip, 0);
1100		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1101		if (error)
1102			return error;
1103	}
1104
1105	/*
1106	 * If we modified the inode, roll the transaction but don't rejoin the
1107	 * inode to the new transaction because xrep_bmap_data can do that.
1108	 */
1109	if (dirty) {
1110		error = xfs_trans_roll(&sc->tp);
1111		if (error)
1112			return error;
1113		dirty = false;
1114	}
1115
1116	return 0;
1117}
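
/*
 * Editorial sketch (not part of the kernel source): a repair function for a
 * metadata file (a quota file, say) would typically make the inode core and
 * forks usable with the helper above before regenerating the file contents.
 */
#if 0
	error = xrep_metadata_inode_forks(sc);
	if (error)
		return error;

	/* ...now rebuild the contents of the metadata file... */
#endif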