v6.13.7
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap.  In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked.  Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space.  Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block.  If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be an rmap record and everything is ok.  For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal.  If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking.  We must also invalidate any buffers
 * associated with @bitmap.
 */

/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	/* Reverse mapping owner and metadata reservation type. */
	const struct xfs_owner_info	*oinfo;
	enum xfs_ag_resv_type		resv;

	/* If true, roll the transaction before reaping the next extent. */
	bool				force_roll;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			deferred;

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			invalidated;

	/* Number of deferred reaps queued during the whole reap sequence. */
	unsigned long long		total_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_dirty(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred)
		return true;
	if (rs->invalidated)
		return true;
	if (rs->total_deferred)
		return true;
	return false;
}

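/* Cap on logged buffer invalidations per transaction; see xreap_want_roll. */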
#define XREAP_MAX_BINVAL	(2048)

/*
 * Decide if we want to roll the transaction after reaping an extent.  We don't
 * want to overrun the transaction reservation, so we prohibit more than
 * 128 EFIs per transaction.  For the same reason, we limit the number
 * of buffer invalidations to 2048.
 */
static inline bool xreap_want_roll(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->deferred > XREP_MAX_ITRUNCATE_EFIS)
		return true;
	if (rs->invalidated > XREAP_MAX_BINVAL)
		return true;
	return false;
}

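/*
 * Reset the per-transaction counters after a roll.  Deferred reaps that were
 * attached to the old transaction roll up into the running total.
 */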
static inline void xreap_reset(struct xreap_state *rs)
{
	rs->total_deferred += rs->deferred;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

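/* Cap on the deferred reap chain length; see xreap_want_defer_finish. */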
#define XREAP_MAX_DEFER_CHAIN		(2048)

/*
 * Decide if we want to finish the deferred ops that are attached to the scrub
 * transaction.  We don't want to queue huge chains of deferred ops because
 * that can consume a lot of log space and kernel memory.  Hence we trigger an
 * xfs_defer_finish if there are more than 2048 deferred reap operations or the
 * caller did some real work.
 */
static inline bool
xreap_want_defer_finish(const struct xreap_state *rs)
{
	if (rs->force_roll)
		return true;
	if (rs->total_deferred > XREAP_MAX_DEFER_CHAIN)
		return true;
	return false;
}

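/*
 * Reset all reap state once the deferred chain has been finished; at that
 * point nothing remains attached to the transaction.
 */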
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->total_deferred = 0;
	rs->deferred = 0;
	rs->invalidated = 0;
	rs->force_roll = false;
}

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	int			max_fsbs;

	/* Remote xattr values are the largest buffers that we support. */
	max_fsbs = xfs_attr3_max_rmt_blocks(mp);

	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks, max_fsbs));
}

/*
 * Return an incore buffer from a sector scan, or NULL if there are no buffers
 * left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
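	/*
	 * __sector_count is zero-initialized by the callers in this file, so
	 * the first probe is one daddr_step long; each subsequent probe tries
	 * the next larger buffer length at the same daddr.
	 */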
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them.  We
	 * assume that the lack of any other known owners means that the buffer
	 * can be locked without risk of deadlocking.  The buffer cache cannot
	 * detect aliasing, so employ nested loops to scan for incore buffers
	 * of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);
			rs->invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (rs->invalidated > XREAP_MAX_BINVAL) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(sc->sa.pag, agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call.  Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed.  AGFL blocks can only be put back one at
 * a time.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent.  If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;
	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this AG extent as possible.  The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);

		rs->force_roll = true;

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * If we're unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
			return 0;
		}

		return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
				*aglenp, rs->oinfo);
	}

	trace_xreap_dispose_free_extent(sc->sa.pag, agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno.  If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_roll(rs));
		return 0;
	}

	/*
	 * If we're getting rid of CoW staging extents, use deferred work items
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent.  We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/* Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		rs->force_roll = true;
		return 0;
	}

	/*
	 * Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old blocks.
	 * Add a defer ops barrier every other extent to avoid stressing the
	 * system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	rs->deferred++;
	if (rs->deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

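		/*
		 * Finish the deferred chain or roll the transaction if the
		 * counters say we're in danger of overflowing the
		 * reservation.
		 */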
		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub		*sc,
	struct xagb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= type,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.  The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * Hold the AGF buffer across the deferred chain
			 * processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub		*sc,
	struct xfsb_bitmap		*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state		rs = {
		.sc			= sc,
		.oinfo			= oinfo,
		.resv			= XFS_AG_RESV_NONE,
	};
	int				error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, ip->i_ino, whichfork, imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

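		/*
		 * The rmap owner's file offset must track the block being
		 * checked, so bump it along with bno.
		 */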
		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(sc->sa.pag, agbno, len, *crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction.  This is true for most
 * buffers, but there are two cases that we want to catch: large remote xattr
 * value buffers are not logged and can overflow the buffer log item dirty
 * bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}

/*
 * Invalidate any buffers for this file mapping.  The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	unsigned int		invalidated = 0;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings.  This
	 * means that for each block in the mapping, there could exist an
	 * xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the location
	 * of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them.  If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone.  The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
								scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf		*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}
			invalidated++;

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * much of the mapping we've seen so far.
			 */
			if (invalidated > XREAP_MAX_BINVAL) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(sc->sa.pag, agbno, imap->br_blockcount);
	return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap,
	bool				crosslinked)
{
	int				error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked and
		 * must not be freed.  Remove the reverse mapping, leave the
		 * buffer cache in its possibly confused state, and move on.
		 * We don't want to risk discarding valid data buffers from
		 * anybody else who thinks they own the block, even though that
		 * runs the risk of stale buffer warnings in the future.
		 */
		trace_xreap_dispose_unmap_extent(sc->sa.pag,
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * Schedule removal of the mapping from the fork.  We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, ip, whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent.  This is a bit of
	 * a mess since we don't detect discontiguous buffers that are indexed
	 * by a block starting before the first block of the extent but overlap
	 * anyway.
	 */
	trace_xreap_dispose_free_extent(sc->sa.pag,
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping.  If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(sc, ip, whichfork, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * Schedule removal of the mapping from the fork.  We use deferred log
	 * intents in this function to control the exact sequence of metadata
	 * updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, ip, whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
			XFS_FREE_EXTENT_SKIP_DISCARD);
}

/*
 * Dispose of as much of this file extent as we can.  Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xfs_scrub		*sc,
	struct xfs_inode		*ip,
	int				whichfork,
	struct xfs_bmbt_irec		*imap)
{
	xfs_agnumber_t			agno;
	bool				crosslinked;
	int				error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, ip, whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * update the mapping to use it with the unmap calls.
	 */
	error = xreap_bmapi_select(sc, ip, whichfork, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(sc, ip, whichfork, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file.  Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip.  The fork
 * must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(sc, ip, whichfork, &imap);
			if (error)
				return error;

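			/*
			 * Commit the unmaps, rmap updates, and frees that the
			 * reap queued before moving to the next extent.
			 */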
			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}