scrub.c - fs/xfs/scrub/scrub.c - Linux source code v3.5.6

Note: File does not exist in v3.5.6.
  1/*
  2 * Copyright (C) 2017 Oracle.  All Rights Reserved.
  3 *
  4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
  5 *
  6 * This program is free software; you can redistribute it and/or
  7 * modify it under the terms of the GNU General Public License
  8 * as published by the Free Software Foundation; either version 2
  9 * of the License, or (at your option) any later version.
 10 *
 11 * This program is distributed in the hope that it would be useful,
 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14 * GNU General Public License for more details.
 15 *
 16 * You should have received a copy of the GNU General Public License
 17 * along with this program; if not, write the Free Software Foundation,
 18 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
 19 */
 20#include "xfs.h"
 21#include "xfs_fs.h"
 22#include "xfs_shared.h"
 23#include "xfs_format.h"
 24#include "xfs_trans_resv.h"
 25#include "xfs_mount.h"
 26#include "xfs_defer.h"
 27#include "xfs_btree.h"
 28#include "xfs_bit.h"
 29#include "xfs_log_format.h"
 30#include "xfs_trans.h"
 31#include "xfs_sb.h"
 32#include "xfs_inode.h"
 33#include "xfs_icache.h"
 34#include "xfs_itable.h"
 35#include "xfs_alloc.h"
 36#include "xfs_alloc_btree.h"
 37#include "xfs_bmap.h"
 38#include "xfs_bmap_btree.h"
 39#include "xfs_ialloc.h"
 40#include "xfs_ialloc_btree.h"
 41#include "xfs_refcount.h"
 42#include "xfs_refcount_btree.h"
 43#include "xfs_rmap.h"
 44#include "xfs_rmap_btree.h"
 45#include "scrub/xfs_scrub.h"
 46#include "scrub/scrub.h"
 47#include "scrub/common.h"
 48#include "scrub/trace.h"
 49#include "scrub/btree.h"
 50
 51/*
 52 * Online Scrub and Repair
 53 *
 54 * Traditionally, XFS (the kernel driver) did not know how to check or
 55 * repair on-disk data structures.  That task was left to the xfs_check
 56 * and xfs_repair tools, both of which require taking the filesystem
 57 * offline for a thorough but time consuming examination.  Online
 58 * scrub & repair, on the other hand, enables us to check the metadata
 59 * for obvious errors while carefully stepping around the filesystem's
 60 * ongoing operations, locking rules, etc.
 61 *
 62 * Given that most XFS metadata consist of records stored in a btree,
 63 * most of the checking functions iterate the btree blocks themselves
 64 * looking for irregularities.  When a record block is encountered, each
 65 * record can be checked for obviously bad values.  Record values can
 66 * also be cross-referenced against other btrees to look for potential
 67 * misunderstandings between pieces of metadata.
 68 *
 69 * It is expected that the checkers responsible for per-AG metadata
 70 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
 71 * metadata structure, and perform any relevant cross-referencing before
 72 * unlocking the AG and returning the results to userspace.  These
 73 * scrubbers must not keep an AG locked for too long to avoid tying up
 74 * the block and inode allocators.
 75 *
 76 * Block maps and b-trees rooted in an inode present a special challenge
 77 * because they can involve extents from any AG.  The general scrubber
 78 * structure of lock -> check -> xref -> unlock still holds, but AG
 79 * locking order rules /must/ be obeyed to avoid deadlocks.  The
 80 * ordering rule, of course, is that we must lock in increasing AG
 81 * order.  Helper functions are provided to track which AG headers we've
 82 * already locked.  If we detect an imminent locking order violation, we
 83 * can signal a potential deadlock, in which case the scrubber can jump
 84 * out to the top level, lock all the AGs in order, and retry the scrub.
 85 *
 86 * For file data (directories, extended attributes, symlinks) scrub, we
 87 * can simply lock the inode and walk the data.  For btree data
 88 * (directories and attributes) we follow the same btree-scrubbing
 89 * strategy outlined previously to check the records.
 90 *
 91 * We use a bit of trickery with transactions to avoid buffer deadlocks
 92 * if there is a cycle in the metadata.  The basic problem is that
 93 * travelling down a btree involves locking the current buffer at each
 94 * tree level.  If a pointer should somehow point back to a buffer that
 95 * we've already examined, we will deadlock due to the second buffer
 96 * locking attempt.  Note however that grabbing a buffer in transaction
 97 * context links the locked buffer to the transaction.  If we try to
 98 * re-grab the buffer in the context of the same transaction, we avoid
 99 * the second lock attempt and continue.  Between the verifier and the
100 * scrubber, something will notice that something is amiss and report
101 * the corruption.  Therefore, each scrubber will allocate an empty
102 * transaction, attach buffers to it, and cancel the transaction at the
103 * end of the scrub run.  Cancelling a non-dirty transaction simply
104 * unlocks the buffers.
105 *
 * Scrub communicates two kinds of data to userspace.  The first is the
 * error code (errno), which can be used to communicate operational
 * errors in performing the scrub.  The second is a set of output flags
 * that can be set in the scrub context.  If the data structure itself
 * is corrupt, the CORRUPT flag will be set.  If the metadata is correct
 * but otherwise suboptimal, the PREEN flag will be set.  The flags used
 * to report cross-referencing problems (XCORRUPT and XFAIL) are
 * described below.
113 *
114 * We perform secondary validation of filesystem metadata by
115 * cross-referencing every record with all other available metadata.
116 * For example, for block mapping extents, we verify that there are no
117 * records in the free space and inode btrees corresponding to that
118 * space extent and that there is a corresponding entry in the reverse
119 * mapping btree.  Inconsistent metadata is noted by setting the
120 * XCORRUPT flag; btree query function errors are noted by setting the
121 * XFAIL flag and deleting the cursor to prevent further attempts to
122 * cross-reference with a defective btree.
123 */
124
/*
 * Scrub probe.
 *
 * xfs_scrub uses this request type to find out whether the running
 * kernel is willing to scrub (or repair) a given mountpoint.  By the
 * time we get here the ioctl inputs have been validated and the
 * filesystem has been set up for a scrub operation, so there is nothing
 * left to examine: we return straight away and let userspace judge (in
 * broad terms) from the errno and the returned structure state whether
 * scrub/repair are supported.
 *
 * Returns the error code filled in by xfs_scrub_should_terminate() if
 * that helper says we should stop, or zero otherwise.
 */
static int
xfs_scrub_probe(
	struct xfs_scrub_context	*sc)
{
	int				ret = 0;

	/* Only hand back ret when we were actually told to terminate. */
	return xfs_scrub_should_terminate(sc, &ret) ? ret : 0;
}
146
147/* Scrub setup and teardown */
148
149/* Free all the resources and finish the transactions. */
150STATIC int
151xfs_scrub_teardown(
152	struct xfs_scrub_context	*sc,
153	struct xfs_inode		*ip_in,
154	int				error)
155{
156	xfs_scrub_ag_free(sc, &sc->sa);
157	if (sc->tp) {
158		xfs_trans_cancel(sc->tp);
159		sc->tp = NULL;
160	}
161	if (sc->ip) {
162		if (sc->ilock_flags)
163			xfs_iunlock(sc->ip, sc->ilock_flags);
164		if (sc->ip != ip_in &&
165		    !xfs_internal_inum(sc->mp, sc->ip->i_ino))
166			iput(VFS_I(sc->ip));
167		sc->ip = NULL;
168	}
169	if (sc->buf) {
170		kmem_free(sc->buf);
171		sc->buf = NULL;
172	}
173	return error;
174}
175
176/* Scrubbing dispatch. */
177
178static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
179	[XFS_SCRUB_TYPE_PROBE] = {	/* ioctl presence test */
180		.type	= ST_NONE,
181		.setup	= xfs_scrub_setup_fs,
182		.scrub	= xfs_scrub_probe,
183	},
184	[XFS_SCRUB_TYPE_SB] = {		/* superblock */
185		.type	= ST_PERAG,
186		.setup	= xfs_scrub_setup_fs,
187		.scrub	= xfs_scrub_superblock,
188	},
189	[XFS_SCRUB_TYPE_AGF] = {	/* agf */
190		.type	= ST_PERAG,
191		.setup	= xfs_scrub_setup_fs,
192		.scrub	= xfs_scrub_agf,
193	},
194	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
195		.type	= ST_PERAG,
196		.setup	= xfs_scrub_setup_fs,
197		.scrub	= xfs_scrub_agfl,
198	},
199	[XFS_SCRUB_TYPE_AGI] = {	/* agi */
200		.type	= ST_PERAG,
201		.setup	= xfs_scrub_setup_fs,
202		.scrub	= xfs_scrub_agi,
203	},
204	[XFS_SCRUB_TYPE_BNOBT] = {	/* bnobt */
205		.type	= ST_PERAG,
206		.setup	= xfs_scrub_setup_ag_allocbt,
207		.scrub	= xfs_scrub_bnobt,
208	},
209	[XFS_SCRUB_TYPE_CNTBT] = {	/* cntbt */
210		.type	= ST_PERAG,
211		.setup	= xfs_scrub_setup_ag_allocbt,
212		.scrub	= xfs_scrub_cntbt,
213	},
214	[XFS_SCRUB_TYPE_INOBT] = {	/* inobt */
215		.type	= ST_PERAG,
216		.setup	= xfs_scrub_setup_ag_iallocbt,
217		.scrub	= xfs_scrub_inobt,
218	},
219	[XFS_SCRUB_TYPE_FINOBT] = {	/* finobt */
220		.type	= ST_PERAG,
221		.setup	= xfs_scrub_setup_ag_iallocbt,
222		.scrub	= xfs_scrub_finobt,
223		.has	= xfs_sb_version_hasfinobt,
224	},
225	[XFS_SCRUB_TYPE_RMAPBT] = {	/* rmapbt */
226		.type	= ST_PERAG,
227		.setup	= xfs_scrub_setup_ag_rmapbt,
228		.scrub	= xfs_scrub_rmapbt,
229		.has	= xfs_sb_version_hasrmapbt,
230	},
231	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
232		.type	= ST_PERAG,
233		.setup	= xfs_scrub_setup_ag_refcountbt,
234		.scrub	= xfs_scrub_refcountbt,
235		.has	= xfs_sb_version_hasreflink,
236	},
237	[XFS_SCRUB_TYPE_INODE] = {	/* inode record */
238		.type	= ST_INODE,
239		.setup	= xfs_scrub_setup_inode,
240		.scrub	= xfs_scrub_inode,
241	},
242	[XFS_SCRUB_TYPE_BMBTD] = {	/* inode data fork */
243		.type	= ST_INODE,
244		.setup	= xfs_scrub_setup_inode_bmap,
245		.scrub	= xfs_scrub_bmap_data,
246	},
247	[XFS_SCRUB_TYPE_BMBTA] = {	/* inode attr fork */
248		.type	= ST_INODE,
249		.setup	= xfs_scrub_setup_inode_bmap,
250		.scrub	= xfs_scrub_bmap_attr,
251	},
252	[XFS_SCRUB_TYPE_BMBTC] = {	/* inode CoW fork */
253		.type	= ST_INODE,
254		.setup	= xfs_scrub_setup_inode_bmap,
255		.scrub	= xfs_scrub_bmap_cow,
256	},
257	[XFS_SCRUB_TYPE_DIR] = {	/* directory */
258		.type	= ST_INODE,
259		.setup	= xfs_scrub_setup_directory,
260		.scrub	= xfs_scrub_directory,
261	},
262	[XFS_SCRUB_TYPE_XATTR] = {	/* extended attributes */
263		.type	= ST_INODE,
264		.setup	= xfs_scrub_setup_xattr,
265		.scrub	= xfs_scrub_xattr,
266	},
267	[XFS_SCRUB_TYPE_SYMLINK] = {	/* symbolic link */
268		.type	= ST_INODE,
269		.setup	= xfs_scrub_setup_symlink,
270		.scrub	= xfs_scrub_symlink,
271	},
272	[XFS_SCRUB_TYPE_PARENT] = {	/* parent pointers */
273		.type	= ST_INODE,
274		.setup	= xfs_scrub_setup_parent,
275		.scrub	= xfs_scrub_parent,
276	},
277	[XFS_SCRUB_TYPE_RTBITMAP] = {	/* realtime bitmap */
278		.type	= ST_FS,
279		.setup	= xfs_scrub_setup_rt,
280		.scrub	= xfs_scrub_rtbitmap,
281		.has	= xfs_sb_version_hasrealtime,
282	},
283	[XFS_SCRUB_TYPE_RTSUM] = {	/* realtime summary */
284		.type	= ST_FS,
285		.setup	= xfs_scrub_setup_rt,
286		.scrub	= xfs_scrub_rtsummary,
287		.has	= xfs_sb_version_hasrealtime,
288	},
289	[XFS_SCRUB_TYPE_UQUOTA] = {	/* user quota */
290		.type	= ST_FS,
291		.setup	= xfs_scrub_setup_quota,
292		.scrub	= xfs_scrub_quota,
293	},
294	[XFS_SCRUB_TYPE_GQUOTA] = {	/* group quota */
295		.type	= ST_FS,
296		.setup	= xfs_scrub_setup_quota,
297		.scrub	= xfs_scrub_quota,
298	},
299	[XFS_SCRUB_TYPE_PQUOTA] = {	/* project quota */
300		.type	= ST_FS,
301		.setup	= xfs_scrub_setup_quota,
302		.scrub	= xfs_scrub_quota,
303	},
304};
305
306/* This isn't a stable feature, warn once per day. */
307static inline void
308xfs_scrub_experimental_warning(
309	struct xfs_mount	*mp)
310{
311	static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
312			"xfs_scrub_warning", 86400 * HZ, 1);
313	ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);
314
315	if (__ratelimit(&scrub_warning))
316		xfs_alert(mp,
317"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
318}
319
320static int
321xfs_scrub_validate_inputs(
322	struct xfs_mount		*mp,
323	struct xfs_scrub_metadata	*sm)
324{
325	int				error;
326	const struct xfs_scrub_meta_ops	*ops;
327
328	error = -EINVAL;
329	/* Check our inputs. */
330	sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
331	if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
332		goto out;
333	/* sm_reserved[] must be zero */
334	if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
335		goto out;
336
337	error = -ENOENT;
338	/* Do we know about this type of metadata? */
339	if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
340		goto out;
341	ops = &meta_scrub_ops[sm->sm_type];
342	if (ops->setup == NULL || ops->scrub == NULL)
343		goto out;
344	/* Does this fs even support this type of metadata? */
345	if (ops->has && !ops->has(&mp->m_sb))
346		goto out;
347
348	error = -EINVAL;
349	/* restricting fields must be appropriate for type */
350	switch (ops->type) {
351	case ST_NONE:
352	case ST_FS:
353		if (sm->sm_ino || sm->sm_gen || sm->sm_agno)
354			goto out;
355		break;
356	case ST_PERAG:
357		if (sm->sm_ino || sm->sm_gen ||
358		    sm->sm_agno >= mp->m_sb.sb_agcount)
359			goto out;
360		break;
361	case ST_INODE:
362		if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino))
363			goto out;
364		break;
365	default:
366		goto out;
367	}
368
369	error = -EOPNOTSUPP;
370	/*
371	 * We won't scrub any filesystem that doesn't have the ability
372	 * to record unwritten extents.  The option was made default in
373	 * 2003, removed from mkfs in 2007, and cannot be disabled in
374	 * v5, so if we find a filesystem without this flag it's either
375	 * really old or totally unsupported.  Avoid it either way.
376	 * We also don't support v1-v3 filesystems, which aren't
377	 * mountable.
378	 */
379	if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
380		goto out;
381
382	/* We don't know how to repair anything yet. */
383	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
384		goto out;
385
386	error = 0;
387out:
388	return error;
389}
390
391/* Dispatch metadata scrubbing. */
392int
393xfs_scrub_metadata(
394	struct xfs_inode		*ip,
395	struct xfs_scrub_metadata	*sm)
396{
397	struct xfs_scrub_context	sc;
398	struct xfs_mount		*mp = ip->i_mount;
399	bool				try_harder = false;
400	int				error = 0;
401
402	BUILD_BUG_ON(sizeof(meta_scrub_ops) !=
403		(sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR));
404
405	trace_xfs_scrub_start(ip, sm, error);
406
407	/* Forbidden if we are shut down or mounted norecovery. */
408	error = -ESHUTDOWN;
409	if (XFS_FORCED_SHUTDOWN(mp))
410		goto out;
411	error = -ENOTRECOVERABLE;
412	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
413		goto out;
414
415	error = xfs_scrub_validate_inputs(mp, sm);
416	if (error)
417		goto out;
418
419	xfs_scrub_experimental_warning(mp);
420
421retry_op:
422	/* Set up for the operation. */
423	memset(&sc, 0, sizeof(sc));
424	sc.mp = ip->i_mount;
425	sc.sm = sm;
426	sc.ops = &meta_scrub_ops[sm->sm_type];
427	sc.try_harder = try_harder;
428	sc.sa.agno = NULLAGNUMBER;
429	error = sc.ops->setup(&sc, ip);
430	if (error)
431		goto out_teardown;
432
433	/* Scrub for errors. */
434	error = sc.ops->scrub(&sc);
435	if (!try_harder && error == -EDEADLOCK) {
436		/*
437		 * Scrubbers return -EDEADLOCK to mean 'try harder'.
438		 * Tear down everything we hold, then set up again with
439		 * preparation for worst-case scenarios.
440		 */
441		error = xfs_scrub_teardown(&sc, ip, 0);
442		if (error)
443			goto out;
444		try_harder = true;
445		goto retry_op;
446	} else if (error)
447		goto out_teardown;
448
449	if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
450			       XFS_SCRUB_OFLAG_XCORRUPT))
451		xfs_alert_ratelimited(mp, "Corruption detected during scrub.");
452
453out_teardown:
454	error = xfs_scrub_teardown(&sc, ip, error);
455out:
456	trace_xfs_scrub_done(ip, sm, error);
457	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
458		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
459		error = 0;
460	}
461	return error;
462}