Loading...
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_btree.h"
14#include "xfs_bit.h"
15#include "xfs_log_format.h"
16#include "xfs_trans.h"
17#include "xfs_sb.h"
18#include "xfs_inode.h"
19#include "xfs_icache.h"
20#include "xfs_inode_buf.h"
21#include "xfs_inode_fork.h"
22#include "xfs_ialloc.h"
23#include "xfs_da_format.h"
24#include "xfs_reflink.h"
25#include "xfs_alloc.h"
26#include "xfs_rmap.h"
27#include "xfs_rmap_btree.h"
28#include "xfs_bmap.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_bmap_util.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_quota_defs.h"
34#include "xfs_quota.h"
35#include "xfs_ag.h"
36#include "xfs_rtbitmap.h"
37#include "xfs_attr_leaf.h"
38#include "xfs_log_priv.h"
39#include "xfs_health.h"
40#include "xfs_symlink_remote.h"
41#include "scrub/xfs_scrub.h"
42#include "scrub/scrub.h"
43#include "scrub/common.h"
44#include "scrub/btree.h"
45#include "scrub/trace.h"
46#include "scrub/repair.h"
47#include "scrub/iscan.h"
48#include "scrub/readdir.h"
49
50/*
51 * Inode Record Repair
52 * ===================
53 *
54 * Roughly speaking, inode problems can be classified based on whether or not
55 * they trip the dinode verifiers. If those trip, then we won't be able to
56 * xfs_iget ourselves the inode.
57 *
 * Therefore, the xrep_dinode_* functions fix anything that will trip the
 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions
60 * fix things on live incore inodes. The inode repair functions make decisions
61 * with security and usability implications when reviving a file:
62 *
 * - Files with zero di_mode or a garbage di_mode are converted to a regular
 *   file that only root can read. This file may not actually contain user data,
65 * if the file was not previously a regular file. Setuid and setgid bits
66 * are cleared.
67 *
68 * - Zero-size directories can be truncated to look empty. It is necessary to
69 * run the bmapbtd and directory repair functions to fully rebuild the
70 * directory.
71 *
72 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
73 * to run the bmapbtd and symlink repair functions to salvage the symlink.
74 *
75 * - Invalid extent size hints will be removed.
76 *
77 * - Quotacheck will be scheduled if we repaired an inode that was so badly
78 * damaged that the ondisk inode had to be rebuilt.
79 *
80 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
81 * Setuid and setgid bits are cleared.
82 *
83 * - Data and attr forks are reset to extents format with zero extents if the
84 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
85 * repair functions to recover the space mapping.
86 *
87 * - ACLs will not be recovered if the attr fork is zapped or the extended
88 * attribute structure itself requires salvaging.
89 *
90 * - If the attr fork is zapped, the user and group ids are reset to root and
91 * the setuid and setgid bits are removed.
92 */
93
94/*
95 * All the information we need to repair the ondisk inode if we can't iget the
96 * incore inode. We don't allocate this buffer unless we're going to perform
97 * a repair to the ondisk inode cluster buffer.
98 */
99struct xrep_inode {
100 /* Inode mapping that we saved from the initial lookup attempt. */
101 struct xfs_imap imap;
102
103 struct xfs_scrub *sc;
104
105 /* Blocks in use on the data device by data extents or bmbt blocks. */
106 xfs_rfsblock_t data_blocks;
107
108 /* Blocks in use on the rt device. */
109 xfs_rfsblock_t rt_blocks;
110
111 /* Blocks in use by the attr fork. */
112 xfs_rfsblock_t attr_blocks;
113
114 /* Number of data device extents for the data fork. */
115 xfs_extnum_t data_extents;
116
117 /*
118 * Number of realtime device extents for the data fork. If
119 * data_extents and rt_extents indicate that the data fork has extents
120 * on both devices, we'll just back away slowly.
121 */
122 xfs_extnum_t rt_extents;
123
124 /* Number of (data device) extents for the attr fork. */
125 xfs_aextnum_t attr_extents;
126
127 /* Sick state to set after zapping parts of the inode. */
128 unsigned int ino_sick_mask;
129
130 /* Must we remove all access from this file? */
131 bool zap_acls;
132
133 /* Inode scanner to see if we can find the ftype from dirents */
134 struct xchk_iscan ftype_iscan;
135 uint8_t alleged_ftype;
136};
137
138/*
139 * Setup function for inode repair. @imap contains the ondisk inode mapping
140 * information so that we can correct the ondisk inode cluster buffer if
141 * necessary to make iget work.
142 */
143int
144xrep_setup_inode(
145 struct xfs_scrub *sc,
146 const struct xfs_imap *imap)
147{
148 struct xrep_inode *ri;
149
150 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
151 if (!sc->buf)
152 return -ENOMEM;
153
154 ri = sc->buf;
155 memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
156 ri->sc = sc;
157 return 0;
158}
159
160/*
161 * Make sure this ondisk inode can pass the inode buffer verifier. This is
162 * not the same as the dinode verifier.
163 */
164STATIC void
165xrep_dinode_buf_core(
166 struct xfs_scrub *sc,
167 struct xfs_buf *bp,
168 unsigned int ioffset)
169{
170 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
171 struct xfs_trans *tp = sc->tp;
172 struct xfs_mount *mp = sc->mp;
173 xfs_agino_t agino;
174 bool crc_ok = false;
175 bool magic_ok = false;
176 bool unlinked_ok = false;
177
178 agino = be32_to_cpu(dip->di_next_unlinked);
179
180 if (xfs_verify_agino_or_null(bp->b_pag, agino))
181 unlinked_ok = true;
182
183 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
184 xfs_dinode_good_version(mp, dip->di_version))
185 magic_ok = true;
186
187 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
188 XFS_DINODE_CRC_OFF))
189 crc_ok = true;
190
191 if (magic_ok && unlinked_ok && crc_ok)
192 return;
193
194 if (!magic_ok) {
195 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
196 dip->di_version = 3;
197 }
198 if (!unlinked_ok)
199 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
200 xfs_dinode_calc_crc(mp, dip);
201 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
202 xfs_trans_log_buf(tp, bp, ioffset,
203 ioffset + sizeof(struct xfs_dinode) - 1);
204}
205
206/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
207STATIC void
208xrep_dinode_buf(
209 struct xfs_scrub *sc,
210 struct xfs_buf *bp)
211{
212 struct xfs_mount *mp = sc->mp;
213 int i;
214 int ni;
215
216 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
217 for (i = 0; i < ni; i++)
218 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
219}
220
221/* Reinitialize things that never change in an inode. */
222STATIC void
223xrep_dinode_header(
224 struct xfs_scrub *sc,
225 struct xfs_dinode *dip)
226{
227 trace_xrep_dinode_header(sc, dip);
228
229 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
230 if (!xfs_dinode_good_version(sc->mp, dip->di_version))
231 dip->di_version = 3;
232 dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
233 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
234 dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
235}
236
237/*
238 * If this directory entry points to the scrub target inode, then the directory
239 * we're scanning is the parent of the scrub target inode.
240 */
241STATIC int
242xrep_dinode_findmode_dirent(
243 struct xfs_scrub *sc,
244 struct xfs_inode *dp,
245 xfs_dir2_dataptr_t dapos,
246 const struct xfs_name *name,
247 xfs_ino_t ino,
248 void *priv)
249{
250 struct xrep_inode *ri = priv;
251 int error = 0;
252
253 if (xchk_should_terminate(ri->sc, &error))
254 return error;
255
256 if (ino != sc->sm->sm_ino)
257 return 0;
258
259 /* Ignore garbage directory entry names. */
260 if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
261 return -EFSCORRUPTED;
262
263 /* Don't pick up dot or dotdot entries; we only want child dirents. */
264 if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
265 xfs_dir2_samename(name, &xfs_name_dot))
266 return 0;
267
268 /*
269 * Uhoh, more than one parent for this inode and they don't agree on
270 * the file type?
271 */
272 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
273 ri->alleged_ftype != name->type) {
274 trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
275 ri->alleged_ftype);
276 return -EFSCORRUPTED;
277 }
278
279 /* We found a potential parent; remember the ftype. */
280 trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
281 ri->alleged_ftype = name->type;
282 return 0;
283}
284
285/*
286 * If this is a directory, walk the dirents looking for any that point to the
287 * scrub target inode.
288 */
289STATIC int
290xrep_dinode_findmode_walk_directory(
291 struct xrep_inode *ri,
292 struct xfs_inode *dp)
293{
294 struct xfs_scrub *sc = ri->sc;
295 unsigned int lock_mode;
296 int error = 0;
297
298 /*
299 * Scan the directory to see if there it contains an entry pointing to
300 * the directory that we are repairing.
301 */
302 lock_mode = xfs_ilock_data_map_shared(dp);
303
304 /*
305 * If this directory is known to be sick, we cannot scan it reliably
306 * and must abort.
307 */
308 if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
309 XFS_SICK_INO_BMBTD |
310 XFS_SICK_INO_DIR)) {
311 error = -EFSCORRUPTED;
312 goto out_unlock;
313 }
314
315 /*
316 * We cannot complete our parent pointer scan if a directory looks as
317 * though it has been zapped by the inode record repair code.
318 */
319 if (xchk_dir_looks_zapped(dp)) {
320 error = -EBUSY;
321 goto out_unlock;
322 }
323
324 error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
325 if (error)
326 goto out_unlock;
327
328out_unlock:
329 xfs_iunlock(dp, lock_mode);
330 return error;
331}
332
333/*
334 * Try to find the mode of the inode being repaired by looking for directories
335 * that point down to this file.
336 */
337STATIC int
338xrep_dinode_find_mode(
339 struct xrep_inode *ri,
340 uint16_t *mode)
341{
342 struct xfs_scrub *sc = ri->sc;
343 struct xfs_inode *dp;
344 int error;
345
346 /* No ftype means we have no other metadata to consult. */
347 if (!xfs_has_ftype(sc->mp)) {
348 *mode = S_IFREG;
349 return 0;
350 }
351
352 /*
353 * Scan all directories for parents that might point down to this
354 * inode. Skip the inode being repaired during the scan since it
355 * cannot be its own parent. Note that we still hold the AGI locked
356 * so there's a real possibility that _iscan_iter can return EBUSY.
357 */
358 xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
359 ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
360 ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
361 while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
362 if (S_ISDIR(VFS_I(dp)->i_mode))
363 error = xrep_dinode_findmode_walk_directory(ri, dp);
364 xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
365 xchk_irele(sc, dp);
366 if (error < 0)
367 break;
368 if (xchk_should_terminate(sc, &error))
369 break;
370 }
371 xchk_iscan_iter_finish(&ri->ftype_iscan);
372 xchk_iscan_teardown(&ri->ftype_iscan);
373
374 if (error == -EBUSY) {
375 if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
376 /*
377 * If we got an EBUSY after finding at least one
378 * dirent, that means the scan found an inode on the
379 * inactivation list and could not open it. Accept the
380 * alleged ftype and install a new mode below.
381 */
382 error = 0;
383 } else if (!(sc->flags & XCHK_TRY_HARDER)) {
384 /*
385 * Otherwise, retry the operation one time to see if
386 * the reason for the delay is an inode from the same
387 * cluster buffer waiting on the inactivation list.
388 */
389 error = -EDEADLOCK;
390 }
391 }
392 if (error)
393 return error;
394
395 /*
396 * Convert the discovered ftype into the file mode. If all else fails,
397 * return S_IFREG.
398 */
399 switch (ri->alleged_ftype) {
400 case XFS_DIR3_FT_DIR:
401 *mode = S_IFDIR;
402 break;
403 case XFS_DIR3_FT_WHT:
404 case XFS_DIR3_FT_CHRDEV:
405 *mode = S_IFCHR;
406 break;
407 case XFS_DIR3_FT_BLKDEV:
408 *mode = S_IFBLK;
409 break;
410 case XFS_DIR3_FT_FIFO:
411 *mode = S_IFIFO;
412 break;
413 case XFS_DIR3_FT_SOCK:
414 *mode = S_IFSOCK;
415 break;
416 case XFS_DIR3_FT_SYMLINK:
417 *mode = S_IFLNK;
418 break;
419 default:
420 *mode = S_IFREG;
421 break;
422 }
423 return 0;
424}
425
426/* Turn di_mode into /something/ recognizable. Returns true if we succeed. */
427STATIC int
428xrep_dinode_mode(
429 struct xrep_inode *ri,
430 struct xfs_dinode *dip)
431{
432 struct xfs_scrub *sc = ri->sc;
433 uint16_t mode = be16_to_cpu(dip->di_mode);
434 int error;
435
436 trace_xrep_dinode_mode(sc, dip);
437
438 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
439 return 0;
440
441 /* Try to fix the mode. If we cannot, then leave everything alone. */
442 error = xrep_dinode_find_mode(ri, &mode);
443 switch (error) {
444 case -EINTR:
445 case -EBUSY:
446 case -EDEADLOCK:
447 /* temporary failure or fatal signal */
448 return error;
449 case 0:
450 /* found mode */
451 break;
452 default:
453 /* some other error, assume S_IFREG */
454 mode = S_IFREG;
455 break;
456 }
457
458 /* bad mode, so we set it to a file that only root can read */
459 dip->di_mode = cpu_to_be16(mode);
460 dip->di_uid = 0;
461 dip->di_gid = 0;
462 ri->zap_acls = true;
463 return 0;
464}
465
466/* Fix any conflicting flags that the verifiers complain about. */
467STATIC void
468xrep_dinode_flags(
469 struct xfs_scrub *sc,
470 struct xfs_dinode *dip,
471 bool isrt)
472{
473 struct xfs_mount *mp = sc->mp;
474 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
475 uint16_t flags = be16_to_cpu(dip->di_flags);
476 uint16_t mode = be16_to_cpu(dip->di_mode);
477
478 trace_xrep_dinode_flags(sc, dip);
479
480 if (isrt)
481 flags |= XFS_DIFLAG_REALTIME;
482 else
483 flags &= ~XFS_DIFLAG_REALTIME;
484
485 /*
486 * For regular files on a reflink filesystem, set the REFLINK flag to
487 * protect shared extents. A later stage will actually check those
488 * extents and clear the flag if possible.
489 */
490 if (xfs_has_reflink(mp) && S_ISREG(mode))
491 flags2 |= XFS_DIFLAG2_REFLINK;
492 else
493 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
494 if (flags & XFS_DIFLAG_REALTIME)
495 flags2 &= ~XFS_DIFLAG2_REFLINK;
496 if (!xfs_has_bigtime(mp))
497 flags2 &= ~XFS_DIFLAG2_BIGTIME;
498 if (!xfs_has_large_extent_counts(mp))
499 flags2 &= ~XFS_DIFLAG2_NREXT64;
500 if (flags2 & XFS_DIFLAG2_NREXT64)
501 dip->di_nrext64_pad = 0;
502 else if (dip->di_version >= 3)
503 dip->di_v3_pad = 0;
504 dip->di_flags = cpu_to_be16(flags);
505 dip->di_flags2 = cpu_to_be64(flags2);
506}
507
508/*
509 * Blow out symlink; now it points nowhere. We don't have to worry about
510 * incore state because this inode is failing the verifiers.
511 */
512STATIC void
513xrep_dinode_zap_symlink(
514 struct xrep_inode *ri,
515 struct xfs_dinode *dip)
516{
517 struct xfs_scrub *sc = ri->sc;
518 char *p;
519
520 trace_xrep_dinode_zap_symlink(sc, dip);
521
522 dip->di_format = XFS_DINODE_FMT_LOCAL;
523 dip->di_size = cpu_to_be64(1);
524 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
525 *p = '?';
526 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
527}
528
529/*
530 * Blow out dir, make the parent point to the root. In the future repair will
531 * reconstruct this directory for us. Note that there's no in-core directory
532 * inode because the sf verifier tripped, so we don't have to worry about the
533 * dentry cache.
534 */
535STATIC void
536xrep_dinode_zap_dir(
537 struct xrep_inode *ri,
538 struct xfs_dinode *dip)
539{
540 struct xfs_scrub *sc = ri->sc;
541 struct xfs_mount *mp = sc->mp;
542 struct xfs_dir2_sf_hdr *sfp;
543 int i8count;
544
545 trace_xrep_dinode_zap_dir(sc, dip);
546
547 dip->di_format = XFS_DINODE_FMT_LOCAL;
548 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
549 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
550 sfp->count = 0;
551 sfp->i8count = i8count;
552 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
553 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
554 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
555}
556
557/* Make sure we don't have a garbage file size. */
558STATIC void
559xrep_dinode_size(
560 struct xrep_inode *ri,
561 struct xfs_dinode *dip)
562{
563 struct xfs_scrub *sc = ri->sc;
564 uint64_t size = be64_to_cpu(dip->di_size);
565 uint16_t mode = be16_to_cpu(dip->di_mode);
566
567 trace_xrep_dinode_size(sc, dip);
568
569 switch (mode & S_IFMT) {
570 case S_IFIFO:
571 case S_IFCHR:
572 case S_IFBLK:
573 case S_IFSOCK:
574 /* di_size can't be nonzero for special files */
575 dip->di_size = 0;
576 break;
577 case S_IFREG:
578 /* Regular files can't be larger than 2^63-1 bytes. */
579 dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
580 break;
581 case S_IFLNK:
582 /*
583 * Truncate ridiculously oversized symlinks. If the size is
584 * zero, reset it to point to the current directory. Both of
585 * these conditions trigger dinode verifier errors, so there
586 * is no in-core state to reset.
587 */
588 if (size > XFS_SYMLINK_MAXLEN)
589 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
590 else if (size == 0)
591 xrep_dinode_zap_symlink(ri, dip);
592 break;
593 case S_IFDIR:
594 /*
595 * Directories can't have a size larger than 32G. If the size
596 * is zero, reset it to an empty directory. Both of these
597 * conditions trigger dinode verifier errors, so there is no
598 * in-core state to reset.
599 */
600 if (size > XFS_DIR2_SPACE_SIZE)
601 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
602 else if (size == 0)
603 xrep_dinode_zap_dir(ri, dip);
604 break;
605 }
606}
607
608/* Fix extent size hints. */
609STATIC void
610xrep_dinode_extsize_hints(
611 struct xfs_scrub *sc,
612 struct xfs_dinode *dip)
613{
614 struct xfs_mount *mp = sc->mp;
615 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
616 uint16_t flags = be16_to_cpu(dip->di_flags);
617 uint16_t mode = be16_to_cpu(dip->di_mode);
618
619 xfs_failaddr_t fa;
620
621 trace_xrep_dinode_extsize_hints(sc, dip);
622
623 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
624 mode, flags);
625 if (fa) {
626 dip->di_extsize = 0;
627 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
628 XFS_DIFLAG_EXTSZINHERIT);
629 }
630
631 if (dip->di_version < 3)
632 return;
633
634 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
635 mode, flags, flags2);
636 if (fa) {
637 dip->di_cowextsize = 0;
638 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
639 }
640}
641
642/* Count extents and blocks for an inode given an rmap. */
643STATIC int
644xrep_dinode_walk_rmap(
645 struct xfs_btree_cur *cur,
646 const struct xfs_rmap_irec *rec,
647 void *priv)
648{
649 struct xrep_inode *ri = priv;
650 int error = 0;
651
652 if (xchk_should_terminate(ri->sc, &error))
653 return error;
654
655 /* We only care about this inode. */
656 if (rec->rm_owner != ri->sc->sm->sm_ino)
657 return 0;
658
659 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
660 ri->attr_blocks += rec->rm_blockcount;
661 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
662 ri->attr_extents++;
663
664 return 0;
665 }
666
667 ri->data_blocks += rec->rm_blockcount;
668 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
669 ri->data_extents++;
670
671 return 0;
672}
673
674/* Count extents and blocks for an inode from all AG rmap data. */
675STATIC int
676xrep_dinode_count_ag_rmaps(
677 struct xrep_inode *ri,
678 struct xfs_perag *pag)
679{
680 struct xfs_btree_cur *cur;
681 struct xfs_buf *agf;
682 int error;
683
684 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
685 if (error)
686 return error;
687
688 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
689 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
690 xfs_btree_del_cursor(cur, error);
691 xfs_trans_brelse(ri->sc->tp, agf);
692 return error;
693}
694
695/* Count extents and blocks for a given inode from all rmap data. */
696STATIC int
697xrep_dinode_count_rmaps(
698 struct xrep_inode *ri)
699{
700 struct xfs_perag *pag;
701 xfs_agnumber_t agno;
702 int error;
703
704 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
705 return -EOPNOTSUPP;
706
707 for_each_perag(ri->sc->mp, agno, pag) {
708 error = xrep_dinode_count_ag_rmaps(ri, pag);
709 if (error) {
710 xfs_perag_rele(pag);
711 return error;
712 }
713 }
714
715 /* Can't have extents on both the rt and the data device. */
716 if (ri->data_extents && ri->rt_extents)
717 return -EFSCORRUPTED;
718
719 trace_xrep_dinode_count_rmaps(ri->sc,
720 ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
721 ri->data_extents, ri->rt_extents, ri->attr_extents);
722 return 0;
723}
724
725/* Return true if this extents-format ifork looks like garbage. */
726STATIC bool
727xrep_dinode_bad_extents_fork(
728 struct xfs_scrub *sc,
729 struct xfs_dinode *dip,
730 unsigned int dfork_size,
731 int whichfork)
732{
733 struct xfs_bmbt_irec new;
734 struct xfs_bmbt_rec *dp;
735 xfs_extnum_t nex;
736 bool isrt;
737 unsigned int i;
738
739 nex = xfs_dfork_nextents(dip, whichfork);
740 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
741 return true;
742
743 dp = XFS_DFORK_PTR(dip, whichfork);
744
745 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
746 for (i = 0; i < nex; i++, dp++) {
747 xfs_failaddr_t fa;
748
749 xfs_bmbt_disk_get_all(dp, &new);
750 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
751 &new);
752 if (fa)
753 return true;
754 }
755
756 return false;
757}
758
759/* Return true if this btree-format ifork looks like garbage. */
760STATIC bool
761xrep_dinode_bad_bmbt_fork(
762 struct xfs_scrub *sc,
763 struct xfs_dinode *dip,
764 unsigned int dfork_size,
765 int whichfork)
766{
767 struct xfs_bmdr_block *dfp;
768 xfs_extnum_t nex;
769 unsigned int i;
770 unsigned int dmxr;
771 unsigned int nrecs;
772 unsigned int level;
773
774 nex = xfs_dfork_nextents(dip, whichfork);
775 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
776 return true;
777
778 if (dfork_size < sizeof(struct xfs_bmdr_block))
779 return true;
780
781 dfp = XFS_DFORK_PTR(dip, whichfork);
782 nrecs = be16_to_cpu(dfp->bb_numrecs);
783 level = be16_to_cpu(dfp->bb_level);
784
785 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
786 return true;
787 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
788 return true;
789
790 dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
791 for (i = 1; i <= nrecs; i++) {
792 struct xfs_bmbt_key *fkp;
793 xfs_bmbt_ptr_t *fpp;
794 xfs_fileoff_t fileoff;
795 xfs_fsblock_t fsbno;
796
797 fkp = XFS_BMDR_KEY_ADDR(dfp, i);
798 fileoff = be64_to_cpu(fkp->br_startoff);
799 if (!xfs_verify_fileoff(sc->mp, fileoff))
800 return true;
801
802 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
803 fsbno = be64_to_cpu(*fpp);
804 if (!xfs_verify_fsbno(sc->mp, fsbno))
805 return true;
806 }
807
808 return false;
809}
810
811/*
812 * Check the data fork for things that will fail the ifork verifiers or the
813 * ifork formatters.
814 */
815STATIC bool
816xrep_dinode_check_dfork(
817 struct xfs_scrub *sc,
818 struct xfs_dinode *dip,
819 uint16_t mode)
820{
821 void *dfork_ptr;
822 int64_t data_size;
823 unsigned int fmt;
824 unsigned int dfork_size;
825
826 /*
827 * Verifier functions take signed int64_t, so check for bogus negative
828 * values first.
829 */
830 data_size = be64_to_cpu(dip->di_size);
831 if (data_size < 0)
832 return true;
833
834 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
835 switch (mode & S_IFMT) {
836 case S_IFIFO:
837 case S_IFCHR:
838 case S_IFBLK:
839 case S_IFSOCK:
840 if (fmt != XFS_DINODE_FMT_DEV)
841 return true;
842 break;
843 case S_IFREG:
844 if (fmt == XFS_DINODE_FMT_LOCAL)
845 return true;
846 fallthrough;
847 case S_IFLNK:
848 case S_IFDIR:
849 switch (fmt) {
850 case XFS_DINODE_FMT_LOCAL:
851 case XFS_DINODE_FMT_EXTENTS:
852 case XFS_DINODE_FMT_BTREE:
853 break;
854 default:
855 return true;
856 }
857 break;
858 default:
859 return true;
860 }
861
862 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
863 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
864
865 switch (fmt) {
866 case XFS_DINODE_FMT_DEV:
867 break;
868 case XFS_DINODE_FMT_LOCAL:
869 /* dir/symlink structure cannot be larger than the fork */
870 if (data_size > dfork_size)
871 return true;
872 /* directory structure must pass verification. */
873 if (S_ISDIR(mode) &&
874 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
875 return true;
876 /* symlink structure must pass verification. */
877 if (S_ISLNK(mode) &&
878 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
879 return true;
880 break;
881 case XFS_DINODE_FMT_EXTENTS:
882 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
883 XFS_DATA_FORK))
884 return true;
885 break;
886 case XFS_DINODE_FMT_BTREE:
887 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
888 XFS_DATA_FORK))
889 return true;
890 break;
891 default:
892 return true;
893 }
894
895 return false;
896}
897
898static void
899xrep_dinode_set_data_nextents(
900 struct xfs_dinode *dip,
901 xfs_extnum_t nextents)
902{
903 if (xfs_dinode_has_large_extent_counts(dip))
904 dip->di_big_nextents = cpu_to_be64(nextents);
905 else
906 dip->di_nextents = cpu_to_be32(nextents);
907}
908
909static void
910xrep_dinode_set_attr_nextents(
911 struct xfs_dinode *dip,
912 xfs_extnum_t nextents)
913{
914 if (xfs_dinode_has_large_extent_counts(dip))
915 dip->di_big_anextents = cpu_to_be32(nextents);
916 else
917 dip->di_anextents = cpu_to_be16(nextents);
918}
919
920/* Reset the data fork to something sane. */
921STATIC void
922xrep_dinode_zap_dfork(
923 struct xrep_inode *ri,
924 struct xfs_dinode *dip,
925 uint16_t mode)
926{
927 struct xfs_scrub *sc = ri->sc;
928
929 trace_xrep_dinode_zap_dfork(sc, dip);
930
931 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
932
933 xrep_dinode_set_data_nextents(dip, 0);
934 ri->data_blocks = 0;
935 ri->rt_blocks = 0;
936
937 /* Special files always get reset to DEV */
938 switch (mode & S_IFMT) {
939 case S_IFIFO:
940 case S_IFCHR:
941 case S_IFBLK:
942 case S_IFSOCK:
943 dip->di_format = XFS_DINODE_FMT_DEV;
944 dip->di_size = 0;
945 return;
946 }
947
948 /*
949 * If we have data extents, reset to an empty map and hope the user
950 * will run the bmapbtd checker next.
951 */
952 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
953 dip->di_format = XFS_DINODE_FMT_EXTENTS;
954 return;
955 }
956
957 /* Otherwise, reset the local format to the minimum. */
958 switch (mode & S_IFMT) {
959 case S_IFLNK:
960 xrep_dinode_zap_symlink(ri, dip);
961 break;
962 case S_IFDIR:
963 xrep_dinode_zap_dir(ri, dip);
964 break;
965 }
966}
967
968/*
969 * Check the attr fork for things that will fail the ifork verifiers or the
970 * ifork formatters.
971 */
972STATIC bool
973xrep_dinode_check_afork(
974 struct xfs_scrub *sc,
975 struct xfs_dinode *dip)
976{
977 struct xfs_attr_sf_hdr *afork_ptr;
978 size_t attr_size;
979 unsigned int afork_size;
980
981 if (XFS_DFORK_BOFF(dip) == 0)
982 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
983 xfs_dfork_attr_extents(dip) != 0;
984
985 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
986 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
987
988 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
989 case XFS_DINODE_FMT_LOCAL:
990 /* Fork has to be large enough to extract the xattr size. */
991 if (afork_size < sizeof(struct xfs_attr_sf_hdr))
992 return true;
993
994 /* xattr structure cannot be larger than the fork */
995 attr_size = be16_to_cpu(afork_ptr->totsize);
996 if (attr_size > afork_size)
997 return true;
998
999 /* xattr structure must pass verification. */
1000 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1001 case XFS_DINODE_FMT_EXTENTS:
1002 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1003 XFS_ATTR_FORK))
1004 return true;
1005 break;
1006 case XFS_DINODE_FMT_BTREE:
1007 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1008 XFS_ATTR_FORK))
1009 return true;
1010 break;
1011 default:
1012 return true;
1013 }
1014
1015 return false;
1016}
1017
1018/*
1019 * Reset the attr fork to empty. Since the attr fork could have contained
1020 * ACLs, make the file readable only by root.
1021 */
1022STATIC void
1023xrep_dinode_zap_afork(
1024 struct xrep_inode *ri,
1025 struct xfs_dinode *dip,
1026 uint16_t mode)
1027{
1028 struct xfs_scrub *sc = ri->sc;
1029
1030 trace_xrep_dinode_zap_afork(sc, dip);
1031
1032 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
1033
1034 dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1035 xrep_dinode_set_attr_nextents(dip, 0);
1036 ri->attr_blocks = 0;
1037
1038 /*
1039 * If the data fork is in btree format, removing the attr fork entirely
1040 * might cause verifier failures if the next level down in the bmbt
1041 * could now fit in the data fork area.
1042 */
1043 if (dip->di_format != XFS_DINODE_FMT_BTREE)
1044 dip->di_forkoff = 0;
1045 dip->di_mode = cpu_to_be16(mode & ~0777);
1046 dip->di_uid = 0;
1047 dip->di_gid = 0;
1048}
1049
1050/* Make sure the fork offset is a sensible value. */
1051STATIC void
1052xrep_dinode_ensure_forkoff(
1053 struct xrep_inode *ri,
1054 struct xfs_dinode *dip,
1055 uint16_t mode)
1056{
1057 struct xfs_bmdr_block *bmdr;
1058 struct xfs_scrub *sc = ri->sc;
1059 xfs_extnum_t attr_extents, data_extents;
1060 size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
1061 unsigned int lit_sz = XFS_LITINO(sc->mp);
1062 unsigned int afork_min, dfork_min;
1063
1064 trace_xrep_dinode_ensure_forkoff(sc, dip);
1065
1066 /*
1067 * Before calling this function, xrep_dinode_core ensured that both
1068 * forks actually fit inside their respective literal areas. If this
1069 * was not the case, the fork was reset to FMT_EXTENTS with zero
1070 * records. If the rmapbt scan found attr or data fork blocks, this
1071 * will be noted in the dinode_stats, and we must leave enough room
1072 * for the bmap repair code to reconstruct the mapping structure.
1073 *
1074 * First, compute the minimum space required for the attr fork.
1075 */
1076 switch (dip->di_aformat) {
1077 case XFS_DINODE_FMT_LOCAL:
1078 /*
1079 * If we still have a shortform xattr structure at all, that
1080 * means the attr fork area was exactly large enough to fit
1081 * the sf structure.
1082 */
1083 afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1084 break;
1085 case XFS_DINODE_FMT_EXTENTS:
1086 attr_extents = xfs_dfork_attr_extents(dip);
1087 if (attr_extents) {
1088 /*
1089 * We must maintain sufficient space to hold the entire
1090 * extent map array in the data fork. Note that we
1091 * previously zapped the fork if it had no chance of
1092 * fitting in the inode.
1093 */
1094 afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
1095 } else if (ri->attr_extents > 0) {
1096 /*
1097 * The attr fork thinks it has zero extents, but we
1098 * found some xattr extents. We need to leave enough
1099 * empty space here so that the incore attr fork will
1100 * get created (and hence trigger the attr fork bmap
1101 * repairer).
1102 */
1103 afork_min = bmdr_minsz;
1104 } else {
1105 /* No extents on disk or found in rmapbt. */
1106 afork_min = 0;
1107 }
1108 break;
1109 case XFS_DINODE_FMT_BTREE:
1110 /* Must have space for btree header and key/pointers. */
1111 bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1112 afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1113 break;
1114 default:
1115 /* We should never see any other formats. */
1116 afork_min = 0;
1117 break;
1118 }
1119
1120 /* Compute the minimum space required for the data fork. */
1121 switch (dip->di_format) {
1122 case XFS_DINODE_FMT_DEV:
1123 dfork_min = sizeof(__be32);
1124 break;
1125 case XFS_DINODE_FMT_UUID:
1126 dfork_min = sizeof(uuid_t);
1127 break;
1128 case XFS_DINODE_FMT_LOCAL:
1129 /*
1130 * If we still have a shortform data fork at all, that means
1131 * the data fork area was large enough to fit whatever was in
1132 * there.
1133 */
1134 dfork_min = be64_to_cpu(dip->di_size);
1135 break;
1136 case XFS_DINODE_FMT_EXTENTS:
1137 data_extents = xfs_dfork_data_extents(dip);
1138 if (data_extents) {
1139 /*
1140 * We must maintain sufficient space to hold the entire
1141 * extent map array in the data fork. Note that we
1142 * previously zapped the fork if it had no chance of
1143 * fitting in the inode.
1144 */
1145 dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
1146 } else if (ri->data_extents > 0 || ri->rt_extents > 0) {
1147 /*
1148 * The data fork thinks it has zero extents, but we
1149 * found some data extents. We need to leave enough
1150 * empty space here so that the data fork bmap repair
1151 * will recover the mappings.
1152 */
1153 dfork_min = bmdr_minsz;
1154 } else {
1155 /* No extents on disk or found in rmapbt. */
1156 dfork_min = 0;
1157 }
1158 break;
1159 case XFS_DINODE_FMT_BTREE:
1160 /* Must have space for btree header and key/pointers. */
1161 bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
1162 dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1163 break;
1164 default:
1165 dfork_min = 0;
1166 break;
1167 }
1168
1169 /*
1170 * Round all values up to the nearest 8 bytes, because that is the
1171 * precision of di_forkoff.
1172 */
1173 afork_min = roundup(afork_min, 8);
1174 dfork_min = roundup(dfork_min, 8);
1175 bmdr_minsz = roundup(bmdr_minsz, 8);
1176
1177 ASSERT(dfork_min <= lit_sz);
1178 ASSERT(afork_min <= lit_sz);
1179
1180 /*
1181 * If the data fork was zapped and we don't have enough space for the
1182 * recovery fork, move the attr fork up.
1183 */
1184 if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
1185 xfs_dfork_data_extents(dip) == 0 &&
1186 (ri->data_extents > 0 || ri->rt_extents > 0) &&
1187 bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
1188 if (bmdr_minsz + afork_min > lit_sz) {
1189 /*
1190 * The attr for and the stub fork we need to recover
1191 * the data fork won't both fit. Zap the attr fork.
1192 */
1193 xrep_dinode_zap_afork(ri, dip, mode);
1194 afork_min = bmdr_minsz;
1195 } else {
1196 void *before, *after;
1197
1198 /* Otherwise, just slide the attr fork up. */
1199 before = XFS_DFORK_APTR(dip);
1200 dip->di_forkoff = bmdr_minsz >> 3;
1201 after = XFS_DFORK_APTR(dip);
1202 memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
1203 }
1204 }
1205
1206 /*
1207 * If the attr fork was zapped and we don't have enough space for the
1208 * recovery fork, move the attr fork down.
1209 */
1210 if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
1211 xfs_dfork_attr_extents(dip) == 0 &&
1212 ri->attr_extents > 0 &&
1213 bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
1214 if (dip->di_format == XFS_DINODE_FMT_BTREE) {
1215 /*
1216 * If the data fork is in btree format then we can't
1217 * adjust forkoff because that runs the risk of
1218 * violating the extents/btree format transition rules.
1219 */
1220 } else if (bmdr_minsz + dfork_min > lit_sz) {
1221 /*
1222 * If we can't move the attr fork, too bad, we lose the
1223 * attr fork and leak its blocks.
1224 */
1225 xrep_dinode_zap_afork(ri, dip, mode);
1226 } else {
1227 /*
1228 * Otherwise, just slide the attr fork down. The attr
1229 * fork is empty, so we don't have any old contents to
1230 * move here.
1231 */
1232 dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
1233 }
1234 }
1235}
1236
1237/*
1238 * Zap the data/attr forks if we spot anything that isn't going to pass the
1239 * ifork verifiers or the ifork formatters, because we need to get the inode
1240 * into good enough shape that the higher level repair functions can run.
1241 */
1242STATIC void
1243xrep_dinode_zap_forks(
1244 struct xrep_inode *ri,
1245 struct xfs_dinode *dip)
1246{
1247 struct xfs_scrub *sc = ri->sc;
1248 xfs_extnum_t data_extents;
1249 xfs_extnum_t attr_extents;
1250 xfs_filblks_t nblocks;
1251 uint16_t mode;
1252 bool zap_datafork = false;
1253 bool zap_attrfork = ri->zap_acls;
1254
1255 trace_xrep_dinode_zap_forks(sc, dip);
1256
1257 mode = be16_to_cpu(dip->di_mode);
1258
1259 data_extents = xfs_dfork_data_extents(dip);
1260 attr_extents = xfs_dfork_attr_extents(dip);
1261 nblocks = be64_to_cpu(dip->di_nblocks);
1262
1263 /* Inode counters don't make sense? */
1264 if (data_extents > nblocks)
1265 zap_datafork = true;
1266 if (attr_extents > nblocks)
1267 zap_attrfork = true;
1268 if (data_extents + attr_extents > nblocks)
1269 zap_datafork = zap_attrfork = true;
1270
1271 if (!zap_datafork)
1272 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1273 if (!zap_attrfork)
1274 zap_attrfork = xrep_dinode_check_afork(sc, dip);
1275
1276 /* Zap whatever's bad. */
1277 if (zap_attrfork)
1278 xrep_dinode_zap_afork(ri, dip, mode);
1279 if (zap_datafork)
1280 xrep_dinode_zap_dfork(ri, dip, mode);
1281 xrep_dinode_ensure_forkoff(ri, dip, mode);
1282
1283 /*
1284 * Zero di_nblocks if we don't have any extents at all to satisfy the
1285 * buffer verifier.
1286 */
1287 data_extents = xfs_dfork_data_extents(dip);
1288 attr_extents = xfs_dfork_attr_extents(dip);
1289 if (data_extents + attr_extents == 0)
1290 dip->di_nblocks = 0;
1291}
1292
1293/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
1294STATIC int
1295xrep_dinode_core(
1296 struct xrep_inode *ri)
1297{
1298 struct xfs_scrub *sc = ri->sc;
1299 struct xfs_buf *bp;
1300 struct xfs_dinode *dip;
1301 xfs_ino_t ino = sc->sm->sm_ino;
1302 int error;
1303 int iget_error;
1304
1305 /* Figure out what this inode had mapped in both forks. */
1306 error = xrep_dinode_count_rmaps(ri);
1307 if (error)
1308 return error;
1309
1310 /* Read the inode cluster buffer. */
1311 error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1312 ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1313 NULL);
1314 if (error)
1315 return error;
1316
1317 /* Make sure we can pass the inode buffer verifier. */
1318 xrep_dinode_buf(sc, bp);
1319 bp->b_ops = &xfs_inode_buf_ops;
1320
1321 /* Fix everything the verifier will complain about. */
1322 dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1323 xrep_dinode_header(sc, dip);
1324 iget_error = xrep_dinode_mode(ri, dip);
1325 if (iget_error)
1326 goto write;
1327 xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
1328 xrep_dinode_size(ri, dip);
1329 xrep_dinode_extsize_hints(sc, dip);
1330 xrep_dinode_zap_forks(ri, dip);
1331
1332write:
1333 /* Write out the inode. */
1334 trace_xrep_dinode_fixed(sc, dip);
1335 xfs_dinode_calc_crc(sc->mp, dip);
1336 xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1337 xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1338 ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
1339
1340 /*
1341 * In theory, we've fixed the ondisk inode record enough that we should
1342 * be able to load the inode into the cache. Try to iget that inode
1343 * now while we hold the AGI and the inode cluster buffer and take the
1344 * IOLOCK so that we can continue with repairs without anyone else
1345 * accessing the inode. If iget fails, we still need to commit the
1346 * changes.
1347 */
1348 if (!iget_error)
1349 iget_error = xchk_iget(sc, ino, &sc->ip);
1350 if (!iget_error)
1351 xchk_ilock(sc, XFS_IOLOCK_EXCL);
1352
1353 /*
1354 * Commit the inode cluster buffer updates and drop the AGI buffer that
1355 * we've been holding since scrub setup. From here on out, repairs
1356 * deal only with the cached inode.
1357 */
1358 error = xrep_trans_commit(sc);
1359 if (error)
1360 return error;
1361
1362 if (iget_error)
1363 return iget_error;
1364
1365 error = xchk_trans_alloc(sc, 0);
1366 if (error)
1367 return error;
1368
1369 error = xrep_ino_dqattach(sc);
1370 if (error)
1371 return error;
1372
1373 xchk_ilock(sc, XFS_ILOCK_EXCL);
1374 if (ri->ino_sick_mask)
1375 xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1376 return 0;
1377}
1378
1379/* Fix everything xfs_dinode_verify cares about. */
1380STATIC int
1381xrep_dinode_problems(
1382 struct xrep_inode *ri)
1383{
1384 struct xfs_scrub *sc = ri->sc;
1385 int error;
1386
1387 error = xrep_dinode_core(ri);
1388 if (error)
1389 return error;
1390
1391 /* We had to fix a totally busted inode, schedule quotacheck. */
1392 if (XFS_IS_UQUOTA_ON(sc->mp))
1393 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1394 if (XFS_IS_GQUOTA_ON(sc->mp))
1395 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1396 if (XFS_IS_PQUOTA_ON(sc->mp))
1397 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1398
1399 return 0;
1400}
1401
1402/*
1403 * Fix problems that the verifiers don't care about. In general these are
1404 * errors that don't cause problems elsewhere in the kernel that we can easily
1405 * detect, so we don't check them all that rigorously.
1406 */
1407
1408/* Make sure block and extent counts are ok. */
1409STATIC int
1410xrep_inode_blockcounts(
1411 struct xfs_scrub *sc)
1412{
1413 struct xfs_ifork *ifp;
1414 xfs_filblks_t count;
1415 xfs_filblks_t acount;
1416 xfs_extnum_t nextents;
1417 int error;
1418
1419 trace_xrep_inode_blockcounts(sc);
1420
1421 /* Set data fork counters from the data fork mappings. */
1422 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1423 &nextents, &count);
1424 if (error)
1425 return error;
1426 if (xfs_is_reflink_inode(sc->ip)) {
1427 /*
1428 * data fork blockcount can exceed physical storage if a user
1429 * reflinks the same block over and over again.
1430 */
1431 ;
1432 } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1433 if (count >= sc->mp->m_sb.sb_rblocks)
1434 return -EFSCORRUPTED;
1435 } else {
1436 if (count >= sc->mp->m_sb.sb_dblocks)
1437 return -EFSCORRUPTED;
1438 }
1439 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1440 if (error)
1441 return error;
1442 sc->ip->i_df.if_nextents = nextents;
1443
1444 /* Set attr fork counters from the attr fork mappings. */
1445 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1446 if (ifp) {
1447 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1448 &nextents, &acount);
1449 if (error)
1450 return error;
1451 if (count >= sc->mp->m_sb.sb_dblocks)
1452 return -EFSCORRUPTED;
1453 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1454 nextents);
1455 if (error)
1456 return error;
1457 ifp->if_nextents = nextents;
1458 } else {
1459 acount = 0;
1460 }
1461
1462 sc->ip->i_nblocks = count + acount;
1463 return 0;
1464}
1465
1466/* Check for invalid uid/gid/prid. */
1467STATIC void
1468xrep_inode_ids(
1469 struct xfs_scrub *sc)
1470{
1471 bool dirty = false;
1472
1473 trace_xrep_inode_ids(sc);
1474
1475 if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1476 i_uid_write(VFS_I(sc->ip), 0);
1477 dirty = true;
1478 if (XFS_IS_UQUOTA_ON(sc->mp))
1479 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1480 }
1481
1482 if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1483 i_gid_write(VFS_I(sc->ip), 0);
1484 dirty = true;
1485 if (XFS_IS_GQUOTA_ON(sc->mp))
1486 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1487 }
1488
1489 if (sc->ip->i_projid == -1U) {
1490 sc->ip->i_projid = 0;
1491 dirty = true;
1492 if (XFS_IS_PQUOTA_ON(sc->mp))
1493 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1494 }
1495
1496 /* strip setuid/setgid if we touched any of the ids */
1497 if (dirty)
1498 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1499}
1500
1501static inline void
1502xrep_clamp_timestamp(
1503 struct xfs_inode *ip,
1504 struct timespec64 *ts)
1505{
1506 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1507 *ts = timestamp_truncate(*ts, VFS_I(ip));
1508}
1509
1510/* Nanosecond counters can't have more than 1 billion. */
1511STATIC void
1512xrep_inode_timestamps(
1513 struct xfs_inode *ip)
1514{
1515 struct timespec64 tstamp;
1516 struct inode *inode = VFS_I(ip);
1517
1518 tstamp = inode_get_atime(inode);
1519 xrep_clamp_timestamp(ip, &tstamp);
1520 inode_set_atime_to_ts(inode, tstamp);
1521
1522 tstamp = inode_get_mtime(inode);
1523 xrep_clamp_timestamp(ip, &tstamp);
1524 inode_set_mtime_to_ts(inode, tstamp);
1525
1526 tstamp = inode_get_ctime(inode);
1527 xrep_clamp_timestamp(ip, &tstamp);
1528 inode_set_ctime_to_ts(inode, tstamp);
1529
1530 xrep_clamp_timestamp(ip, &ip->i_crtime);
1531}
1532
1533/* Fix inode flags that don't make sense together. */
1534STATIC void
1535xrep_inode_flags(
1536 struct xfs_scrub *sc)
1537{
1538 uint16_t mode;
1539
1540 trace_xrep_inode_flags(sc);
1541
1542 mode = VFS_I(sc->ip)->i_mode;
1543
1544 /* Clear junk flags */
1545 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1546 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1547
1548 /* NEWRTBM only applies to realtime bitmaps */
1549 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1550 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1551 else
1552 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1553
1554 /* These only make sense for directories. */
1555 if (!S_ISDIR(mode))
1556 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1557 XFS_DIFLAG_EXTSZINHERIT |
1558 XFS_DIFLAG_PROJINHERIT |
1559 XFS_DIFLAG_NOSYMLINKS);
1560
1561 /* These only make sense for files. */
1562 if (!S_ISREG(mode))
1563 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1564 XFS_DIFLAG_EXTSIZE);
1565
1566 /* These only make sense for non-rt files. */
1567 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1568 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1569
1570 /* Immutable and append only? Drop the append. */
1571 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1572 (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1573 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1574
1575 /* Clear junk flags. */
1576 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1577 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1578
1579 /* No reflink flag unless we support it and it's a file. */
1580 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1581 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1582
1583 /* DAX only applies to files and dirs. */
1584 if (!(S_ISREG(mode) || S_ISDIR(mode)))
1585 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1586
1587 /* No reflink files on the realtime device. */
1588 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1589 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1590}
1591
1592/*
1593 * Fix size problems with block/node format directories. If we fail to find
1594 * the extent list, just bail out and let the bmapbtd repair functions clean
1595 * up that mess.
1596 */
1597STATIC void
1598xrep_inode_blockdir_size(
1599 struct xfs_scrub *sc)
1600{
1601 struct xfs_iext_cursor icur;
1602 struct xfs_bmbt_irec got;
1603 struct xfs_ifork *ifp;
1604 xfs_fileoff_t off;
1605 int error;
1606
1607 trace_xrep_inode_blockdir_size(sc);
1608
1609 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1610 if (error)
1611 return;
1612
1613 /* Find the last block before 32G; this is the dir size. */
1614 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1615 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1616 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1617 /* zero-extents directory? */
1618 return;
1619 }
1620
1621 off = got.br_startoff + got.br_blockcount;
1622 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1623 XFS_FSB_TO_B(sc->mp, off));
1624}
1625
1626/* Fix size problems with short format directories. */
1627STATIC void
1628xrep_inode_sfdir_size(
1629 struct xfs_scrub *sc)
1630{
1631 struct xfs_ifork *ifp;
1632
1633 trace_xrep_inode_sfdir_size(sc);
1634
1635 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1636 sc->ip->i_disk_size = ifp->if_bytes;
1637}
1638
1639/*
1640 * Fix any irregularities in a directory inode's size now that we can iterate
1641 * extent maps and access other regular inode data.
1642 */
1643STATIC void
1644xrep_inode_dir_size(
1645 struct xfs_scrub *sc)
1646{
1647 trace_xrep_inode_dir_size(sc);
1648
1649 switch (sc->ip->i_df.if_format) {
1650 case XFS_DINODE_FMT_EXTENTS:
1651 case XFS_DINODE_FMT_BTREE:
1652 xrep_inode_blockdir_size(sc);
1653 break;
1654 case XFS_DINODE_FMT_LOCAL:
1655 xrep_inode_sfdir_size(sc);
1656 break;
1657 }
1658}
1659
1660/* Fix extent size hint problems. */
1661STATIC void
1662xrep_inode_extsize(
1663 struct xfs_scrub *sc)
1664{
1665 /* Fix misaligned extent size hints on a directory. */
1666 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1667 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1668 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1669 sc->ip->i_extsize = 0;
1670 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1671 }
1672}
1673
1674/* Fix any irregularities in an inode that the verifiers don't catch. */
1675STATIC int
1676xrep_inode_problems(
1677 struct xfs_scrub *sc)
1678{
1679 int error;
1680
1681 error = xrep_inode_blockcounts(sc);
1682 if (error)
1683 return error;
1684 xrep_inode_timestamps(sc->ip);
1685 xrep_inode_flags(sc);
1686 xrep_inode_ids(sc);
1687 /*
1688 * We can now do a better job fixing the size of a directory now that
1689 * we can scan the data fork extents than we could in xrep_dinode_size.
1690 */
1691 if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1692 xrep_inode_dir_size(sc);
1693 xrep_inode_extsize(sc);
1694
1695 trace_xrep_inode_fixed(sc);
1696 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1697 return xrep_roll_trans(sc);
1698}
1699
1700/* Repair an inode's fields. */
1701int
1702xrep_inode(
1703 struct xfs_scrub *sc)
1704{
1705 int error = 0;
1706
1707 /*
1708 * No inode? That means we failed the _iget verifiers. Repair all
1709 * the things that the inode verifiers care about, then retry _iget.
1710 */
1711 if (!sc->ip) {
1712 struct xrep_inode *ri = sc->buf;
1713
1714 ASSERT(ri != NULL);
1715
1716 error = xrep_dinode_problems(ri);
1717 if (error == -EBUSY) {
1718 /*
1719 * Directory scan to recover inode mode encountered a
1720 * busy inode, so we did not continue repairing things.
1721 */
1722 return 0;
1723 }
1724 if (error)
1725 return error;
1726
1727 /* By this point we had better have a working incore inode. */
1728 if (!sc->ip)
1729 return -EFSCORRUPTED;
1730 }
1731
1732 xfs_trans_ijoin(sc->tp, sc->ip, 0);
1733
1734 /* If we found corruption of any kind, try to fix it. */
1735 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1736 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1737 error = xrep_inode_problems(sc);
1738 if (error)
1739 return error;
1740 }
1741
1742 /* See if we can clear the reflink flag. */
1743 if (xfs_is_reflink_inode(sc->ip)) {
1744 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1745 if (error)
1746 return error;
1747 }
1748
1749 return xrep_defer_finish(sc);
1750}
1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6#include "xfs.h"
7#include "xfs_fs.h"
8#include "xfs_shared.h"
9#include "xfs_format.h"
10#include "xfs_trans_resv.h"
11#include "xfs_mount.h"
12#include "xfs_defer.h"
13#include "xfs_btree.h"
14#include "xfs_bit.h"
15#include "xfs_log_format.h"
16#include "xfs_trans.h"
17#include "xfs_sb.h"
18#include "xfs_inode.h"
19#include "xfs_icache.h"
20#include "xfs_inode_buf.h"
21#include "xfs_inode_fork.h"
22#include "xfs_ialloc.h"
23#include "xfs_da_format.h"
24#include "xfs_reflink.h"
25#include "xfs_alloc.h"
26#include "xfs_rmap.h"
27#include "xfs_rmap_btree.h"
28#include "xfs_bmap.h"
29#include "xfs_bmap_btree.h"
30#include "xfs_bmap_util.h"
31#include "xfs_dir2.h"
32#include "xfs_dir2_priv.h"
33#include "xfs_quota_defs.h"
34#include "xfs_quota.h"
35#include "xfs_ag.h"
36#include "xfs_rtbitmap.h"
37#include "xfs_attr_leaf.h"
38#include "xfs_log_priv.h"
39#include "xfs_health.h"
40#include "scrub/xfs_scrub.h"
41#include "scrub/scrub.h"
42#include "scrub/common.h"
43#include "scrub/btree.h"
44#include "scrub/trace.h"
45#include "scrub/repair.h"
46
47/*
48 * Inode Record Repair
49 * ===================
50 *
51 * Roughly speaking, inode problems can be classified based on whether or not
52 * they trip the dinode verifiers. If those trip, then we won't be able to
53 * xfs_iget ourselves the inode.
54 *
55 * Therefore, the xrep_dinode_* functions fix anything that will cause the
56 * inode buffer verifier or the dinode verifier. The xrep_inode_* functions
57 * fix things on live incore inodes. The inode repair functions make decisions
58 * with security and usability implications when reviving a file:
59 *
60 * - Files with zero di_mode or a garbage di_mode are converted to regular file
61 * that only root can read. This file may not actually contain user data,
62 * if the file was not previously a regular file. Setuid and setgid bits
63 * are cleared.
64 *
65 * - Zero-size directories can be truncated to look empty. It is necessary to
66 * run the bmapbtd and directory repair functions to fully rebuild the
67 * directory.
68 *
69 * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
70 * to run the bmapbtd and symlink repair functions to salvage the symlink.
71 *
72 * - Invalid extent size hints will be removed.
73 *
74 * - Quotacheck will be scheduled if we repaired an inode that was so badly
75 * damaged that the ondisk inode had to be rebuilt.
76 *
77 * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
78 * Setuid and setgid bits are cleared.
79 *
80 * - Data and attr forks are reset to extents format with zero extents if the
81 * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
82 * repair functions to recover the space mapping.
83 *
84 * - ACLs will not be recovered if the attr fork is zapped or the extended
85 * attribute structure itself requires salvaging.
86 *
87 * - If the attr fork is zapped, the user and group ids are reset to root and
88 * the setuid and setgid bits are removed.
89 */
90
91/*
92 * All the information we need to repair the ondisk inode if we can't iget the
93 * incore inode. We don't allocate this buffer unless we're going to perform
94 * a repair to the ondisk inode cluster buffer.
95 */
96struct xrep_inode {
97 /* Inode mapping that we saved from the initial lookup attempt. */
98 struct xfs_imap imap;
99
100 struct xfs_scrub *sc;
101
102 /* Blocks in use on the data device by data extents or bmbt blocks. */
103 xfs_rfsblock_t data_blocks;
104
105 /* Blocks in use on the rt device. */
106 xfs_rfsblock_t rt_blocks;
107
108 /* Blocks in use by the attr fork. */
109 xfs_rfsblock_t attr_blocks;
110
111 /* Number of data device extents for the data fork. */
112 xfs_extnum_t data_extents;
113
114 /*
115 * Number of realtime device extents for the data fork. If
116 * data_extents and rt_extents indicate that the data fork has extents
117 * on both devices, we'll just back away slowly.
118 */
119 xfs_extnum_t rt_extents;
120
121 /* Number of (data device) extents for the attr fork. */
122 xfs_aextnum_t attr_extents;
123
124 /* Sick state to set after zapping parts of the inode. */
125 unsigned int ino_sick_mask;
126
127 /* Must we remove all access from this file? */
128 bool zap_acls;
129};
130
131/*
132 * Setup function for inode repair. @imap contains the ondisk inode mapping
133 * information so that we can correct the ondisk inode cluster buffer if
134 * necessary to make iget work.
135 */
136int
137xrep_setup_inode(
138 struct xfs_scrub *sc,
139 const struct xfs_imap *imap)
140{
141 struct xrep_inode *ri;
142
143 sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
144 if (!sc->buf)
145 return -ENOMEM;
146
147 ri = sc->buf;
148 memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
149 ri->sc = sc;
150 return 0;
151}
152
153/*
154 * Make sure this ondisk inode can pass the inode buffer verifier. This is
155 * not the same as the dinode verifier.
156 */
157STATIC void
158xrep_dinode_buf_core(
159 struct xfs_scrub *sc,
160 struct xfs_buf *bp,
161 unsigned int ioffset)
162{
163 struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
164 struct xfs_trans *tp = sc->tp;
165 struct xfs_mount *mp = sc->mp;
166 xfs_agino_t agino;
167 bool crc_ok = false;
168 bool magic_ok = false;
169 bool unlinked_ok = false;
170
171 agino = be32_to_cpu(dip->di_next_unlinked);
172
173 if (xfs_verify_agino_or_null(bp->b_pag, agino))
174 unlinked_ok = true;
175
176 if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
177 xfs_dinode_good_version(mp, dip->di_version))
178 magic_ok = true;
179
180 if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
181 XFS_DINODE_CRC_OFF))
182 crc_ok = true;
183
184 if (magic_ok && unlinked_ok && crc_ok)
185 return;
186
187 if (!magic_ok) {
188 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
189 dip->di_version = 3;
190 }
191 if (!unlinked_ok)
192 dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
193 xfs_dinode_calc_crc(mp, dip);
194 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
195 xfs_trans_log_buf(tp, bp, ioffset,
196 ioffset + sizeof(struct xfs_dinode) - 1);
197}
198
199/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
200STATIC void
201xrep_dinode_buf(
202 struct xfs_scrub *sc,
203 struct xfs_buf *bp)
204{
205 struct xfs_mount *mp = sc->mp;
206 int i;
207 int ni;
208
209 ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
210 for (i = 0; i < ni; i++)
211 xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
212}
213
214/* Reinitialize things that never change in an inode. */
215STATIC void
216xrep_dinode_header(
217 struct xfs_scrub *sc,
218 struct xfs_dinode *dip)
219{
220 trace_xrep_dinode_header(sc, dip);
221
222 dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
223 if (!xfs_dinode_good_version(sc->mp, dip->di_version))
224 dip->di_version = 3;
225 dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
226 uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
227 dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
228}
229
230/* Turn di_mode into /something/ recognizable. */
231STATIC void
232xrep_dinode_mode(
233 struct xrep_inode *ri,
234 struct xfs_dinode *dip)
235{
236 struct xfs_scrub *sc = ri->sc;
237 uint16_t mode = be16_to_cpu(dip->di_mode);
238
239 trace_xrep_dinode_mode(sc, dip);
240
241 if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
242 return;
243
244 /* bad mode, so we set it to a file that only root can read */
245 mode = S_IFREG;
246 dip->di_mode = cpu_to_be16(mode);
247 dip->di_uid = 0;
248 dip->di_gid = 0;
249 ri->zap_acls = true;
250}
251
252/* Fix any conflicting flags that the verifiers complain about. */
253STATIC void
254xrep_dinode_flags(
255 struct xfs_scrub *sc,
256 struct xfs_dinode *dip,
257 bool isrt)
258{
259 struct xfs_mount *mp = sc->mp;
260 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
261 uint16_t flags = be16_to_cpu(dip->di_flags);
262 uint16_t mode = be16_to_cpu(dip->di_mode);
263
264 trace_xrep_dinode_flags(sc, dip);
265
266 if (isrt)
267 flags |= XFS_DIFLAG_REALTIME;
268 else
269 flags &= ~XFS_DIFLAG_REALTIME;
270
271 /*
272 * For regular files on a reflink filesystem, set the REFLINK flag to
273 * protect shared extents. A later stage will actually check those
274 * extents and clear the flag if possible.
275 */
276 if (xfs_has_reflink(mp) && S_ISREG(mode))
277 flags2 |= XFS_DIFLAG2_REFLINK;
278 else
279 flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
280 if (flags & XFS_DIFLAG_REALTIME)
281 flags2 &= ~XFS_DIFLAG2_REFLINK;
282 if (!xfs_has_bigtime(mp))
283 flags2 &= ~XFS_DIFLAG2_BIGTIME;
284 if (!xfs_has_large_extent_counts(mp))
285 flags2 &= ~XFS_DIFLAG2_NREXT64;
286 if (flags2 & XFS_DIFLAG2_NREXT64)
287 dip->di_nrext64_pad = 0;
288 else if (dip->di_version >= 3)
289 dip->di_v3_pad = 0;
290 dip->di_flags = cpu_to_be16(flags);
291 dip->di_flags2 = cpu_to_be64(flags2);
292}
293
294/*
295 * Blow out symlink; now it points nowhere. We don't have to worry about
296 * incore state because this inode is failing the verifiers.
297 */
298STATIC void
299xrep_dinode_zap_symlink(
300 struct xrep_inode *ri,
301 struct xfs_dinode *dip)
302{
303 struct xfs_scrub *sc = ri->sc;
304 char *p;
305
306 trace_xrep_dinode_zap_symlink(sc, dip);
307
308 dip->di_format = XFS_DINODE_FMT_LOCAL;
309 dip->di_size = cpu_to_be64(1);
310 p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
311 *p = '?';
312 ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
313}
314
315/*
316 * Blow out dir, make the parent point to the root. In the future repair will
317 * reconstruct this directory for us. Note that there's no in-core directory
318 * inode because the sf verifier tripped, so we don't have to worry about the
319 * dentry cache.
320 */
321STATIC void
322xrep_dinode_zap_dir(
323 struct xrep_inode *ri,
324 struct xfs_dinode *dip)
325{
326 struct xfs_scrub *sc = ri->sc;
327 struct xfs_mount *mp = sc->mp;
328 struct xfs_dir2_sf_hdr *sfp;
329 int i8count;
330
331 trace_xrep_dinode_zap_dir(sc, dip);
332
333 dip->di_format = XFS_DINODE_FMT_LOCAL;
334 i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
335 sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
336 sfp->count = 0;
337 sfp->i8count = i8count;
338 xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
339 dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
340 ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
341}
342
343/* Make sure we don't have a garbage file size. */
344STATIC void
345xrep_dinode_size(
346 struct xrep_inode *ri,
347 struct xfs_dinode *dip)
348{
349 struct xfs_scrub *sc = ri->sc;
350 uint64_t size = be64_to_cpu(dip->di_size);
351 uint16_t mode = be16_to_cpu(dip->di_mode);
352
353 trace_xrep_dinode_size(sc, dip);
354
355 switch (mode & S_IFMT) {
356 case S_IFIFO:
357 case S_IFCHR:
358 case S_IFBLK:
359 case S_IFSOCK:
360 /* di_size can't be nonzero for special files */
361 dip->di_size = 0;
362 break;
363 case S_IFREG:
364 /* Regular files can't be larger than 2^63-1 bytes. */
365 dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
366 break;
367 case S_IFLNK:
368 /*
369 * Truncate ridiculously oversized symlinks. If the size is
370 * zero, reset it to point to the current directory. Both of
371 * these conditions trigger dinode verifier errors, so there
372 * is no in-core state to reset.
373 */
374 if (size > XFS_SYMLINK_MAXLEN)
375 dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
376 else if (size == 0)
377 xrep_dinode_zap_symlink(ri, dip);
378 break;
379 case S_IFDIR:
380 /*
381 * Directories can't have a size larger than 32G. If the size
382 * is zero, reset it to an empty directory. Both of these
383 * conditions trigger dinode verifier errors, so there is no
384 * in-core state to reset.
385 */
386 if (size > XFS_DIR2_SPACE_SIZE)
387 dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
388 else if (size == 0)
389 xrep_dinode_zap_dir(ri, dip);
390 break;
391 }
392}
393
394/* Fix extent size hints. */
395STATIC void
396xrep_dinode_extsize_hints(
397 struct xfs_scrub *sc,
398 struct xfs_dinode *dip)
399{
400 struct xfs_mount *mp = sc->mp;
401 uint64_t flags2 = be64_to_cpu(dip->di_flags2);
402 uint16_t flags = be16_to_cpu(dip->di_flags);
403 uint16_t mode = be16_to_cpu(dip->di_mode);
404
405 xfs_failaddr_t fa;
406
407 trace_xrep_dinode_extsize_hints(sc, dip);
408
409 fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
410 mode, flags);
411 if (fa) {
412 dip->di_extsize = 0;
413 dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
414 XFS_DIFLAG_EXTSZINHERIT);
415 }
416
417 if (dip->di_version < 3)
418 return;
419
420 fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
421 mode, flags, flags2);
422 if (fa) {
423 dip->di_cowextsize = 0;
424 dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
425 }
426}
427
428/* Count extents and blocks for an inode given an rmap. */
429STATIC int
430xrep_dinode_walk_rmap(
431 struct xfs_btree_cur *cur,
432 const struct xfs_rmap_irec *rec,
433 void *priv)
434{
435 struct xrep_inode *ri = priv;
436 int error = 0;
437
438 if (xchk_should_terminate(ri->sc, &error))
439 return error;
440
441 /* We only care about this inode. */
442 if (rec->rm_owner != ri->sc->sm->sm_ino)
443 return 0;
444
445 if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
446 ri->attr_blocks += rec->rm_blockcount;
447 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
448 ri->attr_extents++;
449
450 return 0;
451 }
452
453 ri->data_blocks += rec->rm_blockcount;
454 if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
455 ri->data_extents++;
456
457 return 0;
458}
459
460/* Count extents and blocks for an inode from all AG rmap data. */
461STATIC int
462xrep_dinode_count_ag_rmaps(
463 struct xrep_inode *ri,
464 struct xfs_perag *pag)
465{
466 struct xfs_btree_cur *cur;
467 struct xfs_buf *agf;
468 int error;
469
470 error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
471 if (error)
472 return error;
473
474 cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
475 error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
476 xfs_btree_del_cursor(cur, error);
477 xfs_trans_brelse(ri->sc->tp, agf);
478 return error;
479}
480
481/* Count extents and blocks for a given inode from all rmap data. */
482STATIC int
483xrep_dinode_count_rmaps(
484 struct xrep_inode *ri)
485{
486 struct xfs_perag *pag;
487 xfs_agnumber_t agno;
488 int error;
489
490 if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
491 return -EOPNOTSUPP;
492
493 for_each_perag(ri->sc->mp, agno, pag) {
494 error = xrep_dinode_count_ag_rmaps(ri, pag);
495 if (error) {
496 xfs_perag_rele(pag);
497 return error;
498 }
499 }
500
501 /* Can't have extents on both the rt and the data device. */
502 if (ri->data_extents && ri->rt_extents)
503 return -EFSCORRUPTED;
504
505 trace_xrep_dinode_count_rmaps(ri->sc,
506 ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
507 ri->data_extents, ri->rt_extents, ri->attr_extents);
508 return 0;
509}
510
511/* Return true if this extents-format ifork looks like garbage. */
512STATIC bool
513xrep_dinode_bad_extents_fork(
514 struct xfs_scrub *sc,
515 struct xfs_dinode *dip,
516 unsigned int dfork_size,
517 int whichfork)
518{
519 struct xfs_bmbt_irec new;
520 struct xfs_bmbt_rec *dp;
521 xfs_extnum_t nex;
522 bool isrt;
523 unsigned int i;
524
525 nex = xfs_dfork_nextents(dip, whichfork);
526 if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
527 return true;
528
529 dp = XFS_DFORK_PTR(dip, whichfork);
530
531 isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
532 for (i = 0; i < nex; i++, dp++) {
533 xfs_failaddr_t fa;
534
535 xfs_bmbt_disk_get_all(dp, &new);
536 fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
537 &new);
538 if (fa)
539 return true;
540 }
541
542 return false;
543}
544
545/* Return true if this btree-format ifork looks like garbage. */
546STATIC bool
547xrep_dinode_bad_bmbt_fork(
548 struct xfs_scrub *sc,
549 struct xfs_dinode *dip,
550 unsigned int dfork_size,
551 int whichfork)
552{
553 struct xfs_bmdr_block *dfp;
554 xfs_extnum_t nex;
555 unsigned int i;
556 unsigned int dmxr;
557 unsigned int nrecs;
558 unsigned int level;
559
560 nex = xfs_dfork_nextents(dip, whichfork);
561 if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
562 return true;
563
564 if (dfork_size < sizeof(struct xfs_bmdr_block))
565 return true;
566
567 dfp = XFS_DFORK_PTR(dip, whichfork);
568 nrecs = be16_to_cpu(dfp->bb_numrecs);
569 level = be16_to_cpu(dfp->bb_level);
570
571 if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
572 return true;
573 if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
574 return true;
575
576 dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
577 for (i = 1; i <= nrecs; i++) {
578 struct xfs_bmbt_key *fkp;
579 xfs_bmbt_ptr_t *fpp;
580 xfs_fileoff_t fileoff;
581 xfs_fsblock_t fsbno;
582
583 fkp = XFS_BMDR_KEY_ADDR(dfp, i);
584 fileoff = be64_to_cpu(fkp->br_startoff);
585 if (!xfs_verify_fileoff(sc->mp, fileoff))
586 return true;
587
588 fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
589 fsbno = be64_to_cpu(*fpp);
590 if (!xfs_verify_fsbno(sc->mp, fsbno))
591 return true;
592 }
593
594 return false;
595}
596
597/*
598 * Check the data fork for things that will fail the ifork verifiers or the
599 * ifork formatters.
600 */
601STATIC bool
602xrep_dinode_check_dfork(
603 struct xfs_scrub *sc,
604 struct xfs_dinode *dip,
605 uint16_t mode)
606{
607 void *dfork_ptr;
608 int64_t data_size;
609 unsigned int fmt;
610 unsigned int dfork_size;
611
612 /*
613 * Verifier functions take signed int64_t, so check for bogus negative
614 * values first.
615 */
616 data_size = be64_to_cpu(dip->di_size);
617 if (data_size < 0)
618 return true;
619
620 fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
621 switch (mode & S_IFMT) {
622 case S_IFIFO:
623 case S_IFCHR:
624 case S_IFBLK:
625 case S_IFSOCK:
626 if (fmt != XFS_DINODE_FMT_DEV)
627 return true;
628 break;
629 case S_IFREG:
630 if (fmt == XFS_DINODE_FMT_LOCAL)
631 return true;
632 fallthrough;
633 case S_IFLNK:
634 case S_IFDIR:
635 switch (fmt) {
636 case XFS_DINODE_FMT_LOCAL:
637 case XFS_DINODE_FMT_EXTENTS:
638 case XFS_DINODE_FMT_BTREE:
639 break;
640 default:
641 return true;
642 }
643 break;
644 default:
645 return true;
646 }
647
648 dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
649 dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
650
651 switch (fmt) {
652 case XFS_DINODE_FMT_DEV:
653 break;
654 case XFS_DINODE_FMT_LOCAL:
655 /* dir/symlink structure cannot be larger than the fork */
656 if (data_size > dfork_size)
657 return true;
658 /* directory structure must pass verification. */
659 if (S_ISDIR(mode) &&
660 xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
661 return true;
662 /* symlink structure must pass verification. */
663 if (S_ISLNK(mode) &&
664 xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
665 return true;
666 break;
667 case XFS_DINODE_FMT_EXTENTS:
668 if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
669 XFS_DATA_FORK))
670 return true;
671 break;
672 case XFS_DINODE_FMT_BTREE:
673 if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
674 XFS_DATA_FORK))
675 return true;
676 break;
677 default:
678 return true;
679 }
680
681 return false;
682}
683
684static void
685xrep_dinode_set_data_nextents(
686 struct xfs_dinode *dip,
687 xfs_extnum_t nextents)
688{
689 if (xfs_dinode_has_large_extent_counts(dip))
690 dip->di_big_nextents = cpu_to_be64(nextents);
691 else
692 dip->di_nextents = cpu_to_be32(nextents);
693}
694
695static void
696xrep_dinode_set_attr_nextents(
697 struct xfs_dinode *dip,
698 xfs_extnum_t nextents)
699{
700 if (xfs_dinode_has_large_extent_counts(dip))
701 dip->di_big_anextents = cpu_to_be32(nextents);
702 else
703 dip->di_anextents = cpu_to_be16(nextents);
704}
705
706/* Reset the data fork to something sane. */
707STATIC void
708xrep_dinode_zap_dfork(
709 struct xrep_inode *ri,
710 struct xfs_dinode *dip,
711 uint16_t mode)
712{
713 struct xfs_scrub *sc = ri->sc;
714
715 trace_xrep_dinode_zap_dfork(sc, dip);
716
717 ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
718
719 xrep_dinode_set_data_nextents(dip, 0);
720 ri->data_blocks = 0;
721 ri->rt_blocks = 0;
722
723 /* Special files always get reset to DEV */
724 switch (mode & S_IFMT) {
725 case S_IFIFO:
726 case S_IFCHR:
727 case S_IFBLK:
728 case S_IFSOCK:
729 dip->di_format = XFS_DINODE_FMT_DEV;
730 dip->di_size = 0;
731 return;
732 }
733
734 /*
735 * If we have data extents, reset to an empty map and hope the user
736 * will run the bmapbtd checker next.
737 */
738 if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
739 dip->di_format = XFS_DINODE_FMT_EXTENTS;
740 return;
741 }
742
743 /* Otherwise, reset the local format to the minimum. */
744 switch (mode & S_IFMT) {
745 case S_IFLNK:
746 xrep_dinode_zap_symlink(ri, dip);
747 break;
748 case S_IFDIR:
749 xrep_dinode_zap_dir(ri, dip);
750 break;
751 }
752}
753
754/*
755 * Check the attr fork for things that will fail the ifork verifiers or the
756 * ifork formatters.
757 */
758STATIC bool
759xrep_dinode_check_afork(
760 struct xfs_scrub *sc,
761 struct xfs_dinode *dip)
762{
763 struct xfs_attr_sf_hdr *afork_ptr;
764 size_t attr_size;
765 unsigned int afork_size;
766
767 if (XFS_DFORK_BOFF(dip) == 0)
768 return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
769 xfs_dfork_attr_extents(dip) != 0;
770
771 afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
772 afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
773
774 switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
775 case XFS_DINODE_FMT_LOCAL:
776 /* Fork has to be large enough to extract the xattr size. */
777 if (afork_size < sizeof(struct xfs_attr_sf_hdr))
778 return true;
779
780 /* xattr structure cannot be larger than the fork */
781 attr_size = be16_to_cpu(afork_ptr->totsize);
782 if (attr_size > afork_size)
783 return true;
784
785 /* xattr structure must pass verification. */
786 return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
787 case XFS_DINODE_FMT_EXTENTS:
788 if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
789 XFS_ATTR_FORK))
790 return true;
791 break;
792 case XFS_DINODE_FMT_BTREE:
793 if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
794 XFS_ATTR_FORK))
795 return true;
796 break;
797 default:
798 return true;
799 }
800
801 return false;
802}
803
804/*
805 * Reset the attr fork to empty. Since the attr fork could have contained
806 * ACLs, make the file readable only by root.
807 */
808STATIC void
809xrep_dinode_zap_afork(
810 struct xrep_inode *ri,
811 struct xfs_dinode *dip,
812 uint16_t mode)
813{
814 struct xfs_scrub *sc = ri->sc;
815
816 trace_xrep_dinode_zap_afork(sc, dip);
817
818 ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
819
820 dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
821 xrep_dinode_set_attr_nextents(dip, 0);
822 ri->attr_blocks = 0;
823
824 /*
825 * If the data fork is in btree format, removing the attr fork entirely
826 * might cause verifier failures if the next level down in the bmbt
827 * could now fit in the data fork area.
828 */
829 if (dip->di_format != XFS_DINODE_FMT_BTREE)
830 dip->di_forkoff = 0;
831 dip->di_mode = cpu_to_be16(mode & ~0777);
832 dip->di_uid = 0;
833 dip->di_gid = 0;
834}
835
/*
 * Make sure the fork offset is a sensible value.
 *
 * This computes the minimum number of bytes that each fork needs in the
 * inode literal area, rounded up to 8 bytes.  It then handles the case where
 * a fork is an empty extents-format fork even though the rmap scan counted
 * extents for it (ri->data_extents, ri->rt_extents, ri->attr_extents) and
 * the fork area is smaller than a minimal bmbt root (XFS_BMDR_SPACE_CALC(1)).
 * In that case di_forkoff is moved to make room, or the attr fork is zapped
 * if both forks cannot be accommodated.
 */
STATIC void
xrep_dinode_ensure_forkoff(
	struct xrep_inode	*ri,
	struct xfs_dinode	*dip,
	uint16_t		mode)
{
	struct xfs_bmdr_block	*bmdr;
	struct xfs_scrub	*sc = ri->sc;
	xfs_extnum_t		attr_extents, data_extents;
	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
	unsigned int		lit_sz = XFS_LITINO(sc->mp);
	unsigned int		afork_min, dfork_min;

	trace_xrep_dinode_ensure_forkoff(sc, dip);

	/*
	 * Before calling this function, xrep_dinode_core ensured that both
	 * forks actually fit inside their respective literal areas.  If this
	 * was not the case, the fork was reset to FMT_EXTENTS with zero
	 * records.  If the rmapbt scan found attr or data fork blocks, this
	 * will be noted in the xrep_inode counters, and we must leave enough
	 * room for the bmap repair code to reconstruct the mapping structure.
	 *
	 * First, compute the minimum space required for the attr fork.
	 */
	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform xattr structure at all, that
		 * means the attr fork area was exactly large enough to fit
		 * the sf structure.
		 */
		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		attr_extents = xfs_dfork_attr_extents(dip);
		if (attr_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the attr fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
		} else if (ri->attr_extents > 0) {
			/*
			 * The attr fork thinks it has zero extents, but we
			 * found some xattr extents.  We need to leave enough
			 * empty space here so that the incore attr fork will
			 * get created (and hence trigger the attr fork bmap
			 * repairer).
			 */
			afork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			afork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
		break;
	default:
		/* We should never see any other formats. */
		afork_min = 0;
		break;
	}

	/* Compute the minimum space required for the data fork. */
	switch (dip->di_format) {
	case XFS_DINODE_FMT_DEV:
		dfork_min = sizeof(__be32);
		break;
	case XFS_DINODE_FMT_UUID:
		dfork_min = sizeof(uuid_t);
		break;
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform data fork at all, that means
		 * the data fork area was large enough to fit whatever was in
		 * there.
		 */
		dfork_min = be64_to_cpu(dip->di_size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		data_extents = xfs_dfork_data_extents(dip);
		if (data_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the data fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
			/*
			 * The data fork thinks it has zero extents, but we
			 * found some data extents.  We need to leave enough
			 * empty space here so that the data fork bmap repair
			 * will recover the mappings.
			 */
			dfork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			dfork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
		break;
	default:
		/* Unrecognized format; reserve nothing for the data fork. */
		dfork_min = 0;
		break;
	}

	/*
	 * Round all values up to the nearest 8 bytes, because that is the
	 * precision of di_forkoff.
	 */
	afork_min = roundup(afork_min, 8);
	dfork_min = roundup(dfork_min, 8);
	bmdr_minsz = roundup(bmdr_minsz, 8);

	ASSERT(dfork_min <= lit_sz);
	ASSERT(afork_min <= lit_sz);

	/*
	 * If the data fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork up.
	 */
	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_data_extents(dip) == 0 &&
	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
		if (bmdr_minsz + afork_min > lit_sz) {
			/*
			 * The attr fork and the stub fork we need to recover
			 * the data fork won't both fit.  Zap the attr fork.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
			afork_min = bmdr_minsz;
		} else {
			void	*before, *after;

			/*
			 * Otherwise, just slide the attr fork up.  Sample the
			 * attr fork address before and after changing
			 * di_forkoff so that the contents can be moved to the
			 * new location.
			 */
			before = XFS_DFORK_APTR(dip);
			dip->di_forkoff = bmdr_minsz >> 3;
			after = XFS_DFORK_APTR(dip);
			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
		}
	}

	/*
	 * If the attr fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork down.
	 */
	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_attr_extents(dip) == 0 &&
	    ri->attr_extents > 0 &&
	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
			/*
			 * If the data fork is in btree format then we can't
			 * adjust forkoff because that runs the risk of
			 * violating the extents/btree format transition rules.
			 */
		} else if (bmdr_minsz + dfork_min > lit_sz) {
			/*
			 * If we can't move the attr fork, too bad, we lose the
			 * attr fork and leak its blocks.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
		} else {
			/*
			 * Otherwise, just slide the attr fork down.  The attr
			 * fork is empty, so we don't have any old contents to
			 * move here.
			 */
			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
		}
	}
}
1022
1023/*
1024 * Zap the data/attr forks if we spot anything that isn't going to pass the
1025 * ifork verifiers or the ifork formatters, because we need to get the inode
1026 * into good enough shape that the higher level repair functions can run.
1027 */
1028STATIC void
1029xrep_dinode_zap_forks(
1030 struct xrep_inode *ri,
1031 struct xfs_dinode *dip)
1032{
1033 struct xfs_scrub *sc = ri->sc;
1034 xfs_extnum_t data_extents;
1035 xfs_extnum_t attr_extents;
1036 xfs_filblks_t nblocks;
1037 uint16_t mode;
1038 bool zap_datafork = false;
1039 bool zap_attrfork = ri->zap_acls;
1040
1041 trace_xrep_dinode_zap_forks(sc, dip);
1042
1043 mode = be16_to_cpu(dip->di_mode);
1044
1045 data_extents = xfs_dfork_data_extents(dip);
1046 attr_extents = xfs_dfork_attr_extents(dip);
1047 nblocks = be64_to_cpu(dip->di_nblocks);
1048
1049 /* Inode counters don't make sense? */
1050 if (data_extents > nblocks)
1051 zap_datafork = true;
1052 if (attr_extents > nblocks)
1053 zap_attrfork = true;
1054 if (data_extents + attr_extents > nblocks)
1055 zap_datafork = zap_attrfork = true;
1056
1057 if (!zap_datafork)
1058 zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1059 if (!zap_attrfork)
1060 zap_attrfork = xrep_dinode_check_afork(sc, dip);
1061
1062 /* Zap whatever's bad. */
1063 if (zap_attrfork)
1064 xrep_dinode_zap_afork(ri, dip, mode);
1065 if (zap_datafork)
1066 xrep_dinode_zap_dfork(ri, dip, mode);
1067 xrep_dinode_ensure_forkoff(ri, dip, mode);
1068
1069 /*
1070 * Zero di_nblocks if we don't have any extents at all to satisfy the
1071 * buffer verifier.
1072 */
1073 data_extents = xfs_dfork_data_extents(dip);
1074 attr_extents = xfs_dfork_attr_extents(dip);
1075 if (data_extents + attr_extents == 0)
1076 dip->di_nblocks = 0;
1077}
1078
/*
 * Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget.
 *
 * The sequence is: count this inode's rmap records, read the inode cluster
 * buffer, repair the ondisk inode in place, log it, try to iget the inode,
 * and commit the transaction.  On success the function allocates a new
 * transaction, attaches dquots, takes the ILOCK, and marks the incore inode
 * sick with whatever was recorded in ri->ino_sick_mask.
 *
 * Returns 0 on success or a negative errno.  If iget failed, the buffer
 * changes are still committed before the iget error is returned.
 */
STATIC int
xrep_dinode_core(
	struct xrep_inode	*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_buf		*bp;
	struct xfs_dinode	*dip;
	xfs_ino_t		ino = sc->sm->sm_ino;
	int			error;
	int			iget_error;

	/* Figure out what this inode had mapped in both forks. */
	error = xrep_dinode_count_rmaps(ri);
	if (error)
		return error;

	/* Read the inode cluster buffer. */
	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
			NULL);
	if (error)
		return error;

	/* Make sure we can pass the inode buffer verifier. */
	xrep_dinode_buf(sc, bp);
	bp->b_ops = &xfs_inode_buf_ops;

	/*
	 * Fix everything the verifier will complain about.  The fork zapping
	 * step runs last, after the header, mode, flags, size, and extent
	 * size hint repairs.
	 */
	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
	xrep_dinode_header(sc, dip);
	xrep_dinode_mode(ri, dip);
	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
	xrep_dinode_size(ri, dip);
	xrep_dinode_extsize_hints(sc, dip);
	xrep_dinode_zap_forks(ri, dip);

	/* Write out the inode: recompute the CRC and log this inode's bytes. */
	trace_xrep_dinode_fixed(sc, dip);
	xfs_dinode_calc_crc(sc->mp, dip);
	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);

	/*
	 * In theory, we've fixed the ondisk inode record enough that we should
	 * be able to load the inode into the cache.  Try to iget that inode
	 * now while we hold the AGI and the inode cluster buffer and take the
	 * IOLOCK so that we can continue with repairs without anyone else
	 * accessing the inode.  If iget fails, we still need to commit the
	 * changes.
	 */
	iget_error = xchk_iget(sc, ino, &sc->ip);
	if (!iget_error)
		xchk_ilock(sc, XFS_IOLOCK_EXCL);

	/*
	 * Commit the inode cluster buffer updates and drop the AGI buffer that
	 * we've been holding since scrub setup.  From here on out, repairs
	 * deal only with the cached inode.
	 */
	error = xrep_trans_commit(sc);
	if (error)
		return error;

	/* The commit succeeded, so now report any iget failure. */
	if (iget_error)
		return iget_error;

	error = xchk_trans_alloc(sc, 0);
	if (error)
		return error;

	error = xrep_ino_dqattach(sc);
	if (error)
		return error;

	xchk_ilock(sc, XFS_ILOCK_EXCL);
	/* Carry any "fork was zapped" state over to the incore inode. */
	if (ri->ino_sick_mask)
		xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
	return 0;
}
1160
1161/* Fix everything xfs_dinode_verify cares about. */
1162STATIC int
1163xrep_dinode_problems(
1164 struct xrep_inode *ri)
1165{
1166 struct xfs_scrub *sc = ri->sc;
1167 int error;
1168
1169 error = xrep_dinode_core(ri);
1170 if (error)
1171 return error;
1172
1173 /* We had to fix a totally busted inode, schedule quotacheck. */
1174 if (XFS_IS_UQUOTA_ON(sc->mp))
1175 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1176 if (XFS_IS_GQUOTA_ON(sc->mp))
1177 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1178 if (XFS_IS_PQUOTA_ON(sc->mp))
1179 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1180
1181 return 0;
1182}
1183
1184/*
1185 * Fix problems that the verifiers don't care about. In general these are
1186 * errors that don't cause problems elsewhere in the kernel that we can easily
1187 * detect, so we don't check them all that rigorously.
1188 */
1189
1190/* Make sure block and extent counts are ok. */
1191STATIC int
1192xrep_inode_blockcounts(
1193 struct xfs_scrub *sc)
1194{
1195 struct xfs_ifork *ifp;
1196 xfs_filblks_t count;
1197 xfs_filblks_t acount;
1198 xfs_extnum_t nextents;
1199 int error;
1200
1201 trace_xrep_inode_blockcounts(sc);
1202
1203 /* Set data fork counters from the data fork mappings. */
1204 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1205 &nextents, &count);
1206 if (error)
1207 return error;
1208 if (xfs_is_reflink_inode(sc->ip)) {
1209 /*
1210 * data fork blockcount can exceed physical storage if a user
1211 * reflinks the same block over and over again.
1212 */
1213 ;
1214 } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1215 if (count >= sc->mp->m_sb.sb_rblocks)
1216 return -EFSCORRUPTED;
1217 } else {
1218 if (count >= sc->mp->m_sb.sb_dblocks)
1219 return -EFSCORRUPTED;
1220 }
1221 error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1222 if (error)
1223 return error;
1224 sc->ip->i_df.if_nextents = nextents;
1225
1226 /* Set attr fork counters from the attr fork mappings. */
1227 ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1228 if (ifp) {
1229 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1230 &nextents, &acount);
1231 if (error)
1232 return error;
1233 if (count >= sc->mp->m_sb.sb_dblocks)
1234 return -EFSCORRUPTED;
1235 error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1236 nextents);
1237 if (error)
1238 return error;
1239 ifp->if_nextents = nextents;
1240 } else {
1241 acount = 0;
1242 }
1243
1244 sc->ip->i_nblocks = count + acount;
1245 return 0;
1246}
1247
1248/* Check for invalid uid/gid/prid. */
1249STATIC void
1250xrep_inode_ids(
1251 struct xfs_scrub *sc)
1252{
1253 bool dirty = false;
1254
1255 trace_xrep_inode_ids(sc);
1256
1257 if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1258 i_uid_write(VFS_I(sc->ip), 0);
1259 dirty = true;
1260 if (XFS_IS_UQUOTA_ON(sc->mp))
1261 xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1262 }
1263
1264 if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1265 i_gid_write(VFS_I(sc->ip), 0);
1266 dirty = true;
1267 if (XFS_IS_GQUOTA_ON(sc->mp))
1268 xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1269 }
1270
1271 if (sc->ip->i_projid == -1U) {
1272 sc->ip->i_projid = 0;
1273 dirty = true;
1274 if (XFS_IS_PQUOTA_ON(sc->mp))
1275 xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1276 }
1277
1278 /* strip setuid/setgid if we touched any of the ids */
1279 if (dirty)
1280 VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1281}
1282
1283static inline void
1284xrep_clamp_timestamp(
1285 struct xfs_inode *ip,
1286 struct timespec64 *ts)
1287{
1288 ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1289 *ts = timestamp_truncate(*ts, VFS_I(ip));
1290}
1291
1292/* Nanosecond counters can't have more than 1 billion. */
1293STATIC void
1294xrep_inode_timestamps(
1295 struct xfs_inode *ip)
1296{
1297 struct timespec64 tstamp;
1298 struct inode *inode = VFS_I(ip);
1299
1300 tstamp = inode_get_atime(inode);
1301 xrep_clamp_timestamp(ip, &tstamp);
1302 inode_set_atime_to_ts(inode, tstamp);
1303
1304 tstamp = inode_get_mtime(inode);
1305 xrep_clamp_timestamp(ip, &tstamp);
1306 inode_set_mtime_to_ts(inode, tstamp);
1307
1308 tstamp = inode_get_ctime(inode);
1309 xrep_clamp_timestamp(ip, &tstamp);
1310 inode_set_ctime_to_ts(inode, tstamp);
1311
1312 xrep_clamp_timestamp(ip, &ip->i_crtime);
1313}
1314
1315/* Fix inode flags that don't make sense together. */
1316STATIC void
1317xrep_inode_flags(
1318 struct xfs_scrub *sc)
1319{
1320 uint16_t mode;
1321
1322 trace_xrep_inode_flags(sc);
1323
1324 mode = VFS_I(sc->ip)->i_mode;
1325
1326 /* Clear junk flags */
1327 if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1328 sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1329
1330 /* NEWRTBM only applies to realtime bitmaps */
1331 if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1332 sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1333 else
1334 sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1335
1336 /* These only make sense for directories. */
1337 if (!S_ISDIR(mode))
1338 sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1339 XFS_DIFLAG_EXTSZINHERIT |
1340 XFS_DIFLAG_PROJINHERIT |
1341 XFS_DIFLAG_NOSYMLINKS);
1342
1343 /* These only make sense for files. */
1344 if (!S_ISREG(mode))
1345 sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1346 XFS_DIFLAG_EXTSIZE);
1347
1348 /* These only make sense for non-rt files. */
1349 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1350 sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1351
1352 /* Immutable and append only? Drop the append. */
1353 if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1354 (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1355 sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1356
1357 /* Clear junk flags. */
1358 if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1359 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1360
1361 /* No reflink flag unless we support it and it's a file. */
1362 if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1363 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1364
1365 /* DAX only applies to files and dirs. */
1366 if (!(S_ISREG(mode) || S_ISDIR(mode)))
1367 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1368
1369 /* No reflink files on the realtime device. */
1370 if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1371 sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1372}
1373
1374/*
1375 * Fix size problems with block/node format directories. If we fail to find
1376 * the extent list, just bail out and let the bmapbtd repair functions clean
1377 * up that mess.
1378 */
1379STATIC void
1380xrep_inode_blockdir_size(
1381 struct xfs_scrub *sc)
1382{
1383 struct xfs_iext_cursor icur;
1384 struct xfs_bmbt_irec got;
1385 struct xfs_ifork *ifp;
1386 xfs_fileoff_t off;
1387 int error;
1388
1389 trace_xrep_inode_blockdir_size(sc);
1390
1391 error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1392 if (error)
1393 return;
1394
1395 /* Find the last block before 32G; this is the dir size. */
1396 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1397 off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1398 if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1399 /* zero-extents directory? */
1400 return;
1401 }
1402
1403 off = got.br_startoff + got.br_blockcount;
1404 sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1405 XFS_FSB_TO_B(sc->mp, off));
1406}
1407
1408/* Fix size problems with short format directories. */
1409STATIC void
1410xrep_inode_sfdir_size(
1411 struct xfs_scrub *sc)
1412{
1413 struct xfs_ifork *ifp;
1414
1415 trace_xrep_inode_sfdir_size(sc);
1416
1417 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1418 sc->ip->i_disk_size = ifp->if_bytes;
1419}
1420
1421/*
1422 * Fix any irregularities in a directory inode's size now that we can iterate
1423 * extent maps and access other regular inode data.
1424 */
1425STATIC void
1426xrep_inode_dir_size(
1427 struct xfs_scrub *sc)
1428{
1429 trace_xrep_inode_dir_size(sc);
1430
1431 switch (sc->ip->i_df.if_format) {
1432 case XFS_DINODE_FMT_EXTENTS:
1433 case XFS_DINODE_FMT_BTREE:
1434 xrep_inode_blockdir_size(sc);
1435 break;
1436 case XFS_DINODE_FMT_LOCAL:
1437 xrep_inode_sfdir_size(sc);
1438 break;
1439 }
1440}
1441
1442/* Fix extent size hint problems. */
1443STATIC void
1444xrep_inode_extsize(
1445 struct xfs_scrub *sc)
1446{
1447 /* Fix misaligned extent size hints on a directory. */
1448 if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1449 (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1450 xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1451 sc->ip->i_extsize = 0;
1452 sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1453 }
1454}
1455
1456/* Fix any irregularities in an inode that the verifiers don't catch. */
1457STATIC int
1458xrep_inode_problems(
1459 struct xfs_scrub *sc)
1460{
1461 int error;
1462
1463 error = xrep_inode_blockcounts(sc);
1464 if (error)
1465 return error;
1466 xrep_inode_timestamps(sc->ip);
1467 xrep_inode_flags(sc);
1468 xrep_inode_ids(sc);
1469 /*
1470 * We can now do a better job fixing the size of a directory now that
1471 * we can scan the data fork extents than we could in xrep_dinode_size.
1472 */
1473 if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1474 xrep_inode_dir_size(sc);
1475 xrep_inode_extsize(sc);
1476
1477 trace_xrep_inode_fixed(sc);
1478 xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1479 return xrep_roll_trans(sc);
1480}
1481
1482/* Repair an inode's fields. */
1483int
1484xrep_inode(
1485 struct xfs_scrub *sc)
1486{
1487 int error = 0;
1488
1489 /*
1490 * No inode? That means we failed the _iget verifiers. Repair all
1491 * the things that the inode verifiers care about, then retry _iget.
1492 */
1493 if (!sc->ip) {
1494 struct xrep_inode *ri = sc->buf;
1495
1496 ASSERT(ri != NULL);
1497
1498 error = xrep_dinode_problems(ri);
1499 if (error)
1500 return error;
1501
1502 /* By this point we had better have a working incore inode. */
1503 if (!sc->ip)
1504 return -EFSCORRUPTED;
1505 }
1506
1507 xfs_trans_ijoin(sc->tp, sc->ip, 0);
1508
1509 /* If we found corruption of any kind, try to fix it. */
1510 if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1511 (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1512 error = xrep_inode_problems(sc);
1513 if (error)
1514 return error;
1515 }
1516
1517 /* See if we can clear the reflink flag. */
1518 if (xfs_is_reflink_inode(sc->ip)) {
1519 error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1520 if (error)
1521 return error;
1522 }
1523
1524 return xrep_defer_finish(sc);
1525}