fs/xfs/scrub/repair.c

   1 /*
   2  * Copyright (C) 2018 Oracle.  All Rights Reserved.
   3  *
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  *
   6  * This program is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU General Public License
   8  * as published by the Free Software Foundation; either version 2
   9  * of the License, or (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it would be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write the Free Software Foundation,
  18  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
  19  */
  20 #include "xfs.h"
  21 #include "xfs_fs.h"
  22 #include "xfs_shared.h"
  23 #include "xfs_format.h"
  24 #include "xfs_trans_resv.h"
  25 #include "xfs_mount.h"
  26 #include "xfs_defer.h"
  27 #include "xfs_btree.h"
  28 #include "xfs_bit.h"
  29 #include "xfs_log_format.h"
  30 #include "xfs_trans.h"
  31 #include "xfs_sb.h"
  32 #include "xfs_inode.h"
  33 #include "xfs_icache.h"
  34 #include "xfs_alloc.h"
  35 #include "xfs_alloc_btree.h"
  36 #include "xfs_ialloc.h"
  37 #include "xfs_ialloc_btree.h"
  38 #include "xfs_rmap.h"
  39 #include "xfs_rmap_btree.h"
  40 #include "xfs_refcount.h"
  41 #include "xfs_refcount_btree.h"
  42 #include "xfs_extent_busy.h"
  43 #include "xfs_ag_resv.h"
  44 #include "xfs_trans_space.h"
  45 #include "xfs_quota.h"
  46 #include "scrub/xfs_scrub.h"
  47 #include "scrub/scrub.h"
  48 #include "scrub/common.h"
  49 #include "scrub/trace.h"
  50 #include "scrub/repair.h"
  51
  52 /*
  53  * Attempt to repair some metadata, if the metadata is corrupt and userspace
  54  * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
  55  * and will set *fixed to true if it thinks it repaired anything.
  56  */
  57 int
  58 xfs_repair_attempt(
  59         struct xfs_inode                *ip,
  60         struct xfs_scrub_context        *sc,
  61         bool                            *fixed)
  62 {
  63         int                             error = 0;
  64
  65         trace_xfs_repair_attempt(ip, sc->sm, error);
  66
  67         xfs_scrub_ag_btcur_free(&sc->sa);
  68
  69         /* Repair whatever's broken. */
  70         ASSERT(sc->ops->repair);
  71         error = sc->ops->repair(sc);
  72         trace_xfs_repair_done(ip, sc->sm, error);
  73         switch (error) {
  74         case 0:
  75                 /*
  76                  * Repair succeeded.  Commit the fixes and perform a second
  77                  * scrub so that we can tell userspace if we fixed the problem.
  78                  */
  79                 sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
  80                 *fixed = true;
  81                 return -EAGAIN;
  82         case -EDEADLOCK:
  83         case -EAGAIN:
  84                 /* Tell the caller to try again having grabbed all the locks. */
  85                 if (!sc->try_harder) {
  86                         sc->try_harder = true;
  87                         return -EAGAIN;
  88                 }
  89                 /*
  90                  * We tried harder but still couldn't grab all the resources
  91                  * we needed to fix it.  The corruption has not been fixed,
  92                  * so report back to userspace.
  93                  */
  94                 return -EFSCORRUPTED;
  95         default:
  96                 return error;
  97         }
  98 }
  99
 100 /*
 101  * Complain about unfixable problems in the filesystem.  We don't log
 102  * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 103  * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 104  * administrator isn't running xfs_scrub in no-repairs mode.
 105  *
 106  * Use this helper function because _ratelimited silently declares a static
 107  * structure to track rate limiting information.
 108  */
 109 void
 110 xfs_repair_failure(
 111         struct xfs_mount                *mp)
 112 {
 113         xfs_alert_ratelimited(mp,
 114 "Corruption not fixed during online repair.  Unmount and run xfs_repair.");
 115 }
 116
 117 /*
 118  * Repair probe -- userspace uses this to probe if we're willing to repair a
 119  * given mountpoint.
 120  */
 121 int
 122 xfs_repair_probe(
 123         struct xfs_scrub_context        *sc)
 124 {
 125         int                             error = 0;
 126
 127         if (xfs_scrub_should_terminate(sc, &error))
 128                 return error;
 129
 130         return 0;
 131 }
 132
 133 /*
 134  * Roll a transaction, keeping the AG headers locked and reinitializing
 135  * the btree cursors.
 136  */
 137 int
 138 xfs_repair_roll_ag_trans(
 139         struct xfs_scrub_context        *sc)
 140 {
 141         int                             error;
 142
 143         /* Keep the AG header buffers locked so we can keep going. */
 144         xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
 145         xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
 146         xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
 147
 148         /* Roll the transaction. */
 149         error = xfs_trans_roll(&sc->tp);
 150         if (error)
 151                 goto out_release;
 152
 153         /* Join AG headers to the new transaction. */
 154         xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
 155         xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
 156         xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
 157
 158         return 0;
 159
 160 out_release:
 161         /*
 162          * Rolling failed, so release the hold on the buffers.  The
 163          * buffers will be released during teardown on our way out
 164          * of the kernel.
 165          */
 166         xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
 167         xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
 168         xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
 169
 170         return error;
 171 }
 172
 173 /*
 174  * Does the given AG have enough space to rebuild a btree?  Neither AG
 175  * reservation can be critical, and we must have enough space (factoring
 176  * in AG reservations) to construct a whole btree.
 177  */
 178 bool
 179 xfs_repair_ag_has_space(
 180         struct xfs_perag                *pag,
 181         xfs_extlen_t                    nr_blocks,
 182         enum xfs_ag_resv_type           type)
 183 {
 184         return  !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
 185                 !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
 186                 pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
 187 }
 188
 189 /*
 190  * Figure out how many blocks to reserve for an AG repair.  We calculate the
 191  * worst case estimate for the number of blocks we'd need to rebuild one of
 192  * any type of per-AG btree.
 193  */
 194 xfs_extlen_t
 195 xfs_repair_calc_ag_resblks(
 196         struct xfs_scrub_context        *sc)
 197 {
 198         struct xfs_mount                *mp = sc->mp;
 199         struct xfs_scrub_metadata       *sm = sc->sm;
 200         struct xfs_perag                *pag;
 201         struct xfs_buf                  *bp;
 202         xfs_agino_t                     icount = 0;
 203         xfs_extlen_t                    aglen = 0;
 204         xfs_extlen_t                    usedlen;
 205         xfs_extlen_t                    freelen;
 206         xfs_extlen_t                    bnobt_sz;
 207         xfs_extlen_t                    inobt_sz;
 208         xfs_extlen_t                    rmapbt_sz;
 209         xfs_extlen_t                    refcbt_sz;
 210         int                             error;
 211
 212         if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
 213                 return 0;
 214
 215         /* Use in-core counters if possible. */
 216         pag = xfs_perag_get(mp, sm->sm_agno);
 217         if (pag->pagi_init)
 218                 icount = pag->pagi_count;
 219
 220         /*
 221          * Otherwise try to get the actual counters from disk; if not, make
 222          * some worst case assumptions.
 223          */
 224         if (icount == 0) {
 225                 error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp);
 226                 if (error) {
 227                         icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock;
 228                 } else {
 229                         icount = pag->pagi_count;
 230                         xfs_buf_relse(bp);
 231                 }
 232         }
 233
 234         /* Now grab the block counters from the AGF. */
 235         error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp);
 236         if (error) {
 237                 aglen = mp->m_sb.sb_agblocks;
 238                 freelen = aglen;
 239                 usedlen = aglen;
 240         } else {
 241                 aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length);
 242                 freelen = pag->pagf_freeblks;
 243                 usedlen = aglen - freelen;
 244                 xfs_buf_relse(bp);
 245         }
 246         xfs_perag_put(pag);
 247
 248         trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
 249                         freelen, usedlen);
 250
 251         /*
 252          * Figure out how many blocks we'd need worst case to rebuild
 253          * each type of btree.  Note that we can only rebuild the
 254          * bnobt/cntbt or inobt/finobt as pairs.
 255          */
 256         bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
 257         if (xfs_sb_version_hassparseinodes(&mp->m_sb))
 258                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 259                                 XFS_INODES_PER_HOLEMASK_BIT);
 260         else
 261                 inobt_sz = xfs_iallocbt_calc_size(mp, icount /
 262                                 XFS_INODES_PER_CHUNK);
 263         if (xfs_sb_version_hasfinobt(&mp->m_sb))
 264                 inobt_sz *= 2;
 265         if (xfs_sb_version_hasreflink(&mp->m_sb))
 266                 refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
 267         else
 268                 refcbt_sz = 0;
 269         if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 270                 /*
 271                  * Guess how many blocks we need to rebuild the rmapbt.
 272                  * For non-reflink filesystems we can't have more records than
 273                  * used blocks.  However, with reflink it's possible to have
 274                  * more than one rmap record per AG block.  We don't know how
 275                  * many rmaps there could be in the AG, so we start off with
 276                  * what we hope is an generous over-estimation.
 277                  */
 278                 if (xfs_sb_version_hasreflink(&mp->m_sb))
 279                         rmapbt_sz = xfs_rmapbt_calc_size(mp,
 280                                         (unsigned long long)aglen * 2);
 281                 else
 282                         rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
 283         } else {
 284                 rmapbt_sz = 0;
 285         }
 286
 287         trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
 288                         inobt_sz, rmapbt_sz, refcbt_sz);
 289
 290         return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
 291 }
 292
 293 /* Allocate a block in an AG. */
 294 int
 295 xfs_repair_alloc_ag_block(
 296         struct xfs_scrub_context        *sc,
 297         struct xfs_owner_info           *oinfo,
 298         xfs_fsblock_t                   *fsbno,
 299         enum xfs_ag_resv_type           resv)
 300 {
 301         struct xfs_alloc_arg            args = {0};
 302         xfs_agblock_t                   bno;
 303         int                             error;
 304
 305         switch (resv) {
 306         case XFS_AG_RESV_AGFL:
 307         case XFS_AG_RESV_RMAPBT:
 308                 error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1);
 309                 if (error)
 310                         return error;
 311                 if (bno == NULLAGBLOCK)
 312                         return -ENOSPC;
 313                 xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno,
 314                                 1, false);
 315                 *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno);
 316                 if (resv == XFS_AG_RESV_RMAPBT)
 317                         xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno);
 318                 return 0;
 319         default:
 320                 break;
 321         }
 322
 323         args.tp = sc->tp;
 324         args.mp = sc->mp;
 325         args.oinfo = *oinfo;
 326         args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0);
 327         args.minlen = 1;
 328         args.maxlen = 1;
 329         args.prod = 1;
 330         args.type = XFS_ALLOCTYPE_THIS_AG;
 331         args.resv = resv;
 332
 333         error = xfs_alloc_vextent(&args);
 334         if (error)
 335                 return error;
 336         if (args.fsbno == NULLFSBLOCK)
 337                 return -ENOSPC;
 338         ASSERT(args.len == 1);
 339         *fsbno = args.fsbno;
 340
 341         return 0;
 342 }
 343
 344 /* Initialize a new AG btree root block with zero entries. */
 345 int
 346 xfs_repair_init_btblock(
 347         struct xfs_scrub_context        *sc,
 348         xfs_fsblock_t                   fsb,
 349         struct xfs_buf                  **bpp,
 350         xfs_btnum_t                     btnum,
 351         const struct xfs_buf_ops        *ops)
 352 {
 353         struct xfs_trans                *tp = sc->tp;
 354         struct xfs_mount                *mp = sc->mp;
 355         struct xfs_buf                  *bp;
 356
 357         trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
 358                         XFS_FSB_TO_AGBNO(mp, fsb), btnum);
 359
 360         ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno);
 361         bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb),
 362                         XFS_FSB_TO_BB(mp, 1), 0);
 363         xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 364         xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0);
 365         xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
 366         xfs_trans_log_buf(tp, bp, 0, bp->b_length);
 367         bp->b_ops = ops;
 368         *bpp = bp;
 369
 370         return 0;
 371 }
 372
 373 /*
 374  * Reconstructing per-AG Btrees
 375  *
 376  * When a space btree is corrupt, we don't bother trying to fix it.  Instead,
 377  * we scan secondary space metadata to derive the records that should be in
 378  * the damaged btree, initialize a fresh btree root, and insert the records.
 379  * Note that for rebuilding the rmapbt we scan all the primary data to
 380  * generate the new records.
 381  *
 382  * However, that leaves the matter of removing all the metadata describing the
 383  * old broken structure.  For primary metadata we use the rmap data to collect
 384  * every extent with a matching rmap owner (exlist); we then iterate all other
 385  * metadata structures with the same rmap owner to collect the extents that
 386  * cannot be removed (sublist).  We then subtract sublist from exlist to
 387  * derive the blocks that were used by the old btree.  These blocks can be
 388  * reaped.
 389  *
 390  * For rmapbt reconstructions we must use different tactics for extent
 391  * collection.  First we iterate all primary metadata (this excludes the old
 392  * rmapbt, obviously) to generate new rmap records.  The gaps in the rmap
 393  * records are collected as exlist.  The bnobt records are collected as
 394  * sublist.  As with the other btrees we subtract sublist from exlist, and the
 * result (since the rmapbt lives in the free space) is the set of blocks
 * from the old rmapbt.
 397  */
 398
 399 /* Collect a dead btree extent for later disposal. */
 400 int
 401 xfs_repair_collect_btree_extent(
 402         struct xfs_scrub_context        *sc,
 403         struct xfs_repair_extent_list   *exlist,
 404         xfs_fsblock_t                   fsbno,
 405         xfs_extlen_t                    len)
 406 {
 407         struct xfs_repair_extent        *rex;
 408
 409         trace_xfs_repair_collect_btree_extent(sc->mp,
 410                         XFS_FSB_TO_AGNO(sc->mp, fsbno),
 411                         XFS_FSB_TO_AGBNO(sc->mp, fsbno), len);
 412
 413         rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL);
 414         if (!rex)
 415                 return -ENOMEM;
 416
 417         INIT_LIST_HEAD(&rex->list);
 418         rex->fsbno = fsbno;
 419         rex->len = len;
 420         list_add_tail(&rex->list, &exlist->list);
 421
 422         return 0;
 423 }
 424
 425 /*
 426  * An error happened during the rebuild so the transaction will be cancelled.
 427  * The fs will shut down, and the administrator has to unmount and run repair.
 428  * Therefore, free all the memory associated with the list so we can die.
 429  */
 430 void
 431 xfs_repair_cancel_btree_extents(
 432         struct xfs_scrub_context        *sc,
 433         struct xfs_repair_extent_list   *exlist)
 434 {
 435         struct xfs_repair_extent        *rex;
 436         struct xfs_repair_extent        *n;
 437
 438         for_each_xfs_repair_extent_safe(rex, n, exlist) {
 439                 list_del(&rex->list);
 440                 kmem_free(rex);
 441         }
 442 }
 443
 444 /* Compare two btree extents. */
 445 static int
 446 xfs_repair_btree_extent_cmp(
 447         void                            *priv,
 448         struct list_head                *a,
 449         struct list_head                *b)
 450 {
 451         struct xfs_repair_extent        *ap;
 452         struct xfs_repair_extent        *bp;
 453
 454         ap = container_of(a, struct xfs_repair_extent, list);
 455         bp = container_of(b, struct xfs_repair_extent, list);
 456
 457         if (ap->fsbno > bp->fsbno)
 458                 return 1;
 459         if (ap->fsbno < bp->fsbno)
 460                 return -1;
 461         return 0;
 462 }
 463
 464 /*
 465  * Remove all the blocks mentioned in @sublist from the extents in @exlist.
 466  *
 467  * The intent is that callers will iterate the rmapbt for all of its records
 468  * for a given owner to generate @exlist; and iterate all the blocks of the
 469  * metadata structures that are not being rebuilt and have the same rmapbt
 470  * owner to generate @sublist.  This routine subtracts all the extents
 471  * mentioned in sublist from all the extents linked in @exlist, which leaves
 472  * @exlist as the list of blocks that are not accounted for, which we assume
 473  * are the dead blocks of the old metadata structure.  The blocks mentioned in
 474  * @exlist can be reaped.
 475  */
 476 #define LEFT_ALIGNED    (1 << 0)
 477 #define RIGHT_ALIGNED   (1 << 1)
 478 int
 479 xfs_repair_subtract_extents(
 480         struct xfs_scrub_context        *sc,
 481         struct xfs_repair_extent_list   *exlist,
 482         struct xfs_repair_extent_list   *sublist)
 483 {
 484         struct list_head                *lp;
 485         struct xfs_repair_extent        *ex;
 486         struct xfs_repair_extent        *newex;
 487         struct xfs_repair_extent        *subex;
 488         xfs_fsblock_t                   sub_fsb;
 489         xfs_extlen_t                    sub_len;
 490         int                             state;
 491         int                             error = 0;
 492
 493         if (list_empty(&exlist->list) || list_empty(&sublist->list))
 494                 return 0;
 495         ASSERT(!list_empty(&sublist->list));
 496
 497         list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp);
 498         list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp);
 499
 500         /*
 501          * Now that we've sorted both lists, we iterate exlist once, rolling
 502          * forward through sublist and/or exlist as necessary until we find an
 503          * overlap or reach the end of either list.  We do not reset lp to the
 504          * head of exlist nor do we reset subex to the head of sublist.  The
 505          * list traversal is similar to merge sort, but we're deleting
 506          * instead.  In this manner we avoid O(n^2) operations.
 507          */
 508         subex = list_first_entry(&sublist->list, struct xfs_repair_extent,
 509                         list);
 510         lp = exlist->list.next;
 511         while (lp != &exlist->list) {
 512                 ex = list_entry(lp, struct xfs_repair_extent, list);
 513
 514                 /*
 515                  * Advance subex and/or ex until we find a pair that
 516                  * intersect or we run out of extents.
 517                  */
 518                 while (subex->fsbno + subex->len <= ex->fsbno) {
 519                         if (list_is_last(&subex->list, &sublist->list))
 520                                 goto out;
 521                         subex = list_next_entry(subex, list);
 522                 }
 523                 if (subex->fsbno >= ex->fsbno + ex->len) {
 524                         lp = lp->next;
 525                         continue;
 526                 }
 527
 528                 /* trim subex to fit the extent we have */
 529                 sub_fsb = subex->fsbno;
 530                 sub_len = subex->len;
 531                 if (subex->fsbno < ex->fsbno) {
 532                         sub_len -= ex->fsbno - subex->fsbno;
 533                         sub_fsb = ex->fsbno;
 534                 }
 535                 if (sub_len > ex->len)
 536                         sub_len = ex->len;
 537
 538                 state = 0;
 539                 if (sub_fsb == ex->fsbno)
 540                         state |= LEFT_ALIGNED;
 541                 if (sub_fsb + sub_len == ex->fsbno + ex->len)
 542                         state |= RIGHT_ALIGNED;
 543                 switch (state) {
 544                 case LEFT_ALIGNED:
 545                         /* Coincides with only the left. */
 546                         ex->fsbno += sub_len;
 547                         ex->len -= sub_len;
 548                         break;
 549                 case RIGHT_ALIGNED:
 550                         /* Coincides with only the right. */
 551                         ex->len -= sub_len;
 552                         lp = lp->next;
 553                         break;
 554                 case LEFT_ALIGNED | RIGHT_ALIGNED:
 555                         /* Total overlap, just delete ex. */
 556                         lp = lp->next;
 557                         list_del(&ex->list);
 558                         kmem_free(ex);
 559                         break;
 560                 case 0:
 561                         /*
 562                          * Deleting from the middle: add the new right extent
 563                          * and then shrink the left extent.
 564                          */
 565                         newex = kmem_alloc(sizeof(struct xfs_repair_extent),
 566                                         KM_MAYFAIL);
 567                         if (!newex) {
 568                                 error = -ENOMEM;
 569                                 goto out;
 570                         }
 571                         INIT_LIST_HEAD(&newex->list);
 572                         newex->fsbno = sub_fsb + sub_len;
 573                         newex->len = ex->fsbno + ex->len - newex->fsbno;
 574                         list_add(&newex->list, &ex->list);
 575                         ex->len = sub_fsb - ex->fsbno;
 576                         lp = lp->next;
 577                         break;
 578                 default:
 579                         ASSERT(0);
 580                         break;
 581                 }
 582         }
 583
 584 out:
 585         return error;
 586 }
 587 #undef LEFT_ALIGNED
 588 #undef RIGHT_ALIGNED
 589
 590 /*
 591  * Disposal of Blocks from Old per-AG Btrees
 592  *
 593  * Now that we've constructed a new btree to replace the damaged one, we want
 594  * to dispose of the blocks that (we think) the old btree was using.
 595  * Previously, we used the rmapbt to collect the extents (exlist) with the
 596  * rmap owner corresponding to the tree we rebuilt, collected extents for any
 597  * blocks with the same rmap owner that are owned by another data structure
 598  * (sublist), and subtracted sublist from exlist.  In theory the extents
 599  * remaining in exlist are the old btree's blocks.
 600  *
 601  * Unfortunately, it's possible that the btree was crosslinked with other
 602  * blocks on disk.  The rmap data can tell us if there are multiple owners, so
 603  * if the rmapbt says there is an owner of this block other than @oinfo, then
 604  * the block is crosslinked.  Remove the reverse mapping and continue.
 605  *
 606  * If there is one rmap record, we can free the block, which removes the
 607  * reverse mapping but doesn't add the block to the free space.  Our repair
 608  * strategy is to hope the other metadata objects crosslinked on this block
 609  * will be rebuilt (atop different blocks), thereby removing all the cross
 610  * links.
 611  *
 612  * If there are no rmap records at all, we also free the block.  If the btree
 613  * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 614  * supposed to be a rmap record and everything is ok.  For other btrees there
 615  * had to have been an rmap entry for the block to have ended up on @exlist,
 616  * so if it's gone now there's something wrong and the fs will shut down.
 617  *
 618  * Note: If there are multiple rmap records with only the same rmap owner as
 619  * the btree we're trying to rebuild and the block is indeed owned by another
 620  * data structure with the same rmap owner, then the block will be in sublist
 621  * and therefore doesn't need disposal.  If there are multiple rmap records
 622  * with only the same rmap owner but the block is not owned by something with
 623  * the same rmap owner, the block will be freed.
 624  *
 625  * The caller is responsible for locking the AG headers for the entire rebuild
 626  * operation so that nothing else can sneak in and change the AG state while
 627  * we're not looking.  We also assume that the caller already invalidated any
 628  * buffers associated with @exlist.
 629  */
 630
 631 /*
 632  * Invalidate buffers for per-AG btree blocks we're dumping.  This function
 633  * is not intended for use with file data repairs; we have bunmapi for that.
 634  */
 635 int
 636 xfs_repair_invalidate_blocks(
 637         struct xfs_scrub_context        *sc,
 638         struct xfs_repair_extent_list   *exlist)
 639 {
 640         struct xfs_repair_extent        *rex;
 641         struct xfs_repair_extent        *n;
 642         struct xfs_buf                  *bp;
 643         xfs_fsblock_t                   fsbno;
 644         xfs_agblock_t                   i;
 645
 646         /*
 647          * For each block in each extent, see if there's an incore buffer for
 648          * exactly that block; if so, invalidate it.  The buffer cache only
 649          * lets us look for one buffer at a time, so we have to look one block
 650          * at a time.  Avoid invalidating AG headers and post-EOFS blocks
 651          * because we never own those; and if we can't TRYLOCK the buffer we
 652          * assume it's owned by someone else.
 653          */
 654         for_each_xfs_repair_extent_safe(rex, n, exlist) {
 655                 for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) {
 656                         /* Skip AG headers and post-EOFS blocks */
 657                         if (!xfs_verify_fsbno(sc->mp, fsbno))
 658                                 continue;
 659                         bp = xfs_buf_incore(sc->mp->m_ddev_targp,
 660                                         XFS_FSB_TO_DADDR(sc->mp, fsbno),
 661                                         XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK);
 662                         if (bp) {
 663                                 xfs_trans_bjoin(sc->tp, bp);
 664                                 xfs_trans_binval(sc->tp, bp);
 665                         }
 666                 }
 667         }
 668
 669         return 0;
 670 }
 671
 672 /* Ensure the freelist is the correct size. */
 673 int
 674 xfs_repair_fix_freelist(
 675         struct xfs_scrub_context        *sc,
 676         bool                            can_shrink)
 677 {
 678         struct xfs_alloc_arg            args = {0};
 679
 680         args.mp = sc->mp;
 681         args.tp = sc->tp;
 682         args.agno = sc->sa.agno;
 683         args.alignment = 1;
 684         args.pag = sc->sa.pag;
 685
 686         return xfs_alloc_fix_freelist(&args,
 687                         can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
 688 }
 689
 690 /*
 691  * Put a block back on the AGFL.
 692  */
 693 STATIC int
 694 xfs_repair_put_freelist(
 695         struct xfs_scrub_context        *sc,
 696         xfs_agblock_t                   agbno)
 697 {
 698         struct xfs_owner_info           oinfo;
 699         int                             error;
 700
 701         /* Make sure there's space on the freelist. */
 702         error = xfs_repair_fix_freelist(sc, true);
 703         if (error)
 704                 return error;
 705
 706         /*
 707          * Since we're "freeing" a lost block onto the AGFL, we have to
 708          * create an rmap for the block prior to merging it or else other
 709          * parts will break.
 710          */
 711         xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
 712         error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1,
 713                         &oinfo);
 714         if (error)
 715                 return error;
 716
 717         /* Put the block on the AGFL. */
 718         error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp,
 719                         agbno, 0);
 720         if (error)
 721                 return error;
 722         xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1,
 723                         XFS_EXTENT_BUSY_SKIP_DISCARD);
 724
 725         return 0;
 726 }
 727
 728 /* Dispose of a single metadata block. */
 729 STATIC int
 730 xfs_repair_dispose_btree_block(
 731         struct xfs_scrub_context        *sc,
 732         xfs_fsblock_t                   fsbno,
 733         struct xfs_owner_info           *oinfo,
 734         enum xfs_ag_resv_type           resv)
 735 {
 736         struct xfs_btree_cur            *cur;
 737         struct xfs_buf                  *agf_bp = NULL;
 738         xfs_agnumber_t                  agno;
 739         xfs_agblock_t                   agbno;
 740         bool                            has_other_rmap;
 741         int                             error;
 742
 743         agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
 744         agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
 745
 746         /*
 747          * If we are repairing per-inode metadata, we need to read in the AGF
 748          * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
 749          * the AGF buffer that the setup functions already grabbed.
 750          */
 751         if (sc->ip) {
 752                 error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp);
 753                 if (error)
 754                         return error;
 755                 if (!agf_bp)
 756                         return -ENOMEM;
 757         } else {
 758                 agf_bp = sc->sa.agf_bp;
 759         }
 760         cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno);
 761
 762         /* Can we find any other rmappings? */
 763         error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
 764         if (error)
 765                 goto out_cur;
 766         xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 767
 768         /*
 769          * If there are other rmappings, this block is cross linked and must
 770          * not be freed.  Remove the reverse mapping and move on.  Otherwise,
 771          * we were the only owner of the block, so free the extent, which will
 772          * also remove the rmap.
 773          *
 774          * XXX: XFS doesn't support detecting the case where a single block
 775          * metadata structure is crosslinked with a multi-block structure
 776          * because the buffer cache doesn't detect aliasing problems, so we
 777          * can't fix 100% of crosslinking problems (yet).  The verifiers will
 778          * blow on writeout, the filesystem will shut down, and the admin gets
 779          * to run xfs_repair.
 780          */
 781         if (has_other_rmap)
 782                 error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo);
 783         else if (resv == XFS_AG_RESV_AGFL)
 784                 error = xfs_repair_put_freelist(sc, agbno);
 785         else
 786                 error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);
 787         if (agf_bp != sc->sa.agf_bp)
 788                 xfs_trans_brelse(sc->tp, agf_bp);
 789         if (error)
 790                 return error;
 791
 792         if (sc->ip)
 793                 return xfs_trans_roll_inode(&sc->tp, sc->ip);
 794         return xfs_repair_roll_ag_trans(sc);
 795
 796 out_cur:
 797         xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 798         if (agf_bp != sc->sa.agf_bp)
 799                 xfs_trans_brelse(sc->tp, agf_bp);
 800         return error;
 801 }
 802
 803 /* Dispose of btree blocks from an old per-AG btree. */
 804 int
 805 xfs_repair_reap_btree_extents(
 806         struct xfs_scrub_context        *sc,
 807         struct xfs_repair_extent_list   *exlist,
 808         struct xfs_owner_info           *oinfo,
 809         enum xfs_ag_resv_type           type)
 810 {
 811         struct xfs_repair_extent        *rex;
 812         struct xfs_repair_extent        *n;
 813         int                             error = 0;
 814
 815         ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb));
 816
 817         /* Dispose of every block from the old btree. */
 818         for_each_xfs_repair_extent_safe(rex, n, exlist) {
 819                 ASSERT(sc->ip != NULL ||
 820                        XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno);
 821
 822                 trace_xfs_repair_dispose_btree_extent(sc->mp,
 823                                 XFS_FSB_TO_AGNO(sc->mp, rex->fsbno),
 824                                 XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len);
 825
 826                 for (; rex->len > 0; rex->len--, rex->fsbno++) {
 827                         error = xfs_repair_dispose_btree_block(sc, rex->fsbno,
 828                                         oinfo, type);
 829                         if (error)
 830                                 goto out;
 831                 }
 832                 list_del(&rex->list);
 833                 kmem_free(rex);
 834         }
 835
 836 out:
 837         xfs_repair_cancel_btree_extents(sc, exlist);
 838         return error;
 839 }
 840
 841 /*
 842  * Finding per-AG Btree Roots for AGF/AGI Reconstruction
 843  *
 844  * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
 845  * the AG headers by using the rmap data to rummage through the AG looking for
 846  * btree roots.  This is not guaranteed to work if the AG is heavily damaged
 847  * or the rmap data are corrupt.
 848  *
 849  * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL
 850  * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
 851  * AGI is being rebuilt.  They must maintain these locks until it's safe for
 852  * other threads to change the btrees' shapes.  The caller provides
 853  * information about the btrees to look for by passing in an array of
 854  * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
 855  * The (root, height) fields will be set on return if anything is found.  The
 856  * last element of the array should have a NULL buf_ops to mark the end of the
 857  * array.
 858  *
 859  * For every rmapbt record matching any of the rmap owners in btree_info,
 860  * read each block referenced by the rmap record.  If the block is a btree
 861  * block from this filesystem matching any of the magic numbers and has a
 862  * level higher than what we've already seen, remember the block and the
 863  * height of the tree required to have such a block.  When the call completes,
 864  * we return the highest block we've found for each btree description; those
 865  * should be the roots.
 866  */
 867
 868 struct xfs_repair_findroot {
 869         struct xfs_scrub_context        *sc;
 870         struct xfs_buf                  *agfl_bp;
 871         struct xfs_agf                  *agf;
 872         struct xfs_repair_find_ag_btree *btree_info;
 873 };
 874
 875 /* See if our block is in the AGFL. */
 876 STATIC int
 877 xfs_repair_findroot_agfl_walk(
 878         struct xfs_mount                *mp,
 879         xfs_agblock_t                   bno,
 880         void                            *priv)
 881 {
 882         xfs_agblock_t                   *agbno = priv;
 883
 884         return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0;
 885 }
 886
 887 /* Does this block match the btree information passed in? */
 888 STATIC int
 889 xfs_repair_findroot_block(
 890         struct xfs_repair_findroot      *ri,
 891         struct xfs_repair_find_ag_btree *fab,
 892         uint64_t                        owner,
 893         xfs_agblock_t                   agbno,
 894         bool                            *found_it)
 895 {
 896         struct xfs_mount                *mp = ri->sc->mp;
 897         struct xfs_buf                  *bp;
 898         struct xfs_btree_block          *btblock;
 899         xfs_daddr_t                     daddr;
 900         int                             error;
 901
 902         daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
 903
 904         /*
 905          * Blocks in the AGFL have stale contents that might just happen to
 906          * have a matching magic and uuid.  We don't want to pull these blocks
 907          * in as part of a tree root, so we have to filter out the AGFL stuff
 908          * here.  If the AGFL looks insane we'll just refuse to repair.
 909          */
 910         if (owner == XFS_RMAP_OWN_AG) {
 911                 error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
 912                                 xfs_repair_findroot_agfl_walk, &agbno);
 913                 if (error == XFS_BTREE_QUERY_RANGE_ABORT)
 914                         return 0;
 915                 if (error)
 916                         return error;
 917         }
 918
 919         error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
 920                         mp->m_bsize, 0, &bp, NULL);
 921         if (error)
 922                 return error;
 923
 924         /*
 925          * Does this look like a block matching our fs and higher than any
 926          * other block we've found so far?  If so, reattach buffer verifiers
 927          * so the AIL won't complain if the buffer is also dirty.
 928          */
 929         btblock = XFS_BUF_TO_BLOCK(bp);
 930         if (be32_to_cpu(btblock->bb_magic) != fab->magic)
 931                 goto out;
 932         if (xfs_sb_version_hascrc(&mp->m_sb) &&
 933             !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 934                 goto out;
 935         bp->b_ops = fab->buf_ops;
 936
 937         /* Ignore this block if it's lower in the tree than we've seen. */
 938         if (fab->root != NULLAGBLOCK &&
 939             xfs_btree_get_level(btblock) < fab->height)
 940                 goto out;
 941
 942         /* Make sure we pass the verifiers. */
 943         bp->b_ops->verify_read(bp);
 944         if (bp->b_error)
 945                 goto out;
 946         fab->root = agbno;
 947         fab->height = xfs_btree_get_level(btblock) + 1;
 948         *found_it = true;
 949
 950         trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno,
 951                         be32_to_cpu(btblock->bb_magic), fab->height - 1);
 952 out:
 953         xfs_trans_brelse(ri->sc->tp, bp);
 954         return error;
 955 }
 956
 957 /*
 958  * Do any of the blocks in this rmap record match one of the btrees we're
 959  * looking for?
 960  */
 961 STATIC int
 962 xfs_repair_findroot_rmap(
 963         struct xfs_btree_cur            *cur,
 964         struct xfs_rmap_irec            *rec,
 965         void                            *priv)
 966 {
 967         struct xfs_repair_findroot      *ri = priv;
 968         struct xfs_repair_find_ag_btree *fab;
 969         xfs_agblock_t                   b;
 970         bool                            found_it;
 971         int                             error = 0;
 972
 973         /* Ignore anything that isn't AG metadata. */
 974         if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
 975                 return 0;
 976
 977         /* Otherwise scan each block + btree type. */
 978         for (b = 0; b < rec->rm_blockcount; b++) {
 979                 found_it = false;
 980                 for (fab = ri->btree_info; fab->buf_ops; fab++) {
 981                         if (rec->rm_owner != fab->rmap_owner)
 982                                 continue;
 983                         error = xfs_repair_findroot_block(ri, fab,
 984                                         rec->rm_owner, rec->rm_startblock + b,
 985                                         &found_it);
 986                         if (error)
 987                                 return error;
 988                         if (found_it)
 989                                 break;
 990                 }
 991         }
 992
 993         return 0;
 994 }
 995
 996 /* Find the roots of the per-AG btrees described in btree_info. */
 997 int
 998 xfs_repair_find_ag_btree_roots(
 999         struct xfs_scrub_context        *sc,
1000         struct xfs_buf                  *agf_bp,
1001         struct xfs_repair_find_ag_btree *btree_info,
1002         struct xfs_buf                  *agfl_bp)
1003 {
1004         struct xfs_mount                *mp = sc->mp;
1005         struct xfs_repair_findroot      ri;
1006         struct xfs_repair_find_ag_btree *fab;
1007         struct xfs_btree_cur            *cur;
1008         int                             error;
1009
1010         ASSERT(xfs_buf_islocked(agf_bp));
1011         ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));
1012
1013         ri.sc = sc;
1014         ri.btree_info = btree_info;
1015         ri.agf = XFS_BUF_TO_AGF(agf_bp);
1016         ri.agfl_bp = agfl_bp;
1017         for (fab = btree_info; fab->buf_ops; fab++) {
1018                 ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
1019                 ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
1020                 fab->root = NULLAGBLOCK;
1021                 fab->height = 0;
1022         }
1023
1024         cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
1025         error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri);
1026         xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
1027
1028         return error;
1029 }
1030
1031 /* Force a quotacheck the next time we mount. */
1032 void
1033 xfs_repair_force_quotacheck(
1034         struct xfs_scrub_context        *sc,
1035         uint                            dqtype)
1036 {
1037         uint                            flag;
1038
1039         flag = xfs_quota_chkd_flag(dqtype);
1040         if (!(flag & sc->mp->m_qflags))
1041                 return;
1042
1043         sc->mp->m_qflags &= ~flag;
1044         spin_lock(&sc->mp->m_sb_lock);
1045         sc->mp->m_sb.sb_qflags &= ~flag;
1046         spin_unlock(&sc->mp->m_sb_lock);
1047         xfs_log_sb(sc->tp);
1048 }
1049
1050 /*
1051  * Attach dquots to this inode, or schedule quotacheck to fix them.
1052  *
1053  * This function ensures that the appropriate dquots are attached to an inode.
1054  * We cannot allow the dquot code to allocate an on-disk dquot block here
1055  * because we're already in transaction context with the inode locked.  The
1056  * on-disk dquot should already exist anyway.  If the quota code signals
1057  * corruption or missing quota information, schedule quotacheck, which will
1058  * repair corruptions in the quota metadata.
1059  */
1060 int
1061 xfs_repair_ino_dqattach(
1062         struct xfs_scrub_context        *sc)
1063 {
1064         int                             error;
1065
1066         error = xfs_qm_dqattach_locked(sc->ip, false);
1067         switch (error) {
1068         case -EFSBADCRC:
1069         case -EFSCORRUPTED:
1070         case -ENOENT:
1071                 xfs_err_ratelimited(sc->mp,
1072 "inode %llu repair encountered quota error %d, quotacheck forced.",
1073                                 (unsigned long long)sc->ip->i_ino, error);
1074                 if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
1075                         xfs_repair_force_quotacheck(sc, XFS_DQ_USER);
1076                 if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
1077                         xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP);
1078                 if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
1079                         xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ);
1080                 /* fall through */
1081         case -ESRCH:
1082                 error = 0;
1083                 break;
1084         default:
1085                 break;
1086         }
1087
1088         return error;
1089 }