fs/xfs/xfs_fsops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_fs.h"
  20 #include "xfs_shared.h"
  21 #include "xfs_format.h"
  22 #include "xfs_log_format.h"
  23 #include "xfs_trans_resv.h"
  24 #include "xfs_sb.h"
  25 #include "xfs_mount.h"
  26 #include "xfs_defer.h"
  27 #include "xfs_trans.h"
  28 #include "xfs_error.h"
  29 #include "xfs_btree.h"
  30 #include "xfs_alloc.h"
  31 #include "xfs_fsops.h"
  32 #include "xfs_trans_space.h"
  33 #include "xfs_rtalloc.h"
  34 #include "xfs_trace.h"
  35 #include "xfs_log.h"
  36 #include "xfs_ag.h"
  37 #include "xfs_ag_resv.h"
  38
  39 /*
  40  * growfs operations
  41  */
  42 static int
  43 xfs_growfs_data_private(
  44         xfs_mount_t             *mp,            /* mount point for filesystem */
  45         xfs_growfs_data_t       *in)            /* growfs data input struct */
  46 {
  47         xfs_buf_t               *bp;
  48         int                     error;
  49         xfs_agnumber_t          nagcount;
  50         xfs_agnumber_t          nagimax = 0;
  51         xfs_rfsblock_t          nb, nb_mod;
  52         xfs_rfsblock_t          new;
  53         xfs_agnumber_t          oagcount;
  54         xfs_trans_t             *tp;
  55         LIST_HEAD               (buffer_list);
  56         struct aghdr_init_data  id = {};
  57
  58         nb = in->newblocks;
  59         if (nb < mp->m_sb.sb_dblocks)
  60                 return -EINVAL;
  61         if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
  62                 return error;
  63         error = xfs_buf_read_uncached(mp->m_ddev_targp,
  64                                 XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
  65                                 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
  66         if (error)
  67                 return error;
  68         xfs_buf_relse(bp);
  69
  70         new = nb;       /* use new as a temporary here */
  71         nb_mod = do_div(new, mp->m_sb.sb_agblocks);
  72         nagcount = new + (nb_mod != 0);
  73         if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
  74                 nagcount--;
  75                 nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
  76                 if (nb < mp->m_sb.sb_dblocks)
  77                         return -EINVAL;
  78         }
  79         new = nb - mp->m_sb.sb_dblocks;
  80         oagcount = mp->m_sb.sb_agcount;
  81
  82         /* allocate the new per-ag structures */
  83         if (nagcount > oagcount) {
  84                 error = xfs_initialize_perag(mp, nagcount, &nagimax);
  85                 if (error)
  86                         return error;
  87         }
  88
  89         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
  90                         XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
  91         if (error)
  92                 return error;
  93
  94         /*
  95          * Write new AG headers to disk. Non-transactional, but need to be
  96          * written and completed prior to the growfs transaction being logged.
  97          * To do this, we use a delayed write buffer list and wait for
  98          * submission and IO completion of the list as a whole. This allows the
  99          * IO subsystem to merge all the AG headers in a single AG into a single
 100          * IO and hide most of the latency of the IO from us.
 101          *
 102          * This also means that if we get an error whilst building the buffer
 103          * list to write, we can cancel the entire list without having written
 104          * anything.
 105          */
 106         INIT_LIST_HEAD(&id.buffer_list);
 107         for (id.agno = nagcount - 1;
 108              id.agno >= oagcount;
 109              id.agno--, new -= id.agsize) {
 110
 111                 if (id.agno == nagcount - 1)
 112                         id.agsize = nb -
 113                                 (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
 114                 else
 115                         id.agsize = mp->m_sb.sb_agblocks;
 116
 117                 error = xfs_ag_init_headers(mp, &id);
 118                 if (error) {
 119                         xfs_buf_delwri_cancel(&id.buffer_list);
 120                         goto out_trans_cancel;
 121                 }
 122         }
 123         error = xfs_buf_delwri_submit(&id.buffer_list);
 124         if (error)
 125                 goto out_trans_cancel;
 126
 127         xfs_trans_agblocks_delta(tp, id.nfree);
 128
 129         /* If there are new blocks in the old last AG, extend it. */
 130         if (new) {
 131                 error = xfs_ag_extend_space(mp, tp, &id, new);
 132                 if (error)
 133                         goto out_trans_cancel;
 134         }
 135
 136         /*
 137          * Update changed superblock fields transactionally. These are not
 138          * seen by the rest of the world until the transaction commit applies
 139          * them atomically to the superblock.
 140          */
 141         if (nagcount > oagcount)
 142                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
 143         if (nb > mp->m_sb.sb_dblocks)
 144                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
 145                                  nb - mp->m_sb.sb_dblocks);
 146         if (id.nfree)
 147                 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
 148         xfs_trans_set_sync(tp);
 149         error = xfs_trans_commit(tp);
 150         if (error)
 151                 return error;
 152
 153         /* New allocation groups fully initialized, so update mount struct */
 154         if (nagimax)
 155                 mp->m_maxagi = nagimax;
 156         xfs_set_low_space_thresholds(mp);
 157         mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 158
 159         /*
 160          * If we expanded the last AG, free the per-AG reservation
 161          * so we can reinitialize it with the new size.
 162          */
 163         if (new) {
 164                 struct xfs_perag        *pag;
 165
 166                 pag = xfs_perag_get(mp, id.agno);
 167                 error = xfs_ag_resv_free(pag);
 168                 xfs_perag_put(pag);
 169                 if (error)
 170                         return error;
 171         }
 172
 173         /*
 174          * Reserve AG metadata blocks. ENOSPC here does not mean there was a
 175          * growfs failure, just that there still isn't space for new user data
 176          * after the grow has been run.
 177          */
 178         error = xfs_fs_reserve_ag_blocks(mp);
 179         if (error == -ENOSPC)
 180                 error = 0;
 181         return error;
 182
 183 out_trans_cancel:
 184         xfs_trans_cancel(tp);
 185         return error;
 186 }
 187
 188 static int
 189 xfs_growfs_log_private(
 190         xfs_mount_t             *mp,    /* mount point for filesystem */
 191         xfs_growfs_log_t        *in)    /* growfs log input struct */
 192 {
 193         xfs_extlen_t            nb;
 194
 195         nb = in->newblocks;
 196         if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
 197                 return -EINVAL;
 198         if (nb == mp->m_sb.sb_logblocks &&
 199             in->isint == (mp->m_sb.sb_logstart != 0))
 200                 return -EINVAL;
 201         /*
 202          * Moving the log is hard, need new interfaces to sync
 203          * the log first, hold off all activity while moving it.
 204          * Can have shorter or longer log in the same space,
 205          * or transform internal to external log or vice versa.
 206          */
 207         return -ENOSYS;
 208 }
 209
 210 static int
 211 xfs_growfs_imaxpct(
 212         struct xfs_mount        *mp,
 213         __u32                   imaxpct)
 214 {
 215         struct xfs_trans        *tp;
 216         int                     dpct;
 217         int                     error;
 218
 219         if (imaxpct > 100)
 220                 return -EINVAL;
 221
 222         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
 223                         XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
 224         if (error)
 225                 return error;
 226
 227         dpct = imaxpct - mp->m_sb.sb_imax_pct;
 228         xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
 229         xfs_trans_set_sync(tp);
 230         return xfs_trans_commit(tp);
 231 }
 232
 233 /*
 234  * protected versions of growfs function acquire and release locks on the mount
 235  * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
 236  * XFS_IOC_FSGROWFSRT
 237  */
 238 int
 239 xfs_growfs_data(
 240         struct xfs_mount        *mp,
 241         struct xfs_growfs_data  *in)
 242 {
 243         int                     error = 0;
 244
 245         if (!capable(CAP_SYS_ADMIN))
 246                 return -EPERM;
 247         if (!mutex_trylock(&mp->m_growlock))
 248                 return -EWOULDBLOCK;
 249
 250         /* update imaxpct separately to the physical grow of the filesystem */
 251         if (in->imaxpct != mp->m_sb.sb_imax_pct) {
 252                 error = xfs_growfs_imaxpct(mp, in->imaxpct);
 253                 if (error)
 254                         goto out_error;
 255         }
 256
 257         if (in->newblocks != mp->m_sb.sb_dblocks) {
 258                 error = xfs_growfs_data_private(mp, in);
 259                 if (error)
 260                         goto out_error;
 261         }
 262
 263         /* Post growfs calculations needed to reflect new state in operations */
 264         if (mp->m_sb.sb_imax_pct) {
 265                 uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
 266                 do_div(icount, 100);
 267                 mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
 268         } else
 269                 mp->m_maxicount = 0;
 270
 271         /* Update secondary superblocks now the physical grow has completed */
 272         error = xfs_update_secondary_sbs(mp);
 273
 274 out_error:
 275         /*
 276          * Increment the generation unconditionally, the error could be from
 277          * updating the secondary superblocks, in which case the new size
 278          * is live already.
 279          */
 280         mp->m_generation++;
 281         mutex_unlock(&mp->m_growlock);
 282         return error;
 283 }
 284
 285 int
 286 xfs_growfs_log(
 287         xfs_mount_t             *mp,
 288         xfs_growfs_log_t        *in)
 289 {
 290         int error;
 291
 292         if (!capable(CAP_SYS_ADMIN))
 293                 return -EPERM;
 294         if (!mutex_trylock(&mp->m_growlock))
 295                 return -EWOULDBLOCK;
 296         error = xfs_growfs_log_private(mp, in);
 297         mutex_unlock(&mp->m_growlock);
 298         return error;
 299 }
 300
 301 /*
 302  * exported through ioctl XFS_IOC_FSCOUNTS
 303  */
 304
 305 int
 306 xfs_fs_counts(
 307         xfs_mount_t             *mp,
 308         xfs_fsop_counts_t       *cnt)
 309 {
 310         cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
 311         cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
 312         cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
 313                                                 mp->m_alloc_set_aside;
 314
 315         spin_lock(&mp->m_sb_lock);
 316         cnt->freertx = mp->m_sb.sb_frextents;
 317         spin_unlock(&mp->m_sb_lock);
 318         return 0;
 319 }
 320
 321 /*
 322  * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
 323  *
 324  * xfs_reserve_blocks is called to set m_resblks
 325  * in the in-core mount table. The number of unused reserved blocks
 326  * is kept in m_resblks_avail.
 327  *
 328  * Reserve the requested number of blocks if available. Otherwise return
 329  * as many as possible to satisfy the request. The actual number
 330  * reserved are returned in outval
 331  *
 332  * A null inval pointer indicates that only the current reserved blocks
 333  * available  should  be returned no settings are changed.
 334  */
 335
 336 int
 337 xfs_reserve_blocks(
 338         xfs_mount_t             *mp,
 339         uint64_t              *inval,
 340         xfs_fsop_resblks_t      *outval)
 341 {
 342         int64_t                 lcounter, delta;
 343         int64_t                 fdblks_delta = 0;
 344         uint64_t                request;
 345         int64_t                 free;
 346         int                     error = 0;
 347
 348         /* If inval is null, report current values and return */
 349         if (inval == (uint64_t *)NULL) {
 350                 if (!outval)
 351                         return -EINVAL;
 352                 outval->resblks = mp->m_resblks;
 353                 outval->resblks_avail = mp->m_resblks_avail;
 354                 return 0;
 355         }
 356
 357         request = *inval;
 358
 359         /*
 360          * With per-cpu counters, this becomes an interesting problem. we need
 361          * to work out if we are freeing or allocation blocks first, then we can
 362          * do the modification as necessary.
 363          *
 364          * We do this under the m_sb_lock so that if we are near ENOSPC, we will
 365          * hold out any changes while we work out what to do. This means that
 366          * the amount of free space can change while we do this, so we need to
 367          * retry if we end up trying to reserve more space than is available.
 368          */
 369         spin_lock(&mp->m_sb_lock);
 370
 371         /*
 372          * If our previous reservation was larger than the current value,
 373          * then move any unused blocks back to the free pool. Modify the resblks
 374          * counters directly since we shouldn't have any problems unreserving
 375          * space.
 376          */
 377         if (mp->m_resblks > request) {
 378                 lcounter = mp->m_resblks_avail - request;
 379                 if (lcounter  > 0) {            /* release unused blocks */
 380                         fdblks_delta = lcounter;
 381                         mp->m_resblks_avail -= lcounter;
 382                 }
 383                 mp->m_resblks = request;
 384                 if (fdblks_delta) {
 385                         spin_unlock(&mp->m_sb_lock);
 386                         error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
 387                         spin_lock(&mp->m_sb_lock);
 388                 }
 389
 390                 goto out;
 391         }
 392
 393         /*
 394          * If the request is larger than the current reservation, reserve the
 395          * blocks before we update the reserve counters. Sample m_fdblocks and
 396          * perform a partial reservation if the request exceeds free space.
 397          */
 398         error = -ENOSPC;
 399         do {
 400                 free = percpu_counter_sum(&mp->m_fdblocks) -
 401                                                 mp->m_alloc_set_aside;
 402                 if (!free)
 403                         break;
 404
 405                 delta = request - mp->m_resblks;
 406                 lcounter = free - delta;
 407                 if (lcounter < 0)
 408                         /* We can't satisfy the request, just get what we can */
 409                         fdblks_delta = free;
 410                 else
 411                         fdblks_delta = delta;
 412
 413                 /*
 414                  * We'll either succeed in getting space from the free block
 415                  * count or we'll get an ENOSPC. If we get a ENOSPC, it means
 416                  * things changed while we were calculating fdblks_delta and so
 417                  * we should try again to see if there is anything left to
 418                  * reserve.
 419                  *
 420                  * Don't set the reserved flag here - we don't want to reserve
 421                  * the extra reserve blocks from the reserve.....
 422                  */
 423                 spin_unlock(&mp->m_sb_lock);
 424                 error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
 425                 spin_lock(&mp->m_sb_lock);
 426         } while (error == -ENOSPC);
 427
 428         /*
 429          * Update the reserve counters if blocks have been successfully
 430          * allocated.
 431          */
 432         if (!error && fdblks_delta) {
 433                 mp->m_resblks += fdblks_delta;
 434                 mp->m_resblks_avail += fdblks_delta;
 435         }
 436
 437 out:
 438         if (outval) {
 439                 outval->resblks = mp->m_resblks;
 440                 outval->resblks_avail = mp->m_resblks_avail;
 441         }
 442
 443         spin_unlock(&mp->m_sb_lock);
 444         return error;
 445 }
 446
 447 int
 448 xfs_fs_goingdown(
 449         xfs_mount_t     *mp,
 450         uint32_t        inflags)
 451 {
 452         switch (inflags) {
 453         case XFS_FSOP_GOING_FLAGS_DEFAULT: {
 454                 struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
 455
 456                 if (sb && !IS_ERR(sb)) {
 457                         xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 458                         thaw_bdev(sb->s_bdev, sb);
 459                 }
 460
 461                 break;
 462         }
 463         case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
 464                 xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
 465                 break;
 466         case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
 467                 xfs_force_shutdown(mp,
 468                                 SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
 469                 break;
 470         default:
 471                 return -EINVAL;
 472         }
 473
 474         return 0;
 475 }
 476
 477 /*
 478  * Force a shutdown of the filesystem instantly while keeping the filesystem
 479  * consistent. We don't do an unmount here; just shutdown the shop, make sure
 480  * that absolutely nothing persistent happens to this filesystem after this
 481  * point.
 482  */
 483 void
 484 xfs_do_force_shutdown(
 485         xfs_mount_t     *mp,
 486         int             flags,
 487         char            *fname,
 488         int             lnnum)
 489 {
 490         int             logerror;
 491
 492         logerror = flags & SHUTDOWN_LOG_IO_ERROR;
 493
 494         if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 495                 xfs_notice(mp,
 496         "%s(0x%x) called from line %d of file %s.  Return address = "PTR_FMT,
 497                         __func__, flags, lnnum, fname, __return_address);
 498         }
 499         /*
 500          * No need to duplicate efforts.
 501          */
 502         if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
 503                 return;
 504
 505         /*
 506          * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
 507          * queue up anybody new on the log reservations, and wakes up
 508          * everybody who's sleeping on log reservations to tell them
 509          * the bad news.
 510          */
 511         if (xfs_log_force_umount(mp, logerror))
 512                 return;
 513
 514         if (flags & SHUTDOWN_CORRUPT_INCORE) {
 515                 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
 516     "Corruption of in-memory data detected.  Shutting down filesystem");
 517                 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
 518                         xfs_stack_trace();
 519         } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 520                 if (logerror) {
 521                         xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
 522                 "Log I/O Error Detected.  Shutting down filesystem");
 523                 } else if (flags & SHUTDOWN_DEVICE_REQ) {
 524                         xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
 525                 "All device paths lost.  Shutting down filesystem");
 526                 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
 527                         xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
 528                 "I/O Error Detected. Shutting down filesystem");
 529                 }
 530         }
 531         if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
 532                 xfs_alert(mp,
 533         "Please umount the filesystem and rectify the problem(s)");
 534         }
 535 }
 536
 537 /*
 538  * Reserve free space for per-AG metadata.
 539  */
 540 int
 541 xfs_fs_reserve_ag_blocks(
 542         struct xfs_mount        *mp)
 543 {
 544         xfs_agnumber_t          agno;
 545         struct xfs_perag        *pag;
 546         int                     error = 0;
 547         int                     err2;
 548
 549         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 550                 pag = xfs_perag_get(mp, agno);
 551                 err2 = xfs_ag_resv_init(pag);
 552                 xfs_perag_put(pag);
 553                 if (err2 && !error)
 554                         error = err2;
 555         }
 556
 557         if (error && error != -ENOSPC) {
 558                 xfs_warn(mp,
 559         "Error %d reserving per-AG metadata reserve pool.", error);
 560                 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 561         }
 562
 563         return error;
 564 }
 565
 566 /*
 567  * Free space reserved for per-AG metadata.
 568  */
 569 int
 570 xfs_fs_unreserve_ag_blocks(
 571         struct xfs_mount        *mp)
 572 {
 573         xfs_agnumber_t          agno;
 574         struct xfs_perag        *pag;
 575         int                     error = 0;
 576         int                     err2;
 577
 578         for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 579                 pag = xfs_perag_get(mp, agno);
 580                 err2 = xfs_ag_resv_free(pag);
 581                 xfs_perag_put(pag);
 582                 if (err2 && !error)
 583                         error = err2;
 584         }
 585
 586         if (error)
 587                 xfs_warn(mp,
 588         "Error %d freeing per-AG metadata reserve pool.", error);
 589
 590         return error;
 591 }