diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 95719e161286c74a4552c60bc7a1353eb2115173..28e2d1f37267d49d6abd8e5faa07f5401c856e64 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -29,8 +29,8 @@
 #include "xfs_reflink.h"
 
 
-#define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
-                                               << mp->m_writeio_log)
+#define XFS_ALLOC_ALIGN(mp, off) \
+       (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
 
 static int
 xfs_alert_fsblock_zero(
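
The renamed macro still rounds a byte offset down to a power-of-two boundary
with a shift pair; only the naming moves from the old "write I/O size"
terminology to the new "allocation size" one. A worked example of the
arithmetic, assuming an illustrative m_allocsize_log of 16 (a 64k allocation
size):

	/* (((off) >> 16) << 16) clears the low 16 bits of the offset */
	xfs_off_t	off     = 0x12345;			/* 74565 */
	xfs_off_t	aligned = XFS_ALLOC_ALIGN(mp, off);	/* 0x10000 (64k) */
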
@@ -57,6 +57,7 @@ xfs_bmbt_to_iomap(
        u16                     flags)
 {
        struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
 
        if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
                return xfs_alert_fsblock_zero(ip, imap);
@@ -77,8 +78,8 @@ xfs_bmbt_to_iomap(
        }
        iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
        iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
-       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-       iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+       iomap->bdev = target->bt_bdev;
+       iomap->dax_dev = target->bt_daxdev;
        iomap->flags = flags;
 
        if (xfs_ipincount(ip) &&
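
Both device pointers now come from the inode's buffer target rather than the
per-inode lookup helpers. xfs_inode_buftarg() is defined outside this diff;
presumably it selects the realtime or data device target, along these lines
(a sketch, not part of this patch):

	static inline struct xfs_buftarg *
	xfs_inode_buftarg(struct xfs_inode *ip)
	{
		if (XFS_IS_REALTIME_INODE(ip))
			return ip->i_mount->m_rtdev_targp;
		return ip->i_mount->m_ddev_targp;
	}
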
@@ -94,18 +95,30 @@ xfs_hole_to_iomap(
        xfs_fileoff_t           offset_fsb,
        xfs_fileoff_t           end_fsb)
 {
+       struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+
        iomap->addr = IOMAP_NULL_ADDR;
        iomap->type = IOMAP_HOLE;
        iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
        iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
-       iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
-       iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+       iomap->bdev = target->bt_bdev;
+       iomap->dax_dev = target->bt_daxdev;
+}
+
+static inline xfs_fileoff_t
+xfs_iomap_end_fsb(
+       struct xfs_mount        *mp,
+       loff_t                  offset,
+       loff_t                  count)
+{
+       ASSERT(offset <= mp->m_super->s_maxbytes);
+       return min(XFS_B_TO_FSB(mp, offset + count),
+                  XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
 }
 
-xfs_extlen_t
+static xfs_extlen_t
 xfs_eof_alignment(
-       struct xfs_inode        *ip,
-       xfs_extlen_t            extsize)
+       struct xfs_inode        *ip)
 {
        struct xfs_mount        *mp = ip->i_mount;
        xfs_extlen_t            align = 0;
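
The new xfs_iomap_end_fsb() helper folds the recurring "convert to FSBs and
clamp to s_maxbytes" pattern into one place. Its use later in this patch looks
like this (condensed from the read path below):

	xfs_fileoff_t	offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t	end_fsb = xfs_iomap_end_fsb(mp, offset, length);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			       &nimaps, 0);
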
@@ -128,111 +141,80 @@ xfs_eof_alignment(
                        align = 0;
        }
 
-       /*
-        * Always round up the allocation request to an extent boundary
-        * (when file on a real-time subvolume or has di_extsize hint).
-        */
-       if (extsize) {
-               if (align)
-                       align = roundup_64(align, extsize);
-               else
-                       align = extsize;
-       }
-
        return align;
 }
 
-STATIC int
+/*
+ * Check if end_fsb is outside the last extent, and if so grow it to the next
+ * stripe unit boundary.
+ */
+xfs_fileoff_t
 xfs_iomap_eof_align_last_fsb(
        struct xfs_inode        *ip,
-       xfs_extlen_t            extsize,
-       xfs_fileoff_t           *last_fsb)
+       xfs_fileoff_t           end_fsb)
 {
-       xfs_extlen_t            align = xfs_eof_alignment(ip, extsize);
+       struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       xfs_extlen_t            extsz = xfs_get_extsz_hint(ip);
+       xfs_extlen_t            align = xfs_eof_alignment(ip);
+       struct xfs_bmbt_irec    irec;
+       struct xfs_iext_cursor  icur;
+
+       ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+
+       /*
+        * Always round up the allocation request to the extent hint boundary.
+        */
+       if (extsz) {
+               if (align)
+                       align = roundup_64(align, extsz);
+               else
+                       align = extsz;
+       }
 
        if (align) {
-               xfs_fileoff_t   new_last_fsb = roundup_64(*last_fsb, align);
-               int             eof, error;
+               xfs_fileoff_t   aligned_end_fsb = roundup_64(end_fsb, align);
 
-               error = xfs_bmap_eof(ip, new_last_fsb, XFS_DATA_FORK, &eof);
-               if (error)
-                       return error;
-               if (eof)
-                       *last_fsb = new_last_fsb;
+               xfs_iext_last(ifp, &icur);
+               if (!xfs_iext_get_extent(ifp, &icur, &irec) ||
+                   aligned_end_fsb >= irec.br_startoff + irec.br_blockcount)
+                       return aligned_end_fsb;
        }
-       return 0;
+
+       return end_fsb;
 }
 
 int
 xfs_iomap_write_direct(
-       xfs_inode_t     *ip,
-       xfs_off_t       offset,
-       size_t          count,
-       xfs_bmbt_irec_t *imap,
-       int             nmaps)
+       struct xfs_inode        *ip,
+       xfs_fileoff_t           offset_fsb,
+       xfs_fileoff_t           count_fsb,
+       struct xfs_bmbt_irec    *imap)
 {
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_fileoff_t   offset_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_filblks_t   count_fsb, resaligned;
-       xfs_extlen_t    extsz;
-       int             nimaps;
-       int             quota_flag;
-       int             rt;
-       xfs_trans_t     *tp;
-       uint            qblocks, resblks, resrtextents;
-       int             error;
-       int             lockmode;
-       int             bmapi_flags = XFS_BMAPI_PREALLOC;
-       uint            tflags = 0;
-
-       rt = XFS_IS_REALTIME_INODE(ip);
-       extsz = xfs_get_extsz_hint(ip);
-       lockmode = XFS_ILOCK_SHARED;    /* locked by caller */
-
-       ASSERT(xfs_isilocked(ip, lockmode));
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       xfs_filblks_t           resaligned;
+       int                     nimaps;
+       int                     quota_flag;
+       uint                    qblocks, resblks;
+       unsigned int            resrtextents = 0;
+       int                     error;
+       int                     bmapi_flags = XFS_BMAPI_PREALLOC;
+       uint                    tflags = 0;
 
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
-       if ((offset + count) > XFS_ISIZE(ip)) {
-               /*
-                * Assert that the in-core extent list is present since this can
-                * call xfs_iread_extents() and we only have the ilock shared.
-                * This should be safe because the lock was held around a bmapi
-                * call in the caller and we only need it to access the in-core
-                * list.
-                */
-               ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
-                                                               XFS_IFEXTENTS);
-               error = xfs_iomap_eof_align_last_fsb(ip, extsz, &last_fsb);
-               if (error)
-                       goto out_unlock;
-       } else {
-               if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
-                       last_fsb = min(last_fsb, (xfs_fileoff_t)
-                                       imap->br_blockcount +
-                                       imap->br_startoff);
-       }
-       count_fsb = last_fsb - offset_fsb;
        ASSERT(count_fsb > 0);
-       resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);
 
-       if (unlikely(rt)) {
+       resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+                                          xfs_get_extsz_hint(ip));
+       if (unlikely(XFS_IS_REALTIME_INODE(ip))) {
                resrtextents = qblocks = resaligned;
                resrtextents /= mp->m_sb.sb_rextsize;
                resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                quota_flag = XFS_QMOPT_RES_RTBLKS;
        } else {
-               resrtextents = 0;
                resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
                quota_flag = XFS_QMOPT_RES_REGBLKS;
        }
 
-       /*
-        * Drop the shared lock acquired by the caller, attach the dquot if
-        * necessary and move on to transaction setup.
-        */
-       xfs_iunlock(ip, lockmode);
        error = xfs_qm_dqattach(ip);
        if (error)
                return error;
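
xfs_iomap_write_direct() now takes an already-computed block range and is
called with the inode unlocked; the EOF alignment and hole trimming that used
to happen here move out to the caller. The rewritten direct write path later
in this patch calls it as follows:

	if (offset + length > XFS_ISIZE(ip))
		end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
	else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
		end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
	xfs_iunlock(ip, lockmode);

	error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
			&imap);
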
@@ -262,8 +244,7 @@ xfs_iomap_write_direct(
        if (error)
                return error;
 
-       lockmode = XFS_ILOCK_EXCL;
-       xfs_ilock(ip, lockmode);
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
 
        error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
        if (error)
@@ -276,8 +257,8 @@ xfs_iomap_write_direct(
         * caller gave to us.
         */
        nimaps = 1;
-       error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-                               bmapi_flags, resblks, imap, &nimaps);
+       error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0,
+                               imap, &nimaps);
        if (error)
                goto out_res_cancel;
 
@@ -300,7 +281,7 @@ xfs_iomap_write_direct(
                error = xfs_alert_fsblock_zero(ip, imap);
 
 out_unlock:
-       xfs_iunlock(ip, lockmode);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 
 out_res_cancel:
@@ -409,19 +390,19 @@ xfs_iomap_prealloc_size(
        if (offset + count <= XFS_ISIZE(ip))
                return 0;
 
-       if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) &&
-           (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_writeio_blocks)))
+       if (!(mp->m_flags & XFS_MOUNT_ALLOCSIZE) &&
+           (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)))
                return 0;
 
        /*
         * If an explicit allocsize is set, the file is small, or we
         * are writing behind a hole, then use the minimum prealloc:
         */
-       if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ||
+       if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) ||
            XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) ||
            !xfs_iext_peek_prev_extent(ifp, icur, &prev) ||
            prev.br_startoff + prev.br_blockcount < offset_fsb)
-               return mp->m_writeio_blocks;
+               return mp->m_allocsize_blocks;
 
        /*
         * Determine the initial size of the preallocation. We are beyond the
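
The m_writeio_* fields become m_allocsize_*, matching the allocsize= mount
option that sets them. As a sketch of the relationship (the mount-time
computation lives outside this diff, so the exact form is an assumption): an
allocsize=64k mount on a 4k-block filesystem would yield

	mp->m_allocsize_log    = 16;	/* log2(64k) */
	mp->m_allocsize_blocks = 1 << (mp->m_allocsize_log -
				       mp->m_sb.sb_blocklog);	/* 16 blocks */
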
@@ -514,226 +495,13 @@ xfs_iomap_prealloc_size(
        while (alloc_blocks && alloc_blocks >= freesp)
                alloc_blocks >>= 4;
 check_writeio:
-       if (alloc_blocks < mp->m_writeio_blocks)
-               alloc_blocks = mp->m_writeio_blocks;
+       if (alloc_blocks < mp->m_allocsize_blocks)
+               alloc_blocks = mp->m_allocsize_blocks;
        trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift,
-                                     mp->m_writeio_blocks);
+                                     mp->m_allocsize_blocks);
        return alloc_blocks;
 }
 
-static int
-xfs_file_iomap_begin_delay(
-       struct inode            *inode,
-       loff_t                  offset,
-       loff_t                  count,
-       unsigned                flags,
-       struct iomap            *iomap)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       xfs_fileoff_t           maxbytes_fsb =
-               XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
-       xfs_fileoff_t           end_fsb;
-       struct xfs_bmbt_irec    imap, cmap;
-       struct xfs_iext_cursor  icur, ccur;
-       xfs_fsblock_t           prealloc_blocks = 0;
-       bool                    eof = false, cow_eof = false, shared = false;
-       u16                     iomap_flags = 0;
-       int                     whichfork = XFS_DATA_FORK;
-       int                     error = 0;
-
-       ASSERT(!XFS_IS_REALTIME_INODE(ip));
-       ASSERT(!xfs_get_extsz_hint(ip));
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-       if (unlikely(XFS_TEST_ERROR(
-           (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
-            XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
-            mp, XFS_ERRTAG_BMAPIFORMAT))) {
-               XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
-               error = -EFSCORRUPTED;
-               goto out_unlock;
-       }
-
-       XFS_STATS_INC(mp, xs_blk_mapw);
-
-       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
-               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
-               if (error)
-                       goto out_unlock;
-       }
-
-       end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
-       /*
-        * Search the data fork fork first to look up our source mapping.  We
-        * always need the data fork map, as we have to return it to the
-        * iomap code so that the higher level write code can read data in to
-        * perform read-modify-write cycles for unaligned writes.
-        */
-       eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
-       if (eof)
-               imap.br_startoff = end_fsb; /* fake hole until the end */
-
-       /* We never need to allocate blocks for zeroing a hole. */
-       if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
-               xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
-               goto out_unlock;
-       }
-
-       /*
-        * Search the COW fork extent list even if we did not find a data fork
-        * extent.  This serves two purposes: first this implements the
-        * speculative preallocation using cowextsize, so that we also unshare
-        * block adjacent to shared blocks instead of just the shared blocks
-        * themselves.  Second the lookup in the extent list is generally faster
-        * than going out to the shared extent tree.
-        */
-       if (xfs_is_cow_inode(ip)) {
-               if (!ip->i_cowfp) {
-                       ASSERT(!xfs_is_reflink_inode(ip));
-                       xfs_ifork_init_cow(ip);
-               }
-               cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
-                               &ccur, &cmap);
-               if (!cow_eof && cmap.br_startoff <= offset_fsb) {
-                       trace_xfs_reflink_cow_found(ip, &cmap);
-                       whichfork = XFS_COW_FORK;
-                       goto done;
-               }
-       }
-
-       if (imap.br_startoff <= offset_fsb) {
-               /*
-                * For reflink files we may need a delalloc reservation when
-                * overwriting shared extents.   This includes zeroing of
-                * existing extents that contain data.
-                */
-               if (!xfs_is_cow_inode(ip) ||
-                   ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
-                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
-                                       &imap);
-                       goto done;
-               }
-
-               xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
-
-               /* Trim the mapping to the nearest shared extent boundary. */
-               error = xfs_inode_need_cow(ip, &imap, &shared);
-               if (error)
-                       goto out_unlock;
-
-               /* Not shared?  Just report the (potentially capped) extent. */
-               if (!shared) {
-                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
-                                       &imap);
-                       goto done;
-               }
-
-               /*
-                * Fork all the shared blocks from our write offset until the
-                * end of the extent.
-                */
-               whichfork = XFS_COW_FORK;
-               end_fsb = imap.br_startoff + imap.br_blockcount;
-       } else {
-               /*
-                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
-                * pages to keep the chunks of work done where somewhat
-                * symmetric with the work writeback does.  This is a completely
-                * arbitrary number pulled out of thin air.
-                *
-                * Note that the values needs to be less than 32-bits wide until
-                * the lower level functions are updated.
-                */
-               count = min_t(loff_t, count, 1024 * PAGE_SIZE);
-               end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
-
-               if (xfs_is_always_cow_inode(ip))
-                       whichfork = XFS_COW_FORK;
-       }
-
-       error = xfs_qm_dqattach_locked(ip, false);
-       if (error)
-               goto out_unlock;
-
-       if (eof) {
-               prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
-                               count, &icur);
-               if (prealloc_blocks) {
-                       xfs_extlen_t    align;
-                       xfs_off_t       end_offset;
-                       xfs_fileoff_t   p_end_fsb;
-
-                       end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1);
-                       p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
-                                       prealloc_blocks;
-
-                       align = xfs_eof_alignment(ip, 0);
-                       if (align)
-                               p_end_fsb = roundup_64(p_end_fsb, align);
-
-                       p_end_fsb = min(p_end_fsb, maxbytes_fsb);
-                       ASSERT(p_end_fsb > offset_fsb);
-                       prealloc_blocks = p_end_fsb - end_fsb;
-               }
-       }
-
-retry:
-       error = xfs_bmapi_reserve_delalloc(ip, whichfork, offset_fsb,
-                       end_fsb - offset_fsb, prealloc_blocks,
-                       whichfork == XFS_DATA_FORK ? &imap : &cmap,
-                       whichfork == XFS_DATA_FORK ? &icur : &ccur,
-                       whichfork == XFS_DATA_FORK ? eof : cow_eof);
-       switch (error) {
-       case 0:
-               break;
-       case -ENOSPC:
-       case -EDQUOT:
-               /* retry without any preallocation */
-               trace_xfs_delalloc_enospc(ip, offset, count);
-               if (prealloc_blocks) {
-                       prealloc_blocks = 0;
-                       goto retry;
-               }
-               /*FALLTHRU*/
-       default:
-               goto out_unlock;
-       }
-
-       /*
-        * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
-        * them out if the write happens to fail.
-        */
-       if (whichfork == XFS_DATA_FORK) {
-               iomap_flags |= IOMAP_F_NEW;
-               trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
-       } else {
-               trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
-       }
-done:
-       if (whichfork == XFS_COW_FORK) {
-               if (imap.br_startoff > offset_fsb) {
-                       xfs_trim_extent(&cmap, offset_fsb,
-                                       imap.br_startoff - offset_fsb);
-                       error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
-                                       IOMAP_F_SHARED);
-                       goto out_unlock;
-               }
-               /* ensure we only report blocks we have a reservation for */
-               xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
-               shared = true;
-       }
-       if (shared)
-               iomap_flags |= IOMAP_F_SHARED;
-       error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return error;
-}
-
 int
 xfs_iomap_write_unwritten(
        xfs_inode_t     *ip,
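
The delalloc path removed above is not dropped: it reappears below as
xfs_buffered_write_iomap_begin(), with whichfork renamed to allocfork, a
srcmap output for COW overwrites, and the extent-size-hint case handed to the
direct path up front (condensed from the function added further down):

	/* we can't use delayed allocations when using extent size hints */
	if (xfs_get_extsz_hint(ip))
		return xfs_direct_write_iomap_begin(inode, offset, count,
				flags, iomap, srcmap);
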
@@ -771,6 +539,11 @@ xfs_iomap_write_unwritten(
         */
        resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
 
+       /* Attach dquots so that bmbt splits are accounted correctly. */
+       error = xfs_qm_dqattach(ip);
+       if (error)
+               return error;
+
        do {
                /*
                 * Set up a transaction to convert the range of extents
@@ -789,6 +562,11 @@ xfs_iomap_write_unwritten(
                xfs_ilock(ip, XFS_ILOCK_EXCL);
                xfs_trans_ijoin(tp, ip, 0);
 
+               error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+                               XFS_QMOPT_RES_REGBLKS);
+               if (error)
+                       goto error_on_bmapi_transaction;
+
                /*
                 * Modify the unwritten extent state of the buffer.
                 */
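
Taken together, these two hunks make unwritten extent conversion quota-aware:
the dquots are attached once up front, and each conversion transaction then
reserves its blocks against them, so bmbt splits caused by the conversion are
charged to the owner. The per-iteration pattern becomes:

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
			XFS_QMOPT_RES_REGBLKS);
	if (error)
		goto error_on_bmapi_transaction;
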
@@ -846,23 +624,42 @@ xfs_iomap_write_unwritten(
 static inline bool
 imap_needs_alloc(
        struct inode            *inode,
+       unsigned                flags,
        struct xfs_bmbt_irec    *imap,
        int                     nimaps)
 {
-       return !nimaps ||
-               imap->br_startblock == HOLESTARTBLOCK ||
-               imap->br_startblock == DELAYSTARTBLOCK ||
-               (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
+       /* don't allocate blocks when just zeroing */
+       if (flags & IOMAP_ZERO)
+               return false;
+       if (!nimaps ||
+           imap->br_startblock == HOLESTARTBLOCK ||
+           imap->br_startblock == DELAYSTARTBLOCK)
+               return true;
+       /* we convert unwritten extents before copying the data for DAX */
+       if (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN)
+               return true;
+       return false;
 }
 
 static inline bool
-needs_cow_for_zeroing(
+imap_needs_cow(
+       struct xfs_inode        *ip,
+       unsigned int            flags,
        struct xfs_bmbt_irec    *imap,
        int                     nimaps)
 {
-       return nimaps &&
-               imap->br_startblock != HOLESTARTBLOCK &&
-               imap->br_state != XFS_EXT_UNWRITTEN;
+       if (!xfs_is_cow_inode(ip))
+               return false;
+
+       /* when zeroing we don't have to COW holes or unwritten extents */
+       if (flags & IOMAP_ZERO) {
+               if (!nimaps ||
+                   imap->br_startblock == HOLESTARTBLOCK ||
+                   imap->br_state == XFS_EXT_UNWRITTEN)
+                       return false;
+       }
+
+       return true;
 }
 
 static int
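
These two predicates let the rewritten direct write path below read as a
decision list: COW first, then allocation, otherwise report the mapping that
was found. Roughly (condensed from xfs_direct_write_iomap_begin() below, with
the COW body elided):

	if (imap_needs_cow(ip, flags, &imap, nimaps)) {
		/* break sharing; may return the COW fork mapping instead */
	}

	if (imap_needs_alloc(inode, flags, &imap, nimaps))
		goto allocate_blocks;

	/* otherwise unlock and return the existing mapping */
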
@@ -878,15 +675,8 @@ xfs_ilock_for_iomap(
         * COW writes may allocate delalloc space or convert unwritten COW
         * extents, so we need to make sure to take the lock exclusively here.
         */
-       if (xfs_is_cow_inode(ip) && is_write) {
-               /*
-                * FIXME: It could still overwrite on unshared extents and not
-                * need allocation.
-                */
-               if (flags & IOMAP_NOWAIT)
-                       return -EAGAIN;
+       if (xfs_is_cow_inode(ip) && is_write)
                mode = XFS_ILOCK_EXCL;
-       }
 
        /*
         * Extents not yet cached requires exclusive access, don't block.  This
@@ -923,7 +713,7 @@ xfs_ilock_for_iomap(
 }
 
 static int
-xfs_file_iomap_begin(
+xfs_direct_write_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
@@ -933,103 +723,63 @@ xfs_file_iomap_begin(
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_bmbt_irec    imap;
-       xfs_fileoff_t           offset_fsb, end_fsb;
+       struct xfs_bmbt_irec    imap, cmap;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, length);
        int                     nimaps = 1, error = 0;
        bool                    shared = false;
        u16                     iomap_flags = 0;
        unsigned                lockmode;
 
+       ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
 
-       if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
-                       !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
-               /* Reserve delalloc blocks for regular writeback. */
-               return xfs_file_iomap_begin_delay(inode, offset, length, flags,
-                               iomap);
-       }
-
        /*
-        * Lock the inode in the manner required for the specified operation and
-        * check for as many conditions that would result in blocking as
-        * possible. This removes most of the non-blocking checks from the
-        * mapping code below.
-        */
-       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+        * Writes that span EOF might trigger an IO size update on completion,
+        * so consider them to be dirty for the purposes of O_DSYNC, even if
+        * no other metadata changes are pending or have been made here.
+        */
+       if (offset + length > i_size_read(inode))
+               iomap_flags |= IOMAP_F_DIRTY;
+
+       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
        if (error)
                return error;
 
-       ASSERT(offset <= mp->m_super->s_maxbytes);
-       if (offset > mp->m_super->s_maxbytes - length)
-               length = mp->m_super->s_maxbytes - offset;
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       end_fsb = XFS_B_TO_FSB(mp, offset + length);
-
        error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
                               &nimaps, 0);
        if (error)
                goto out_unlock;
 
-       if (flags & IOMAP_REPORT) {
-               /* Trim the mapping to the nearest shared extent boundary. */
-               error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
-               if (error)
+       if (imap_needs_cow(ip, flags, &imap, nimaps)) {
+               error = -EAGAIN;
+               if (flags & IOMAP_NOWAIT)
                        goto out_unlock;
-       }
-
-       /* Non-modifying mapping requested, so we are done */
-       if (!(flags & (IOMAP_WRITE | IOMAP_ZERO)))
-               goto out_found;
-
-       /*
-        * Break shared extents if necessary. Checks for non-blocking IO have
-        * been done up front, so we don't need to do them here.
-        */
-       if (xfs_is_cow_inode(ip)) {
-               struct xfs_bmbt_irec    cmap;
-               bool                    directio = (flags & IOMAP_DIRECT);
-
-               /* if zeroing doesn't need COW allocation, then we are done. */
-               if ((flags & IOMAP_ZERO) &&
-                   !needs_cow_for_zeroing(&imap, nimaps))
-                       goto out_found;
 
                /* may drop and re-acquire the ilock */
-               cmap = imap;
-               error = xfs_reflink_allocate_cow(ip, &cmap, &shared, &lockmode,
-                               directio);
+               error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+                               &lockmode, flags & IOMAP_DIRECT);
                if (error)
                        goto out_unlock;
-
-               /*
-                * For buffered writes we need to report the address of the
-                * previous block (if there was any) so that the higher level
-                * write code can perform read-modify-write operations; we
-                * won't need the CoW fork mapping until writeback.  For direct
-                * I/O, which must be block aligned, we need to report the
-                * newly allocated address.  If the data fork has a hole, copy
-                * the COW fork mapping to avoid allocating to the data fork.
-                */
-               if (directio || imap.br_startblock == HOLESTARTBLOCK)
-                       imap = cmap;
-
+               if (shared)
+                       goto out_found_cow;
                end_fsb = imap.br_startoff + imap.br_blockcount;
                length = XFS_FSB_TO_B(mp, end_fsb) - offset;
        }
 
-       /* Don't need to allocate over holes when doing zeroing operations. */
-       if (flags & IOMAP_ZERO)
-               goto out_found;
+       if (imap_needs_alloc(inode, flags, &imap, nimaps))
+               goto allocate_blocks;
 
-       if (!imap_needs_alloc(inode, &imap, nimaps))
-               goto out_found;
+       xfs_iunlock(ip, lockmode);
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
 
-       /* If nowait is set bail since we are going to make allocations. */
-       if (flags & IOMAP_NOWAIT) {
-               error = -EAGAIN;
+allocate_blocks:
+       error = -EAGAIN;
+       if (flags & IOMAP_NOWAIT)
                goto out_unlock;
-       }
 
        /*
         * We cap the maximum length we map to a sane size  to keep the chunks
@@ -1041,57 +791,273 @@ xfs_file_iomap_begin(
         * lower level functions are updated.
         */
        length = min_t(loff_t, length, 1024 * PAGE_SIZE);
+       end_fsb = xfs_iomap_end_fsb(mp, offset, length);
 
-       /*
-        * xfs_iomap_write_direct() expects the shared lock. It is unlocked on
-        * return.
-        */
-       if (lockmode == XFS_ILOCK_EXCL)
-               xfs_ilock_demote(ip, lockmode);
-       error = xfs_iomap_write_direct(ip, offset, length, &imap,
-                       nimaps);
+       if (offset + length > XFS_ISIZE(ip))
+               end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb);
+       else if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
+               end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+       xfs_iunlock(ip, lockmode);
+
+       error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+                       &imap);
        if (error)
                return error;
 
-       iomap_flags |= IOMAP_F_NEW;
        trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags | IOMAP_F_NEW);
+
+out_found_cow:
+       xfs_iunlock(ip, lockmode);
+       length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+       trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+       if (imap.br_startblock != HOLESTARTBLOCK) {
+               error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+               if (error)
+                       return error;
+       }
+       return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+}
+
+const struct iomap_ops xfs_direct_write_iomap_ops = {
+       .iomap_begin            = xfs_direct_write_iomap_begin,
+};
+
+static int
+xfs_buffered_write_iomap_begin(
+       struct inode            *inode,
+       loff_t                  offset,
+       loff_t                  count,
+       unsigned                flags,
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+       struct xfs_bmbt_irec    imap, cmap;
+       struct xfs_iext_cursor  icur, ccur;
+       xfs_fsblock_t           prealloc_blocks = 0;
+       bool                    eof = false, cow_eof = false, shared = false;
+       int                     allocfork = XFS_DATA_FORK;
+       int                     error = 0;
+
+       /* we can't use delayed allocations when using extent size hints */
+       if (xfs_get_extsz_hint(ip))
+               return xfs_direct_write_iomap_begin(inode, offset, count,
+                               flags, iomap, srcmap);
+
+       ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ip, XFS_DATA_FORK)) ||
+           XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+               error = -EFSCORRUPTED;
+               goto out_unlock;
+       }
+
+       XFS_STATS_INC(mp, xs_blk_mapw);
+
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+               error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+               if (error)
+                       goto out_unlock;
+       }
 
-out_finish:
        /*
-        * Writes that span EOF might trigger an IO size update on completion,
-        * so consider them to be dirty for the purposes of O_DSYNC even if
-        * there is no other metadata changes pending or have been made here.
+        * Search the data fork first to look up our source mapping.  We
+        * always need the data fork map, as we have to return it to the
+        * iomap code so that the higher level write code can read data in to
+        * perform read-modify-write cycles for unaligned writes.
         */
-       if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
-               iomap_flags |= IOMAP_F_DIRTY;
-       if (shared)
-               iomap_flags |= IOMAP_F_SHARED;
-       return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
+       eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap);
+       if (eof)
+               imap.br_startoff = end_fsb; /* fake hole until the end */
 
-out_found:
-       ASSERT(nimaps);
-       xfs_iunlock(ip, lockmode);
-       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
-       goto out_finish;
+       /* We never need to allocate blocks for zeroing a hole. */
+       if ((flags & IOMAP_ZERO) && imap.br_startoff > offset_fsb) {
+               xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff);
+               goto out_unlock;
+       }
+
+       /*
+        * Search the COW fork extent list even if we did not find a data fork
+        * extent.  This serves two purposes: first, this implements the
+        * speculative preallocation using cowextsize, so that we also unshare
+        * blocks adjacent to shared blocks instead of just the shared blocks
+        * themselves.  Second, the lookup in the extent list is generally faster
+        * than going out to the shared extent tree.
+        */
+       if (xfs_is_cow_inode(ip)) {
+               if (!ip->i_cowfp) {
+                       ASSERT(!xfs_is_reflink_inode(ip));
+                       xfs_ifork_init_cow(ip);
+               }
+               cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
+                               &ccur, &cmap);
+               if (!cow_eof && cmap.br_startoff <= offset_fsb) {
+                       trace_xfs_reflink_cow_found(ip, &cmap);
+                       goto found_cow;
+               }
+       }
+
+       if (imap.br_startoff <= offset_fsb) {
+               /*
+                * For reflink files we may need a delalloc reservation when
+                * overwriting shared extents.   This includes zeroing of
+                * existing extents that contain data.
+                */
+               if (!xfs_is_cow_inode(ip) ||
+                   ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto found_imap;
+               }
+
+               xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+
+               /* Trim the mapping to the nearest shared extent boundary. */
+               error = xfs_inode_need_cow(ip, &imap, &shared);
+               if (error)
+                       goto out_unlock;
+
+               /* Not shared?  Just report the (potentially capped) extent. */
+               if (!shared) {
+                       trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
+                                       &imap);
+                       goto found_imap;
+               }
+
+               /*
+                * Fork all the shared blocks from our write offset until the
+                * end of the extent.
+                */
+               allocfork = XFS_COW_FORK;
+               end_fsb = imap.br_startoff + imap.br_blockcount;
+       } else {
+               /*
+                * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
+                * pages to keep the chunks of work done here somewhat
+                * symmetric with the work writeback does.  This is a completely
+                * arbitrary number pulled out of thin air.
+                *
+                * Note that the value needs to be less than 32-bits wide until
+                * the lower level functions are updated.
+                */
+               count = min_t(loff_t, count, 1024 * PAGE_SIZE);
+               end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+
+               if (xfs_is_always_cow_inode(ip))
+                       allocfork = XFS_COW_FORK;
+       }
+
+       error = xfs_qm_dqattach_locked(ip, false);
+       if (error)
+               goto out_unlock;
+
+       if (eof) {
+               prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset,
+                               count, &icur);
+               if (prealloc_blocks) {
+                       xfs_extlen_t    align;
+                       xfs_off_t       end_offset;
+                       xfs_fileoff_t   p_end_fsb;
+
+                       end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1);
+                       p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) +
+                                       prealloc_blocks;
+
+                       align = xfs_eof_alignment(ip);
+                       if (align)
+                               p_end_fsb = roundup_64(p_end_fsb, align);
+
+                       p_end_fsb = min(p_end_fsb,
+                               XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+                       ASSERT(p_end_fsb > offset_fsb);
+                       prealloc_blocks = p_end_fsb - end_fsb;
+               }
+       }
+
+retry:
+       error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+                       end_fsb - offset_fsb, prealloc_blocks,
+                       allocfork == XFS_DATA_FORK ? &imap : &cmap,
+                       allocfork == XFS_DATA_FORK ? &icur : &ccur,
+                       allocfork == XFS_DATA_FORK ? eof : cow_eof);
+       switch (error) {
+       case 0:
+               break;
+       case -ENOSPC:
+       case -EDQUOT:
+               /* retry without any preallocation */
+               trace_xfs_delalloc_enospc(ip, offset, count);
+               if (prealloc_blocks) {
+                       prealloc_blocks = 0;
+                       goto retry;
+               }
+               /*FALLTHRU*/
+       default:
+               goto out_unlock;
+       }
+
+       if (allocfork == XFS_COW_FORK) {
+               trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
+               goto found_cow;
+       }
+
+       /*
+        * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+        * them out if the write happens to fail.
+        */
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, IOMAP_F_NEW);
+
+found_imap:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
+
+found_cow:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       if (imap.br_startoff <= offset_fsb) {
+               error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
+               if (error)
+                       return error;
+       } else {
+               xfs_trim_extent(&cmap, offset_fsb,
+                               imap.br_startoff - offset_fsb);
+       }
+       return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
 
 out_unlock:
-       xfs_iunlock(ip, lockmode);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
        return error;
 }
 
 static int
-xfs_file_iomap_end_delalloc(
-       struct xfs_inode        *ip,
+xfs_buffered_write_iomap_end(
+       struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
        ssize_t                 written,
+       unsigned                flags,
        struct iomap            *iomap)
 {
+       struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        xfs_fileoff_t           start_fsb;
        xfs_fileoff_t           end_fsb;
        int                     error = 0;
 
+       if (iomap->type != IOMAP_DELALLOC)
+               return 0;
+
        /*
         * Behave as if the write failed if drop writes is enabled. Set the NEW
         * flag to force delalloc cleanup.
@@ -1136,24 +1102,51 @@ xfs_file_iomap_end_delalloc(
        return 0;
 }
 
+const struct iomap_ops xfs_buffered_write_iomap_ops = {
+       .iomap_begin            = xfs_buffered_write_iomap_begin,
+       .iomap_end              = xfs_buffered_write_iomap_end,
+};
+
 static int
-xfs_file_iomap_end(
+xfs_read_iomap_begin(
        struct inode            *inode,
        loff_t                  offset,
        loff_t                  length,
-       ssize_t                 written,
        unsigned                flags,
-       struct iomap            *iomap)
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
 {
-       if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
-               return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
-                               length, written, iomap);
-       return 0;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_bmbt_irec    imap;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       xfs_fileoff_t           end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+       int                     nimaps = 1, error = 0;
+       bool                    shared = false;
+       unsigned                lockmode;
+
+       ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+       if (error)
+               return error;
+       error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                              &nimaps, 0);
+       if (!error && (flags & IOMAP_REPORT))
+               error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
+       xfs_iunlock(ip, lockmode);
+
+       if (error)
+               return error;
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, shared ? IOMAP_F_SHARED : 0);
 }
 
-const struct iomap_ops xfs_iomap_ops = {
-       .iomap_begin            = xfs_file_iomap_begin,
-       .iomap_end              = xfs_file_iomap_end,
+const struct iomap_ops xfs_read_iomap_ops = {
+       .iomap_begin            = xfs_read_iomap_begin,
 };
 
 static int
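
With the single xfs_iomap_ops split three ways, each caller now names the I/O
path it is on. The xfs_file.c call sites are outside this diff, so the
following is only a hedged sketch of the expected wiring against the v5.5-era
iomap API (helper names such as xfs_dio_write_ops are assumptions here):

	/* direct write */
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, is_sync_kiocb(iocb));

	/* buffered write */
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);

	/* direct read (buffered reads go through the page cache) */
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
			   is_sync_kiocb(iocb));
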
@@ -1196,8 +1189,7 @@ xfs_seek_iomap_begin(
                /*
                 * Fake a hole until the end of the file.
                 */
-               data_fsb = min(XFS_B_TO_FSB(mp, offset + length),
-                              XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes));
+               data_fsb = xfs_iomap_end_fsb(mp, offset, length);
        }
 
        /*