diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a7ca65177980087f42632ae99b563e3ecde3e2c9..28f28de0c1b67e116228f3413bd322bb681f5074 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -163,32 +163,6 @@ int ext4_inode_is_fast_symlink(struct inode *inode)
               (inode->i_size < EXT4_N_BLOCKS * 4);
 }
 
-/*
- * Restart the transaction associated with *handle.  This does a commit,
- * so before we call here everything must be consistently dirtied against
- * this transaction.
- */
-int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
-                                int nblocks)
-{
-       int ret;
-
-       /*
-        * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
-        * moment, get_block can be called only for blocks inside i_size since
-        * page cache has been already dropped and writes are blocked by
-        * i_mutex. So we can safely drop the i_data_sem here.
-        */
-       BUG_ON(EXT4_JOURNAL(inode) == NULL);
-       jbd_debug(2, "restarting handle %p\n", handle);
-       up_write(&EXT4_I(inode)->i_data_sem);
-       ret = ext4_journal_restart(handle, nblocks);
-       down_write(&EXT4_I(inode)->i_data_sem);
-       ext4_discard_preallocations(inode);
-
-       return ret;
-}
-
 /*
  * Called at the last iput() if i_nlink is zero.
  */
@@ -196,7 +170,12 @@ void ext4_evict_inode(struct inode *inode)
 {
        handle_t *handle;
        int err;
-       int extra_credits = 3;
+       /*
+        * Credits for final inode cleanup and freeing:
+        * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+        * (xattr block freeing), bitmap, group descriptor (inode freeing)
+        */
+       int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
 
        trace_ext4_evict_inode(inode);
@@ -252,8 +231,12 @@ void ext4_evict_inode(struct inode *inode)
        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
 
+       /*
+        * Block bitmap, group descriptor, and inode are accounted in both
+        * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+        */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
-                                ext4_blocks_for_truncate(inode)+extra_credits);
+                        ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
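
A worked example of the credit arithmetic above (illustrative only, not part of the patch), writing Q for EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb):

	extra_credits  = 6;   /* sb + inode (ext4_orphan_del()), bitmap + group
	                         descriptor (xattr block freeing), bitmap +
	                         group descriptor (inode freeing) */
	extra_credits += Q;   /* quota updates, skipped for quota files */
	credits = ext4_blocks_for_truncate(inode) + extra_credits - 3;
	                      /* bitmap, group descriptor and inode are already
	                         counted by ext4_blocks_for_truncate(), so they
	                         would otherwise be charged twice */
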
@@ -826,136 +809,6 @@ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
-/*
- * Get blocks function for the cases that need to start a transaction -
- * generally difference cases of direct IO and DAX IO. It also handles retries
- * in case of ENOSPC.
- */
-static int ext4_get_block_trans(struct inode *inode, sector_t iblock,
-                               struct buffer_head *bh_result, int flags)
-{
-       int dio_credits;
-       handle_t *handle;
-       int retries = 0;
-       int ret;
-
-       /* Trim mapping request to maximum we can map at once for DIO */
-       if (bh_result->b_size >> inode->i_blkbits > DIO_MAX_BLOCKS)
-               bh_result->b_size = DIO_MAX_BLOCKS << inode->i_blkbits;
-       dio_credits = ext4_chunk_trans_blocks(inode,
-                                     bh_result->b_size >> inode->i_blkbits);
-retry:
-       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
-       if (IS_ERR(handle))
-               return PTR_ERR(handle);
-
-       ret = _ext4_get_block(inode, iblock, bh_result, flags);
-       ext4_journal_stop(handle);
-
-       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-               goto retry;
-       return ret;
-}
-
-/* Get block function for DIO reads and writes to inodes without extents */
-int ext4_dio_get_block(struct inode *inode, sector_t iblock,
-                      struct buffer_head *bh, int create)
-{
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       if (!create)
-               return _ext4_get_block(inode, iblock, bh, 0);
-       return ext4_get_block_trans(inode, iblock, bh, EXT4_GET_BLOCKS_CREATE);
-}
-
-/*
- * Get block function for AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete.
- */
-static int ext4_dio_get_block_unwritten_async(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * When doing DIO using unwritten extents, we need io_end to convert
-        * unwritten extents to written on IO completion. We allocate io_end
-        * once we spot unwritten extent and store it in b_private. Generic
-        * DIO code keeps b_private set and furthermore passes the value to
-        * our completion callback in 'private' argument.
-        */
-       if (!ret && buffer_unwritten(bh_result)) {
-               if (!bh_result->b_private) {
-                       ext4_io_end_t *io_end;
-
-                       io_end = ext4_init_io_end(inode, GFP_KERNEL);
-                       if (!io_end)
-                               return -ENOMEM;
-                       bh_result->b_private = io_end;
-                       ext4_set_io_unwritten_flag(inode, io_end);
-               }
-               set_buffer_defer_completion(bh_result);
-       }
-
-       return ret;
-}
-
-/*
- * Get block function for non-AIO DIO writes when we create unwritten extent if
- * blocks are not allocated yet. The extent will be converted to written
- * after IO is complete by ext4_direct_IO_write().
- */
-static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
-               sector_t iblock, struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = ext4_get_block_trans(inode, iblock, bh_result,
-                                  EXT4_GET_BLOCKS_IO_CREATE_EXT);
-
-       /*
-        * Mark inode as having pending DIO writes to unwritten extents.
-        * ext4_direct_IO_write() checks this flag and converts extents to
-        * written.
-        */
-       if (!ret && buffer_unwritten(bh_result))
-               ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-
-       return ret;
-}
-
-static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
-                  struct buffer_head *bh_result, int create)
-{
-       int ret;
-
-       ext4_debug("ext4_dio_get_block_overwrite: inode %lu, create flag %d\n",
-                  inode->i_ino, create);
-       /* We don't expect handle for direct IO */
-       WARN_ON_ONCE(ext4_journal_current_handle());
-
-       ret = _ext4_get_block(inode, iblock, bh_result, 0);
-       /*
-        * Blocks should have been preallocated! ext4_file_write_iter() checks
-        * that.
-        */
-       WARN_ON_ONCE(!buffer_mapped(bh_result) || buffer_unwritten(bh_result));
-
-       return ret;
-}
-
-
 /*
  * `handle' can be NULL if create is zero
  */
@@ -2340,6 +2193,79 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
        return lblk < blocks;
 }
 
+/*
+ * mpage_process_page - update page buffers corresponding to a changed extent;
+ *                    may submit a fully mapped page for IO
+ *
+ * @mpd        - description of extent to map, on return next extent to map
+ * @page       - page whose buffers are to be processed
+ * @m_lblk     - logical block mapping.
+ * @m_pblk     - corresponding physical mapping.
+ * @map_bh     - set to true on return if this page requires further mapping
+ * Scan the page's buffers corresponding to the changed extent and update the
+ * buffer state according to the new extent state: map delalloc buffers to
+ * their physical location and clear their unwritten bits. If the page is not
+ * fully mapped, update @mpd->map to the next extent in the page that needs
+ * mapping and return @map_bh as true.
+ */
+static int mpage_process_page(struct mpage_da_data *mpd, struct page *page,
+                             ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
+                             bool *map_bh)
+{
+       struct buffer_head *head, *bh;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       ext4_lblk_t lblk = *m_lblk;
+       ext4_fsblk_t pblock = *m_pblk;
+       int err = 0;
+       int blkbits = mpd->inode->i_blkbits;
+       ssize_t io_end_size = 0;
+       struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);
+
+       bh = head = page_buffers(page);
+       do {
+               if (lblk < mpd->map.m_lblk)
+                       continue;
+               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
+                       /*
+                        * Buffer after end of mapped extent.
+                        * Find next buffer in the page to map.
+                        */
+                       mpd->map.m_len = 0;
+                       mpd->map.m_flags = 0;
+                       io_end_vec->size += io_end_size;
+                       io_end_size = 0;
+
+                       err = mpage_process_page_bufs(mpd, head, bh, lblk);
+                       if (err > 0)
+                               err = 0;
+                       if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
+                               io_end_vec = ext4_alloc_io_end_vec(io_end);
+                               if (IS_ERR(io_end_vec)) {
+                                       err = PTR_ERR(io_end_vec);
+                                       goto out;
+                               }
+                               io_end_vec->offset = mpd->map.m_lblk << blkbits;
+                       }
+                       *map_bh = true;
+                       goto out;
+               }
+               if (buffer_delay(bh)) {
+                       clear_buffer_delay(bh);
+                       bh->b_blocknr = pblock++;
+               }
+               clear_buffer_unwritten(bh);
+               io_end_size += (1 << blkbits);
+       } while (lblk++, (bh = bh->b_this_page) != head);
+
+       io_end_vec->size += io_end_size;
+       io_end_size = 0;
+       *map_bh = false;
+out:
+       *m_lblk = lblk;
+       *m_pblk = pblock;
+       return err;
+}
+
 /*
  * mpage_map_buffers - update buffers corresponding to changed extent and
  *                    submit fully mapped pages for IO
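
Both mpage_process_page() above and the loop it replaces below walk the circular list of buffer_heads attached to a page. For readers unfamiliar with the pattern, a minimal sketch — page_buffers() and b_this_page are the real kernel names, the rest is hypothetical scaffolding:

	/* page_buffers() returns the page's first buffer_head; b_this_page
	 * links the buffers into a cycle, so this visits each block-sized
	 * piece of the page exactly once. */
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh = head;
	ext4_lblk_t lblk = first_block_of_page;	/* hypothetical */

	do {
		/* ... inspect or update bh for logical block lblk ... */
	} while (lblk++, (bh = bh->b_this_page) != head);
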
@@ -2359,12 +2285,12 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
        struct pagevec pvec;
        int nr_pages, i;
        struct inode *inode = mpd->inode;
-       struct buffer_head *head, *bh;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
-       sector_t pblock;
+       ext4_fsblk_t pblock;
        int err;
+       bool map_bh = false;
 
        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
@@ -2380,50 +2306,19 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
 
-                       bh = head = page_buffers(page);
-                       do {
-                               if (lblk < mpd->map.m_lblk)
-                                       continue;
-                               if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
-                                       /*
-                                        * Buffer after end of mapped extent.
-                                        * Find next buffer in the page to map.
-                                        */
-                                       mpd->map.m_len = 0;
-                                       mpd->map.m_flags = 0;
-                                       /*
-                                        * FIXME: If dioread_nolock supports
-                                        * blocksize < pagesize, we need to make
-                                        * sure we add size mapped so far to
-                                        * io_end->size as the following call
-                                        * can submit the page for IO.
-                                        */
-                                       err = mpage_process_page_bufs(mpd, head,
-                                                                     bh, lblk);
-                                       pagevec_release(&pvec);
-                                       if (err > 0)
-                                               err = 0;
-                                       return err;
-                               }
-                               if (buffer_delay(bh)) {
-                                       clear_buffer_delay(bh);
-                                       bh->b_blocknr = pblock++;
-                               }
-                               clear_buffer_unwritten(bh);
-                       } while (lblk++, (bh = bh->b_this_page) != head);
-
+                       err = mpage_process_page(mpd, page, &lblk, &pblock,
+                                                &map_bh);
                        /*
-                        * FIXME: This is going to break if dioread_nolock
-                        * supports blocksize < pagesize as we will try to
-                        * convert potentially unmapped parts of inode.
+                        * If map_bh is true, this page may require further bh
+                        * mapping, or it may have been submitted for IO, so we
+                        * return to let the caller map the next extent.
                         */
-                       mpd->io_submit.io_end->size += PAGE_SIZE;
+                       if (err < 0 || map_bh == true)
+                               goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_page(mpd, page);
-                       if (err < 0) {
-                               pagevec_release(&pvec);
-                               return err;
-                       }
+                       if (err < 0)
+                               goto out;
                }
                pagevec_release(&pvec);
        }
@@ -2431,6 +2326,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
+out:
+       pagevec_release(&pvec);
+       return err;
 }
 
 static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
@@ -2510,9 +2408,13 @@ static int mpage_map_and_submit_extent(handle_t *handle,
        int err;
        loff_t disksize;
        int progress = 0;
+       ext4_io_end_t *io_end = mpd->io_submit.io_end;
+       struct ext4_io_end_vec *io_end_vec;
 
-       mpd->io_submit.io_end->offset =
-                               ((loff_t)map->m_lblk) << inode->i_blkbits;
+       io_end_vec = ext4_alloc_io_end_vec(io_end);
+       if (IS_ERR(io_end_vec))
+               return PTR_ERR(io_end_vec);
+       io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
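
For reference, the ext4_io_end_vec allocated above is the per-extent record that this series threads onto an io_end, so that several disjoint extents submitted in one writeback pass can each be converted once the IO completes. Its rough shape, stated as an assumption from the rest of the series (the authoritative definition lives in fs/ext4/ext4.h, outside this diff):

	struct ext4_io_end_vec {
		struct list_head list;	/* queued on the parent io_end */
		loff_t offset;		/* file offset covered by this extent */
		ssize_t size;		/* bytes covered by this extent */
	};
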
@@ -3406,473 +3308,235 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
        return inode->i_state & I_DIRTY_DATASYNC;
 }
 
-static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-                           unsigned flags, struct iomap *iomap)
+static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
+                          struct ext4_map_blocks *map, loff_t offset,
+                          loff_t length)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned int blkbits = inode->i_blkbits;
-       unsigned long first_block, last_block;
-       struct ext4_map_blocks map;
-       bool delalloc = false;
-       int ret;
-
-       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
-               return -EINVAL;
-       first_block = offset >> blkbits;
-       last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
-                          EXT4_MAX_LOGICAL_BLOCK);
-
-       if (flags & IOMAP_REPORT) {
-               if (ext4_has_inline_data(inode)) {
-                       ret = ext4_inline_data_iomap(inode, iomap);
-                       if (ret != -EAGAIN) {
-                               if (ret == 0 && offset >= iomap->length)
-                                       ret = -ENOENT;
-                               return ret;
-                       }
-               }
-       } else {
-               if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
-                       return -ERANGE;
-       }
-
-       map.m_lblk = first_block;
-       map.m_len = last_block - first_block + 1;
-
-       if (flags & IOMAP_REPORT) {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-
-               if (ret == 0) {
-                       ext4_lblk_t end = map.m_lblk + map.m_len - 1;
-                       struct extent_status es;
-
-                       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
-                                                 map.m_lblk, end, &es);
-
-                       if (!es.es_len || es.es_lblk > end) {
-                               /* entire range is a hole */
-                       } else if (es.es_lblk > map.m_lblk) {
-                               /* range starts with a hole */
-                               map.m_len = es.es_lblk - map.m_lblk;
-                       } else {
-                               ext4_lblk_t offs = 0;
-
-                               if (es.es_lblk < map.m_lblk)
-                                       offs = map.m_lblk - es.es_lblk;
-                               map.m_lblk = es.es_lblk + offs;
-                               map.m_len = es.es_len - offs;
-                               delalloc = true;
-                       }
-               }
-       } else if (flags & IOMAP_WRITE) {
-               int dio_credits;
-               handle_t *handle;
-               int retries = 0;
-
-               /* Trim mapping request to maximum we can map at once for DIO */
-               if (map.m_len > DIO_MAX_BLOCKS)
-                       map.m_len = DIO_MAX_BLOCKS;
-               dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
-retry:
-               /*
-                * Either we allocate blocks and then we don't get unwritten
-                * extent so we have reserved enough credits, or the blocks
-                * are already allocated and unwritten and in that case
-                * extent conversion fits in the credits as well.
-                */
-               handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
-                                           dio_credits);
-               if (IS_ERR(handle))
-                       return PTR_ERR(handle);
-
-               ret = ext4_map_blocks(handle, inode, &map,
-                                     EXT4_GET_BLOCKS_CREATE_ZERO);
-               if (ret < 0) {
-                       ext4_journal_stop(handle);
-                       if (ret == -ENOSPC &&
-                           ext4_should_retry_alloc(inode->i_sb, &retries))
-                               goto retry;
-                       return ret;
-               }
-
-               /*
-                * If we added blocks beyond i_size, we need to make sure they
-                * will get truncated if we crash before updating i_size in
-                * ext4_iomap_end(). For faults we don't need to do that (and
-                * even cannot because for orphan list operations inode_lock is
-                * required) - if we happen to instantiate block beyond i_size,
-                * it is because we race with truncate which has already added
-                * the inode to the orphan list.
-                */
-               if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
-                   (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
-                       int err;
-
-                       err = ext4_orphan_add(handle, inode);
-                       if (err < 0) {
-                               ext4_journal_stop(handle);
-                               return err;
-                       }
-               }
-               ext4_journal_stop(handle);
-       } else {
-               ret = ext4_map_blocks(NULL, inode, &map, 0);
-               if (ret < 0)
-                       return ret;
-       }
+       u8 blkbits = inode->i_blkbits;
 
+       /*
+        * Writes that span EOF might trigger an I/O size update on completion,
+        * so consider them to be dirty for the purpose of O_DSYNC, even if
+        * there are no other metadata changes being made or pending.
+        */
        iomap->flags = 0;
-       if (ext4_inode_datasync_dirty(inode))
+       if (ext4_inode_datasync_dirty(inode) ||
+           offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;
+
+       if (map->m_flags & EXT4_MAP_NEW)
+               iomap->flags |= IOMAP_F_NEW;
+
        iomap->bdev = inode->i_sb->s_bdev;
-       iomap->dax_dev = sbi->s_daxdev;
-       iomap->offset = (u64)first_block << blkbits;
-       iomap->length = (u64)map.m_len << blkbits;
+       iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
+       iomap->offset = (u64) map->m_lblk << blkbits;
+       iomap->length = (u64) map->m_len << blkbits;
 
-       if (ret == 0) {
-               iomap->type = delalloc ? IOMAP_DELALLOC : IOMAP_HOLE;
-               iomap->addr = IOMAP_NULL_ADDR;
+       /*
+        * Flags passed to ext4_map_blocks() for direct I/O writes can result
+        * in m_flags having both the EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN
+        * bits set. In order for any allocated unwritten extents to be
+        * converted into written extents correctly within the ->end_io()
+        * handler, we need to ensure that the iomap->type is set
+        * appropriately. Hence we check whether the EXT4_MAP_UNWRITTEN bit
+        * has been set first.
+        */
+       if (map->m_flags & EXT4_MAP_UNWRITTEN) {
+               iomap->type = IOMAP_UNWRITTEN;
+               iomap->addr = (u64) map->m_pblk << blkbits;
+       } else if (map->m_flags & EXT4_MAP_MAPPED) {
+               iomap->type = IOMAP_MAPPED;
+               iomap->addr = (u64) map->m_pblk << blkbits;
        } else {
-               if (map.m_flags & EXT4_MAP_MAPPED) {
-                       iomap->type = IOMAP_MAPPED;
-               } else if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-                       iomap->type = IOMAP_UNWRITTEN;
-               } else {
-                       WARN_ON_ONCE(1);
-                       return -EIO;
-               }
-               iomap->addr = (u64)map.m_pblk << blkbits;
+               iomap->type = IOMAP_HOLE;
+               iomap->addr = IOMAP_NULL_ADDR;
        }
-
-       if (map.m_flags & EXT4_MAP_NEW)
-               iomap->flags |= IOMAP_F_NEW;
-
-       return 0;
 }
 
-static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
-                         ssize_t written, unsigned flags, struct iomap *iomap)
+static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
+                           unsigned int flags)
 {
-       int ret = 0;
        handle_t *handle;
-       int blkbits = inode->i_blkbits;
-       bool truncate = false;
+       u8 blkbits = inode->i_blkbits;
+       int ret, dio_credits, m_flags = 0, retries = 0;
 
-       if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
-               return 0;
-
-       handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto orphan_del;
-       }
-       if (ext4_update_inode_size(inode, offset + written))
-               ext4_mark_inode_dirty(handle, inode);
        /*
-        * We may need to truncate allocated but not written blocks beyond EOF.
+        * Trim the mapping request to the maximum value that we can map at
+        * once for direct I/O.
         */
-       if (iomap->offset + iomap->length > 
-           ALIGN(inode->i_size, 1 << blkbits)) {
-               ext4_lblk_t written_blk, end_blk;
+       if (map->m_len > DIO_MAX_BLOCKS)
+               map->m_len = DIO_MAX_BLOCKS;
+       dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
 
-               written_blk = (offset + written) >> blkbits;
-               end_blk = (offset + length) >> blkbits;
-               if (written_blk < end_blk && ext4_can_truncate(inode))
-                       truncate = true;
-       }
+retry:
        /*
-        * Remove inode from orphan list if we were extending a inode and
-        * everything went fine.
+        * Either we allocate blocks and then don't get an unwritten extent, so
+        * in that case we have reserved enough credits. Or, the blocks are
+        * already allocated and unwritten. In that case, the extent conversion
+        * fits into the credits as well.
         */
-       if (!truncate && inode->i_nlink &&
-           !list_empty(&EXT4_I(inode)->i_orphan))
-               ext4_orphan_del(handle, inode);
-       ext4_journal_stop(handle);
-       if (truncate) {
-               ext4_truncate_failed_write(inode);
-orphan_del:
-               /*
-                * If truncate failed early the inode might still be on the
-                * orphan list; we need to make sure the inode is removed from
-                * the orphan list in that case.
-                */
-               if (inode->i_nlink)
-                       ext4_orphan_del(NULL, inode);
-       }
-       return ret;
-}
-
-const struct iomap_ops ext4_iomap_ops = {
-       .iomap_begin            = ext4_iomap_begin,
-       .iomap_end              = ext4_iomap_end,
-};
-
-static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                           ssize_t size, void *private)
-{
-        ext4_io_end_t *io_end = private;
+       handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits);
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
 
-       /* if not async direct IO just return */
-       if (!io_end)
-               return 0;
+       /*
+        * DAX and direct I/O are the only two operations that are currently
+        * supported with IOMAP_WRITE.
+        */
+       WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
+       if (IS_DAX(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+       /*
+        * We use i_size instead of i_disksize here because delalloc writeback
+        * can complete at any point during the I/O and subsequently push the
+        * i_disksize out to i_size. This could be beyond where direct I/O is
+        * happening and thus expose allocated blocks to direct I/O reads.
+        */
+       else if ((map->m_lblk * (1 << blkbits)) >= i_size_read(inode))
+               m_flags = EXT4_GET_BLOCKS_CREATE;
+       else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+               m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
 
-       ext_debug("ext4_end_io_dio(): io_end 0x%p "
-                 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-                 io_end, io_end->inode->i_ino, iocb, offset, size);
+       ret = ext4_map_blocks(handle, inode, map, m_flags);
 
        /*
-        * Error during AIO DIO. We cannot convert unwritten extents as the
-        * data was not written. Just clear the unwritten flag and drop io_end.
+        * We cannot fill holes in indirect tree based inodes as that could
+        * expose stale data in the case of a crash. Use the magic error code
+        * to fall back to buffered I/O.
         */
-       if (size <= 0) {
-               ext4_clear_io_unwritten_flag(io_end);
-               size = 0;
-       }
-       io_end->offset = offset;
-       io_end->size = size;
-       ext4_put_io_end(io_end);
+       if (!m_flags && !ret)
+               ret = -ENOTBLK;
 
-       return 0;
+       ext4_journal_stop(handle);
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
+
+       return ret;
 }
 
-/*
- * Handling of direct IO writes.
- *
- * For ext4 extent files, ext4 will do direct-io write even to holes,
- * preallocated extents, and those write extend the file, no need to
- * fall back to buffered IO.
- *
- * For holes, we fallocate those blocks, mark them as unwritten
- * If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as unwritten.
- *
- * The unwritten extents will be converted to written when DIO is completed.
- * For async direct IO, since the IO may still pending when return, we
- * set up an end_io call back function, which will do the conversion
- * when async direct IO completed.
- *
- * If the O_DIRECT write will extend the file then add this inode to the
- * orphan list.  So recovery will truncate it back to the original size
- * if the machine crashes during the write.
- *
- */
-static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
+
+static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       struct ext4_inode_info *ei = EXT4_I(inode);
-       ssize_t ret;
-       loff_t offset = iocb->ki_pos;
-       size_t count = iov_iter_count(iter);
-       int overwrite = 0;
-       get_block_t *get_block_func = NULL;
-       int dio_flags = 0;
-       loff_t final_size = offset + count;
-       int orphan = 0;
-       handle_t *handle;
+       int ret;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
 
-       if (final_size > inode->i_size || final_size > ei->i_disksize) {
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
-               ret = ext4_orphan_add(handle, inode);
-               if (ret) {
-                       ext4_journal_stop(handle);
-                       goto out;
-               }
-               orphan = 1;
-               ext4_update_i_disksize(inode, inode->i_size);
-               ext4_journal_stop(handle);
-       }
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
 
-       BUG_ON(iocb->private == NULL);
+       if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+               return -ERANGE;
 
        /*
-        * Make all waiters for direct IO properly wait also for extent
-        * conversion. This also disallows race between truncate() and
-        * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
+        * Calculate the first and last logical blocks respectively.
         */
-       inode_dio_begin(inode);
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+
+       if (flags & IOMAP_WRITE)
+               ret = ext4_iomap_alloc(inode, &map, flags);
+       else
+               ret = ext4_map_blocks(NULL, inode, &map, 0);
+
+       if (ret < 0)
+               return ret;
 
-       /* If we do a overwrite dio, i_mutex locking can be released */
-       overwrite = *((int *)iocb->private);
+       ext4_set_iomap(inode, iomap, &map, offset, length);
 
-       if (overwrite)
-               inode_unlock(inode);
+       return 0;
+}
 
+static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+                         ssize_t written, unsigned flags, struct iomap *iomap)
+{
        /*
-        * For extent mapped files we could direct write to holes and fallocate.
-        *
-        * Allocated blocks to fill the hole are marked as unwritten to prevent
-        * parallel buffered read to expose the stale data before DIO complete
-        * the data IO.
-        *
-        * As to previously fallocated extents, ext4 get_block will just simply
-        * mark the buffer mapped but still keep the extents unwritten.
-        *
-        * For non AIO case, we will convert those unwritten extents to written
-        * after return back from blockdev_direct_IO. That way we save us from
-        * allocating io_end structure and also the overhead of offloading
-        * the extent convertion to a workqueue.
-        *
-        * For async DIO, the conversion needs to be deferred when the
-        * IO is completed. The ext4 end_io callback function will be
-        * called to take care of the conversion work.  Here for async
-        * case, we allocate an io_end structure to hook to the iocb.
+        * Check to see whether an error occurred while writing out the data to
+        * the allocated blocks. If so, return the magic error code so that we
+        * fall back to buffered I/O and attempt to complete the remainder of
+        * the I/O. Any blocks that may have been allocated in preparation for
+        * the direct I/O will be reused during buffered I/O.
         */
-       iocb->private = NULL;
-       if (overwrite)
-               get_block_func = ext4_dio_get_block_overwrite;
-       else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
-                  round_down(offset, i_blocksize(inode)) >= inode->i_size) {
-               get_block_func = ext4_dio_get_block;
-               dio_flags = DIO_LOCKING | DIO_SKIP_HOLES;
-       } else if (is_sync_kiocb(iocb)) {
-               get_block_func = ext4_dio_get_block_unwritten_sync;
-               dio_flags = DIO_LOCKING;
-       } else {
-               get_block_func = ext4_dio_get_block_unwritten_async;
-               dio_flags = DIO_LOCKING;
-       }
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-                                  get_block_func, ext4_end_io_dio, NULL,
-                                  dio_flags);
+       if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+               return -ENOTBLK;
 
-       if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
-                                               EXT4_STATE_DIO_UNWRITTEN)) {
-               int err;
-               /*
-                * for non AIO case, since the IO is already
-                * completed, we could do the conversion right here
-                */
-               err = ext4_convert_unwritten_extents(NULL, inode,
-                                                    offset, ret);
-               if (err < 0)
-                       ret = err;
-               ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
-       }
+       return 0;
+}
 
-       inode_dio_end(inode);
-       /* take i_mutex locking again if we do a ovewrite dio */
-       if (overwrite)
-               inode_lock(inode);
+const struct iomap_ops ext4_iomap_ops = {
+       .iomap_begin            = ext4_iomap_begin,
+       .iomap_end              = ext4_iomap_end,
+};
 
-       if (ret < 0 && final_size > inode->i_size)
-               ext4_truncate_failed_write(inode);
+static bool ext4_iomap_is_delalloc(struct inode *inode,
+                                  struct ext4_map_blocks *map)
+{
+       struct extent_status es;
+       ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
 
-       /* Handle extending of i_size after direct IO write */
-       if (orphan) {
-               int err;
+       ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
+                                 map->m_lblk, end, &es);
 
-               /* Credits for sb + inode write */
-               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
-               if (IS_ERR(handle)) {
-                       /*
-                        * We wrote the data but cannot extend
-                        * i_size. Bail out. In async io case, we do
-                        * not return error here because we have
-                        * already submmitted the corresponding
-                        * bio. Returning error here makes the caller
-                        * think that this IO is done and failed
-                        * resulting in race with bio's completion
-                        * handler.
-                        */
-                       if (!ret)
-                               ret = PTR_ERR(handle);
-                       if (inode->i_nlink)
-                               ext4_orphan_del(NULL, inode);
+       if (!es.es_len || es.es_lblk > end)
+               return false;
 
-                       goto out;
-               }
-               if (inode->i_nlink)
-                       ext4_orphan_del(handle, inode);
-               if (ret > 0) {
-                       loff_t end = offset + ret;
-                       if (end > inode->i_size || end > ei->i_disksize) {
-                               ext4_update_i_disksize(inode, end);
-                               if (end > inode->i_size)
-                                       i_size_write(inode, end);
-                               /*
-                                * We're going to return a positive `ret'
-                                * here due to non-zero-length I/O, so there's
-                                * no way of reporting error returns from
-                                * ext4_mark_inode_dirty() to userspace.  So
-                                * ignore it.
-                                */
-                               ext4_mark_inode_dirty(handle, inode);
-                       }
-               }
-               err = ext4_journal_stop(handle);
-               if (ret == 0)
-                       ret = err;
+       if (es.es_lblk > map->m_lblk) {
+               map->m_len = es.es_lblk - map->m_lblk;
+               return false;
        }
-out:
-       return ret;
-}
 
-static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct address_space *mapping = iocb->ki_filp->f_mapping;
-       struct inode *inode = mapping->host;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
+       offset = map->m_lblk - es.es_lblk;
+       map->m_len = es.es_len - offset;
 
-       /*
-        * Shared inode_lock is enough for us - it protects against concurrent
-        * writes & truncates and since we take care of writing back page cache,
-        * we are protected against page writeback as well.
-        */
-       inode_lock_shared(inode);
-       ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-                                          iocb->ki_pos + count - 1);
-       if (ret)
-               goto out_unlock;
-       ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-                                  iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-       inode_unlock_shared(inode);
-       return ret;
+       return true;
 }
 
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
+                                  loff_t length, unsigned int flags,
+                                  struct iomap *iomap, struct iomap *srcmap)
 {
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-       size_t count = iov_iter_count(iter);
-       loff_t offset = iocb->ki_pos;
-       ssize_t ret;
+       int ret;
+       bool delalloc = false;
+       struct ext4_map_blocks map;
+       u8 blkbits = inode->i_blkbits;
 
-#ifdef CONFIG_FS_ENCRYPTION
-       if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
-               return 0;
-#endif
-       if (fsverity_active(inode))
-               return 0;
+       if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+               return -EINVAL;
+
+       if (ext4_has_inline_data(inode)) {
+               ret = ext4_inline_data_iomap(inode, iomap);
+               if (ret != -EAGAIN) {
+                       if (ret == 0 && offset >= iomap->length)
+                               ret = -ENOENT;
+                       return ret;
+               }
+       }
 
        /*
-        * If we are doing data journalling we don't support O_DIRECT
+        * Calculate the first and last logical block respectively.
         */
-       if (ext4_should_journal_data(inode))
-               return 0;
+       map.m_lblk = offset >> blkbits;
+       map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+                         EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
 
-       /* Let buffer I/O handle the inline data case. */
-       if (ext4_has_inline_data(inode))
-               return 0;
+       ret = ext4_map_blocks(NULL, inode, &map, 0);
+       if (ret < 0)
+               return ret;
+       if (ret == 0)
+               delalloc = ext4_iomap_is_delalloc(inode, &map);
 
-       trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
-       if (iov_iter_rw(iter) == READ)
-               ret = ext4_direct_IO_read(iocb, iter);
-       else
-               ret = ext4_direct_IO_write(iocb, iter);
-       trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
-       return ret;
+       ext4_set_iomap(inode, iomap, &map, offset, length);
+       if (delalloc && iomap->type == IOMAP_HOLE)
+               iomap->type = IOMAP_DELALLOC;
+
+       return 0;
 }
 
+const struct iomap_ops ext4_iomap_report_ops = {
+       .iomap_begin = ext4_iomap_begin_report,
+};
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
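
With ->direct_IO reduced to a stub in the three address_space_operations hunks below, these two ops tables are what the rest of the kernel now drives. A hedged sketch of the typical consumers — the real ext4 call sites live in fs/ext4/file.c, outside this diff, and the signatures are the ones I believe were current around v5.5:

	/* Direct I/O: iomap_dio_rw() calls back into ext4_iomap_begin() and
	 * ext4_iomap_end() for each mapping it needs. */
	ret = iomap_dio_rw(iocb, iter, &ext4_iomap_ops, NULL,
			   is_sync_kiocb(iocb));

	/* SEEK_HOLE/SEEK_DATA: the read-only report ops, which never start a
	 * transaction, back the lseek helpers. */
	offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops);
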
@@ -3910,7 +3574,7 @@ static const struct address_space_operations ext4_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
@@ -3927,7 +3591,7 @@ static const struct address_space_operations ext4_journalled_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_journalled_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
 };
@@ -3943,7 +3607,7 @@ static const struct address_space_operations ext4_da_aops = {
        .bmap                   = ext4_bmap,
        .invalidatepage         = ext4_invalidatepage,
        .releasepage            = ext4_releasepage,
-       .direct_IO              = ext4_direct_IO,
+       .direct_IO              = noop_direct_IO,
        .migratepage            = buffer_migrate_page,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_page      = generic_error_remove_page,
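
The stub wired in above is not ext4 code: noop_direct_IO() lives in fs/libfs.c and simply fails, so a non-NULL ->direct_IO keeps the VFS's "does this mapping support O_DIRECT" checks happy while all real direct I/O goes through iomap. From memory (verify against fs/libfs.c):

	ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
	{
		/* Never reached on the iomap DIO path; its only job is to
		 * make ->direct_IO non-NULL. */
		return -EINVAL;
	}
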
@@ -5450,11 +5114,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 
        offset = inode->i_size & (PAGE_SIZE - 1);
        /*
-        * All buffers in the last page remain valid? Then there's nothing to
-        * do. We do the check mainly to optimize the common PAGE_SIZE ==
-        * blocksize case
+        * If the page is fully truncated, we don't need to wait for any commit
+        * (and indeed we should not, as __ext4_journalled_invalidatepage() may
+        * strip all buffers from the page but keep it dirty, which can then
+        * confuse e.g. a concurrent ext4_writepage() seeing a dirty page with
+        * no buffers). We also don't need to wait for any commit if all buffers
+        * in the page remain valid. This is most beneficial for the common case
+        * of blocksize == PAGESIZE.
         */
-       if (offset > PAGE_SIZE - i_blocksize(inode))
+       if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
                return;
        while (1) {
                page = find_lock_page(inode->i_mapping,
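
A worked example of the rewritten check, assuming 1KiB blocks and 4KiB pages (numbers are illustrative): offset = i_size & 4095 and PAGE_SIZE - i_blocksize(inode) = 3072, so

	i_size = 8192  -> offset = 0:    tail page fully truncated, return;
	i_size = 9216  -> offset = 1024: partially valid tail page, fall
	                                 through and wait for the commit;
	i_size = 12160 -> offset = 3968: EOF lands in the page's last block,
	                                 all buffers remain valid, return.
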
@@ -5915,8 +5583,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
 {
        struct ext4_inode *raw_inode;
        struct ext4_xattr_ibody_header *header;
+       unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
        int error;
 
+       /* this was checked at iget time, but double check for good measure */
+       if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+           (ei->i_extra_isize & 3)) {
+               EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+                                ei->i_extra_isize,
+                                EXT4_INODE_SIZE(inode->i_sb));
+               return -EFSCORRUPTED;
+       }
+       if ((new_extra_isize < ei->i_extra_isize) ||
+           (new_extra_isize < 4) ||
+           (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+               return -EINVAL; /* Should never happen */
+
        raw_inode = ext4_raw_inode(iloc);
 
        header = IHDR(inode, raw_inode);
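
To make the new bounds concrete: with 256-byte on-disk inodes (EXT4_INODE_SIZE() == 256, EXT4_GOOD_OLD_INODE_SIZE == 128) and the common i_extra_isize of 32, the corruption check passes (128 + 32 <= 256, 32 & 3 == 0), and expanding to new_extra_isize = 64 is accepted because 32 <= 64 <= 256 - 128. A request for 130, by contrast, would exceed inode_size - EXT4_GOOD_OLD_INODE_SIZE = 128 and fail with -EINVAL.
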
@@ -5968,9 +5651,8 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
         * If this is felt to be critical, then e2fsck should be run to
         * force a large enough s_min_extra_isize.
         */
-       if (ext4_handle_valid(handle) &&
-           jbd2_journal_extend(handle,
-                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) != 0)
+       if (ext4_journal_extend(handle,
+                               EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0) != 0)
                return -ENOSPC;
 
        if (ext4_write_trylock_xattr(inode, &no_expand) == 0)