Merge branch 'iomap-for-next' into mb/dio

author Theodore Ts'o <tytso@mit.edu>

Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)

committer Theodore Ts'o <tytso@mit.edu>

Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)
author Theodore Ts'o <tytso@mit.edu>
Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)
committer Theodore Ts'o <tytso@mit.edu>
Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)
diff --git a/fs/dax.c b/fs/dax.c

index 6bf81f931de39e48bc2983a1c901b1a6643443f8..68eef98cd9c4068ffa880aaa7937d34e0052bc15 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
  
  static loff_t
  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap)
+               struct iomap *iomap, struct iomap *srcmap)
  {
         struct block_device *bdev = iomap->bdev;
         struct dax_device *dax_dev = iomap->dax_dev;
@@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
         struct inode *inode = mapping->host;
         unsigned long vaddr = vmf->address;
         loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
-       struct iomap iomap = { 0 };
+       struct iomap iomap = { .type = IOMAP_HOLE };
+       struct iomap srcmap = { .type = IOMAP_HOLE };
         unsigned flags = IOMAP_FAULT;
         int error, major = 0;
         bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
          * the file system block size to be equal the page size, which means
          * that we never have to deal with more than a single extent here.
          */
-       error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+       error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
         if (iomap_errp)
                 *iomap_errp = error;
         if (error) {
@@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
         unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
         struct inode *inode = mapping->host;
         vm_fault_t result = VM_FAULT_FALLBACK;
-       struct iomap iomap = { 0 };
+       struct iomap iomap = { .type = IOMAP_HOLE };
+       struct iomap srcmap = { .type = IOMAP_HOLE };
         pgoff_t max_pgoff;
         void *entry;
         loff_t pos;
@@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
          * to look up our filesystem block.
          */
         pos = (loff_t)xas.xa_index << PAGE_SHIFT;
-       error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
+       error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
+                       &srcmap);
         if (error)
                 goto unlock_entry;
  
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c

index 7004ce581a328b29eebae9e28e220c9fe5c7bbbb..467c13ff6b40902aea637f2bd7e6fa01df23ec73 100644 (file)
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
  
  #ifdef CONFIG_FS_DAX
  static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-               unsigned flags, struct iomap *iomap)
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
  {
         unsigned int blkbits = inode->i_blkbits;
         unsigned long first_block = offset >> blkbits;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index da9fed86086c2dda8e9be464d689fee99549b667..0d8971b819e99577386ed7f23143e30e61b21a81 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3449,7 +3449,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
  }
  
  static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-                           unsigned flags, struct iomap *iomap)
+               unsigned flags, struct iomap *iomap, struct iomap *srcmap)
  {
         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
         unsigned int blkbits = inode->i_blkbits;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c

index f63df54a08c6c3eb3a4c75aa946936929ce203a3..5161032482725cdbe51901d0a69b822f2e9a7b7d 100644 (file)
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
  }
  
  static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
-                           unsigned flags, struct iomap *iomap)
+                           unsigned flags, struct iomap *iomap,
+                           struct iomap *srcmap)
  {
         struct gfs2_inode *ip = GFS2_I(inode);
         struct metapath mp = { .mp_aheight = 1, };
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index 997b326247e26c9432cb87e94688e716b0debd49..f0caee2b7c00004a9aa490eba7860ea887fdb642 100644 (file)
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -732,7 +732,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
         if (ret)
                 goto out_uninit;
  
-       ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
+       ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
+                          is_sync_kiocb(iocb));
  
         gfs2_glock_dq(&gh);
  out_uninit:
@@ -767,7 +768,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
         if (offset + len > i_size_read(&ip->i_inode))
                 goto out;
  
-       ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
+       ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
+                          is_sync_kiocb(iocb));
  
  out:
         gfs2_glock_dq(&gh);
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile

index 93cd11938bf55a5912e346398c47cbe1cbaaffea..eef2722d93a183584625722a70c304ee90d08da8 100644 (file)
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -3,13 +3,15 @@
  # Copyright (c) 2019 Oracle.
  # All Rights Reserved.
  #
-obj-$(CONFIG_FS_IOMAP)         += iomap.o
  
-iomap-y                                += \
-                                       apply.o \
-                                       buffered-io.o \
-                                       direct-io.o \
-                                       fiemap.o \
-                                       seek.o
+ccflags-y += -I $(srctree)/$(src)              # needed for trace events
+
+obj-$(CONFIG_FS_IOMAP)         += iomap.o
  
+iomap-y                                += trace.o \
+                                  apply.o \
+                                  buffered-io.o \
+                                  direct-io.o \
+                                  fiemap.o \
+                                  seek.o
  iomap-$(CONFIG_SWAP)           += swapfile.o
diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c

index 54c02aecf3cd848825b79a57b57e87e53a1f1e69..484dd8eda861df5408831d3c0f45ebf8cd304e9b 100644 (file)
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -23,8 +23,10 @@ loff_t
  iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
                 const struct iomap_ops *ops, void *data, iomap_actor_t actor)
  {
-       struct iomap iomap = { 0 };
+       struct iomap iomap = { .type = IOMAP_HOLE };
+       struct iomap srcmap = { .type = IOMAP_HOLE };
         loff_t written = 0, ret;
+       u64 end;
  
         /*
          * Need to map a range from start position for length bytes. This can
@@ -38,7 +40,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
          * expose transient stale data. If the reserve fails, we can safely
          * back out at this point as there is nothing to undo.
          */
-       ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
+       ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
         if (ret)
                 return ret;
         if (WARN_ON(iomap.offset > pos))
@@ -50,15 +52,26 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
          * Cut down the length to the one actually provided by the filesystem,
          * as it might not be able to give us the whole size that we requested.
          */
-       if (iomap.offset + iomap.length < pos + length)
-               length = iomap.offset + iomap.length - pos;
+       end = iomap.offset + iomap.length;
+       if (srcmap.type != IOMAP_HOLE)
+               end = min(end, srcmap.offset + srcmap.length);
+       if (pos + length > end)
+               length = end - pos;
  
         /*
-        * Now that we have guaranteed that the space allocation will succeed.
+        * Now that we have guaranteed that the space allocation will succeed,
          * we can do the copy-in page by page without having to worry about
          * failures exposing transient data.
+        *
+        * To support COW operations, we read in data for partial blocks from
+        * the srcmap if the file system filled it in.  In that case the
+        * length needs to be limited to the earlier of the ends of the iomaps.
+        * If the file system did not provide a srcmap we pass in the normal
+        * iomap into the actors so that they don't need to have special
+        * handling for the two cases.
          */
-       written = actor(inode, pos, length, data, &iomap);
+       written = actor(inode, pos, length, data, &iomap,
+                       srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
  
         /*
          * Now the data has been copied, commit the range we've copied.  This
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c

index e25901ae3ff447712ec6d5635bb7d9ddfd180d3e..c62e807956b6ccecd516093ee6224e54194de2ce 100644 (file)
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
  // SPDX-License-Identifier: GPL-2.0
  /*
   * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (C) 2016-2019 Christoph Hellwig.
   */
  #include <linux/module.h>
  #include <linux/compiler.h>
@@ -12,13 +12,34 @@
  #include <linux/buffer_head.h>
  #include <linux/dax.h>
  #include <linux/writeback.h>
+#include <linux/list_sort.h>
  #include <linux/swap.h>
  #include <linux/bio.h>
  #include <linux/sched/signal.h>
  #include <linux/migrate.h>
+#include "trace.h"
  
  #include "../internal.h"
  
+/*
+ * Structure allocated for each page when block size < PAGE_SIZE to track
+ * sub-page uptodate status and I/O completions.
+ */
+struct iomap_page {
+       atomic_t                read_count;
+       atomic_t                write_count;
+       DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
+};
+
+static inline struct iomap_page *to_iomap_page(struct page *page)
+{
+       if (page_has_private(page))
+               return (struct iomap_page *)page_private(page);
+       return NULL;
+}
+
+static struct bio_set iomap_ioend_bioset;
+
  static struct iomap_page *
  iomap_page_create(struct inode *inode, struct page *page)
  {
@@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
         SetPageUptodate(page);
  }
  
+static inline bool iomap_block_needs_zeroing(struct inode *inode,
+               struct iomap *iomap, loff_t pos)
+{
+       return iomap->type != IOMAP_MAPPED ||
+               (iomap->flags & IOMAP_F_NEW) ||
+               pos >= i_size_read(inode);
+}
+
  static loff_t
  iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap)
+               struct iomap *iomap, struct iomap *srcmap)
  {
         struct iomap_readpage_ctx *ctx = data;
         struct page *page = ctx->cur_page;
@@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
         if (plen == 0)
                 goto done;
  
-       if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
+       if (iomap_block_needs_zeroing(inode, iomap, pos)) {
                 zero_user(page, poff, plen);
                 iomap_set_range_uptodate(page, poff, plen);
                 goto done;
@@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
         unsigned poff;
         loff_t ret;
  
+       trace_iomap_readpage(page->mapping->host, 1);
+
         for (poff = 0; poff < PAGE_SIZE; poff += ret) {
                 ret = iomap_apply(inode, page_offset(page) + poff,
                                 PAGE_SIZE - poff, 0, ops, &ctx,
@@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
  
  static loff_t
  iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
-               void *data, struct iomap *iomap)
+               void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         struct iomap_readpage_ctx *ctx = data;
         loff_t done, ret;
@@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
                         ctx->cur_page_in_bio = false;
                 }
                 ret = iomap_readpage_actor(inode, pos + done, length - done,
-                               ctx, iomap);
+                               ctx, iomap, srcmap);
         }
  
         return done;
@@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
         loff_t last = page_offset(list_entry(pages->next, struct page, lru));
         loff_t length = last - pos + PAGE_SIZE, ret = 0;
  
+       trace_iomap_readpages(mapping->host, nr_pages);
+
         while (length > 0) {
                 ret = iomap_apply(mapping->host, pos, length, 0, ops,
                                 &ctx, iomap_readpages_actor);
@@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  int
  iomap_releasepage(struct page *page, gfp_t gfp_mask)
  {
+       trace_iomap_releasepage(page->mapping->host, page, 0, 0);
+
         /*
          * mm accommodates an old ext3 case where clean pages might not have had
          * the dirty bit cleared. Thus, it can send actual dirty pages to
@@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
  void
  iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
  {
+       trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
+
         /*
          * If we are invalidating the entire page, clear the dirty state from it
          * and release it to avoid unnecessary buildup of the LRU.
@@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
  EXPORT_SYMBOL_GPL(iomap_migrate_page);
  #endif /* CONFIG_MIGRATION */
  
+enum {
+       IOMAP_WRITE_F_UNSHARE           = (1 << 0),
+};
+
  static void
  iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
  {
@@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
  }
  
  static int
-iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
-               unsigned poff, unsigned plen, unsigned from, unsigned to,
-               struct iomap *iomap)
+iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
+               unsigned plen, struct iomap *iomap)
  {
         struct bio_vec bvec;
         struct bio bio;
  
-       if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
-               zero_user_segments(page, poff, from, to, poff + plen);
-               iomap_set_range_uptodate(page, poff, plen);
-               return 0;
-       }
-
         bio_init(&bio, &bvec, 1);
         bio.bi_opf = REQ_OP_READ;
         bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
@@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
  }
  
  static int
-__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
-               struct page *page, struct iomap *iomap)
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
+               struct page *page, struct iomap *srcmap)
  {
         struct iomap_page *iop = iomap_page_create(inode, page);
         loff_t block_size = i_blocksize(inode);
         loff_t block_start = pos & ~(block_size - 1);
         loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
         unsigned from = offset_in_page(pos), to = from + len, poff, plen;
-       int status = 0;
+       int status;
  
         if (PageUptodate(page))
                 return 0;
@@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
                 if (plen == 0)
                         break;
  
-               if ((from > poff && from < poff + plen) ||
-                   (to > poff && to < poff + plen)) {
-                       status = iomap_read_page_sync(inode, block_start, page,
-                                       poff, plen, from, to, iomap);
-                       if (status)
-                               break;
+               if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
+                   (from <= poff || from >= poff + plen) &&
+                   (to <= poff || to >= poff + plen))
+                       continue;
+
+               if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
+                       if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
+                               return -EIO;
+                       zero_user_segments(page, poff, from, to, poff + plen);
+                       iomap_set_range_uptodate(page, poff, plen);
+                       continue;
                 }
  
+               status = iomap_read_page_sync(block_start, page, poff, plen,
+                               srcmap);
+               if (status)
+                       return status;
         } while ((block_start += plen) < block_end);
  
-       return status;
+       return 0;
  }
  
  static int
  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
-               struct page **pagep, struct iomap *iomap)
+               struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
  {
         const struct iomap_page_ops *page_ops = iomap->page_ops;
-       pgoff_t index = pos >> PAGE_SHIFT;
         struct page *page;
         int status = 0;
  
         BUG_ON(pos + len > iomap->offset + iomap->length);
+       if (srcmap != iomap)
+               BUG_ON(pos + len > srcmap->offset + srcmap->length);
  
         if (fatal_signal_pending(current))
                 return -EINTR;
@@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
                         return status;
         }
  
-       page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
+       page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
+                       AOP_FLAG_NOFS);
         if (!page) {
                 status = -ENOMEM;
                 goto out_no_page;
         }
  
-       if (iomap->type == IOMAP_INLINE)
-               iomap_read_inline_data(inode, page, iomap);
+       if (srcmap->type == IOMAP_INLINE)
+               iomap_read_inline_data(inode, page, srcmap);
         else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
-               status = __block_write_begin_int(page, pos, len, NULL, iomap);
+               status = __block_write_begin_int(page, pos, len, NULL, srcmap);
         else
-               status = __iomap_write_begin(inode, pos, len, page, iomap);
+               status = __iomap_write_begin(inode, pos, len, flags, page,
+                               srcmap);
  
         if (unlikely(status))
                 goto out_unlock;
@@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
  
  static int
  __iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-               unsigned copied, struct page *page, struct iomap *iomap)
+               unsigned copied, struct page *page)
  {
         flush_dcache_page(page);
  
@@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
  }
  
  static int
-iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
-               unsigned copied, struct page *page, struct iomap *iomap)
+iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
+               struct page *page, struct iomap *iomap, struct iomap *srcmap)
  {
         const struct iomap_page_ops *page_ops = iomap->page_ops;
         loff_t old_size = inode->i_size;
         int ret;
  
-       if (iomap->type == IOMAP_INLINE) {
+       if (srcmap->type == IOMAP_INLINE) {
                 ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
-       } else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
+       } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
                 ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
                                 page, NULL);
         } else {
-               ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
+               ret = __iomap_write_end(inode, pos, len, copied, page);
         }
  
         /*
@@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
  
  static loff_t
  iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap)
+               struct iomap *iomap, struct iomap *srcmap)
  {
         struct iov_iter *i = data;
         long status = 0;
         ssize_t written = 0;
-       unsigned int flags = AOP_FLAG_NOFS;
  
         do {
                 struct page *page;
@@ -771,8 +816,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                         break;
                 }
  
-               status = iomap_write_begin(inode, pos, bytes, flags, &page,
-                               iomap);
+               status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
+                               srcmap);
                 if (unlikely(status))
                         break;
  
@@ -783,8 +828,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  
                 flush_dcache_page(page);
  
-               status = iomap_write_end(inode, pos, bytes, copied, page,
-                               iomap);
+               status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
+                               srcmap);
                 if (unlikely(status < 0))
                         break;
                 copied = status;
@@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
  }
  EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
  
-static struct page *
-__iomap_read_page(struct inode *inode, loff_t offset)
-{
-       struct address_space *mapping = inode->i_mapping;
-       struct page *page;
-
-       page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
-       if (IS_ERR(page))
-               return page;
-       if (!PageUptodate(page)) {
-               put_page(page);
-               return ERR_PTR(-EIO);
-       }
-       return page;
-}
-
  static loff_t
-iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap)
+iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+               struct iomap *iomap, struct iomap *srcmap)
  {
         long status = 0;
         ssize_t written = 0;
  
-       do {
-               struct page *page, *rpage;
-               unsigned long offset;   /* Offset into pagecache page */
-               unsigned long bytes;    /* Bytes to write to page */
-
-               offset = offset_in_page(pos);
-               bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+       /* don't bother with blocks that are not shared to start with */
+       if (!(iomap->flags & IOMAP_F_SHARED))
+               return length;
+       /* don't bother with holes or unwritten extents */
+       if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+               return length;
  
-               rpage = __iomap_read_page(inode, pos);
-               if (IS_ERR(rpage))
-                       return PTR_ERR(rpage);
+       do {
+               unsigned long offset = offset_in_page(pos);
+               unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
+               struct page *page;
  
                 status = iomap_write_begin(inode, pos, bytes,
-                                          AOP_FLAG_NOFS, &page, iomap);
-               put_page(rpage);
+                               IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
                 if (unlikely(status))
                         return status;
  
-               WARN_ON_ONCE(!PageUptodate(page));
-
-               status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+               status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
+                               srcmap);
                 if (unlikely(status <= 0)) {
                         if (WARN_ON_ONCE(status == 0))
                                 return -EIO;
@@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  }
  
  int
-iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                 const struct iomap_ops *ops)
  {
         loff_t ret;
  
         while (len) {
                 ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
-                               iomap_dirty_actor);
+                               iomap_unshare_actor);
                 if (ret <= 0)
                         return ret;
                 pos += ret;
@@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
  
         return 0;
  }
-EXPORT_SYMBOL_GPL(iomap_file_dirty);
+EXPORT_SYMBOL_GPL(iomap_file_unshare);
  
  static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
-               unsigned bytes, struct iomap *iomap)
+               unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
  {
         struct page *page;
         int status;
  
-       status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
-                                  iomap);
+       status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
         if (status)
                 return status;
  
         zero_user(page, offset, bytes);
         mark_page_accessed(page);
  
-       return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
+       return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
  }
  
  static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
@@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
  
  static loff_t
  iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
-               void *data, struct iomap *iomap)
+               void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         bool *did_zero = data;
         loff_t written = 0;
         int status;
  
         /* already zeroed?  we're done. */
-       if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+       if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
                 return count;
  
         do {
@@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
                 if (IS_DAX(inode))
                         status = iomap_dax_zero(pos, offset, bytes, iomap);
                 else
-                       status = iomap_zero(inode, pos, offset, bytes, iomap);
+                       status = iomap_zero(inode, pos, offset, bytes, iomap,
+                                       srcmap);
                 if (status < 0)
                         return status;
  
@@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
  
  static loff_t
  iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
-               void *data, struct iomap *iomap)
+               void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         struct page *page = data;
         int ret;
@@ -1071,3 +1098,551 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
         return block_page_mkwrite_return(ret);
  }
  EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
+
+static void
+iomap_finish_page_writeback(struct inode *inode, struct page *page,
+               int error)
+{
+       struct iomap_page *iop = to_iomap_page(page);
+
+       if (error) {
+               SetPageError(page);
+               mapping_set_error(inode->i_mapping, -EIO);
+       }
+
+       WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+       WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
+
+       if (!iop || atomic_dec_and_test(&iop->write_count))
+               end_page_writeback(page);
+}
+
+/*
+ * We're now finished for good with this ioend structure.  Update the page
+ * state, release holds on bios, and finally free up memory.  Do not use the
+ * ioend after this.
+ */
+static void
+iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+       struct inode *inode = ioend->io_inode;
+       struct bio *bio = &ioend->io_inline_bio;
+       struct bio *last = ioend->io_bio, *next;
+       u64 start = bio->bi_iter.bi_sector;
+       loff_t offset = ioend->io_offset;       /* saved: loop may free ioend */
+       bool quiet = bio_flagged(bio, BIO_QUIET);
+
+       for (bio = &ioend->io_inline_bio; bio; bio = next) {
+               struct bio_vec *bv;
+               struct bvec_iter_all iter_all;
+
+               /*
+                * For the last bio, bi_private points to the ioend, so we
+                * need to explicitly end the iteration here.
+                */
+               if (bio == last)
+                       next = NULL;
+               else
+                       next = bio->bi_private;
+
+               /* walk each page on bio, ending page IO on them */
+               bio_for_each_segment_all(bv, bio, iter_all)
+                       iomap_finish_page_writeback(inode, bv->bv_page, error);
+               bio_put(bio);
+       }
+
+       if (unlikely(error && !quiet)) {
+               printk_ratelimited(KERN_ERR
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+                       inode->i_sb->s_id, inode->i_ino, offset, start);
+       }
+}
+
+void
+iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+       struct list_head tmp;
+
+       list_replace_init(&ioend->io_list, &tmp);
+       iomap_finish_ioend(ioend, error);
+
+       while (!list_empty(&tmp)) {
+               ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+               list_del_init(&ioend->io_list);
+               iomap_finish_ioend(ioend, error);
+       }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool
+iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
+{
+       if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+               return false;
+       if ((ioend->io_flags & IOMAP_F_SHARED) ^
+           (next->io_flags & IOMAP_F_SHARED))
+               return false;
+       if ((ioend->io_type == IOMAP_UNWRITTEN) ^
+           (next->io_type == IOMAP_UNWRITTEN))
+               return false;
+       if (ioend->io_offset + ioend->io_size != next->io_offset)
+               return false;
+       return true;
+}
+
+void
+iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
+               void (*merge_private)(struct iomap_ioend *ioend,
+                               struct iomap_ioend *next))
+{
+       struct iomap_ioend *next;
+
+       INIT_LIST_HEAD(&ioend->io_list);
+
+       while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+                       io_list))) {
+               if (!iomap_ioend_can_merge(ioend, next))
+                       break;
+               list_move_tail(&next->io_list, &ioend->io_list);
+               ioend->io_size += next->io_size;
+               if (next->io_private && merge_private)
+                       merge_private(ioend, next);
+       }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int
+iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
+{
+       struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+       struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+       if (ia->io_offset < ib->io_offset)
+               return -1;
+       if (ia->io_offset > ib->io_offset)
+               return 1;
+       return 0;
+}
+
+void
+iomap_sort_ioends(struct list_head *ioend_list)
+{
+       list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
+static void iomap_writepage_end_bio(struct bio *bio)
+{
+       struct iomap_ioend *ioend = bio->bi_private;
+
+       iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+}
+
+/*
+ * Submit the final bio for an ioend.
+ *
+ * If @error is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked pages for writeback
+ * and unlocked them.  In this situation, we need to fail the bio instead of
+ * submitting it.  This typically only happens on a filesystem shutdown.
+ */
+static int
+iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
+               int error)
+{
+       ioend->io_bio->bi_private = ioend;
+       ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+
+       if (wpc->ops->prepare_ioend)
+               error = wpc->ops->prepare_ioend(ioend, error);
+       if (error) {
+               /*
+                * If we are failing the IO now, just mark the ioend with an
+                * error and finish it.  This will run IO completion immediately
+                * as there is only one reference to the ioend at this point in
+                * time.
+                */
+               ioend->io_bio->bi_status = errno_to_blk_status(error);
+               bio_endio(ioend->io_bio);
+               return error;
+       }
+
+       submit_bio(ioend->io_bio);
+       return 0;
+}
+
+static struct iomap_ioend *
+iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
+               loff_t offset, sector_t sector, struct writeback_control *wbc)
+{
+       struct iomap_ioend *ioend;
+       struct bio *bio;
+
+       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
+       bio_set_dev(bio, wpc->iomap.bdev);
+       bio->bi_iter.bi_sector = sector;
+       bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
+       bio->bi_write_hint = inode->i_write_hint;
+       wbc_init_bio(wbc, bio);
+
+       ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+       INIT_LIST_HEAD(&ioend->io_list);
+       ioend->io_type = wpc->iomap.type;
+       ioend->io_flags = wpc->iomap.flags;
+       ioend->io_inode = inode;
+       ioend->io_size = 0;
+       ioend->io_offset = offset;
+       ioend->io_private = NULL;
+       ioend->io_bio = bio;
+       return ioend;
+}
+
+/*
+ * Allocate a new bio, and chain the old bio to the new one.
+ *
+ * Note that we have to perform the chaining in this unintuitive order
+ * so that the bi_private linkage is set up in the right direction for the
+ * traversal in iomap_finish_ioend().
+ */
+static struct bio *
+iomap_chain_bio(struct bio *prev)
+{
+       struct bio *new;
+
+       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
+       bio_copy_dev(new, prev);/* also copies over blkcg information */
+       new->bi_iter.bi_sector = bio_end_sector(prev);
+       new->bi_opf = prev->bi_opf;
+       new->bi_write_hint = prev->bi_write_hint;
+
+       bio_chain(prev, new);
+       bio_get(prev);          /* for iomap_finish_ioend */
+       submit_bio(prev);
+       return new;
+}
+
+static bool
+iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
+               sector_t sector)
+{
+       if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
+           (wpc->ioend->io_flags & IOMAP_F_SHARED))
+               return false;
+       if (wpc->iomap.type != wpc->ioend->io_type)
+               return false;
+       if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+               return false;
+       if (sector != bio_end_sector(wpc->ioend->io_bio))
+               return false;
+       return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first, otherwise finish off the current ioend and start another.
+ */
+static void
+iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
+               struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
+               struct writeback_control *wbc, struct list_head *iolist)
+{
+       sector_t sector = iomap_sector(&wpc->iomap, offset);
+       unsigned len = i_blocksize(inode);
+       unsigned poff = offset & (PAGE_SIZE - 1);
+       bool merged, same_page = false;
+
+       if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
+               if (wpc->ioend)
+                       list_add(&wpc->ioend->io_list, iolist);
+               wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
+       }
+
+       merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
+                       &same_page);
+       if (iop && !same_page)
+               atomic_inc(&iop->write_count);
+
+       if (!merged) {
+               if (bio_full(wpc->ioend->io_bio, len)) {
+                       wpc->ioend->io_bio =
+                               iomap_chain_bio(wpc->ioend->io_bio);
+               }
+               bio_add_page(wpc->ioend->io_bio, page, len, poff);
+       }
+
+       wpc->ioend->io_size += len;
+       wbc_account_cgroup_owner(wbc, page, len);
+}
+
+/*
+ * We implement an immediate ioend submission policy here to avoid needing to
+ * chain multiple ioends and hence nest mempool allocations which can violate
+ * forward progress guarantees we need to provide. The current ioend we are
+ * adding blocks to is cached on the writepage context, and if the new block
+ * does not append to the cached ioend it will create a new ioend and cache that
+ * instead.
+ *
+ * If a new ioend is created and cached, the old ioend is returned and queued
+ * locally for submission once the entire page is processed or an error has been
+ * detected.  While ioends are submitted immediately after they are completed,
+ * batching optimisations are provided by higher level block plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+static int
+iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+               struct writeback_control *wbc, struct inode *inode,
+               struct page *page, u64 end_offset)
+{
+       struct iomap_page *iop = to_iomap_page(page);
+       struct iomap_ioend *ioend, *next;
+       unsigned len = i_blocksize(inode);
+       u64 file_offset; /* file offset of page */
+       int error = 0, count = 0, i;
+       LIST_HEAD(submit_list);
+
+       WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
+       WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
+
+       /*
+        * Walk through the page to find areas to write back. If we run off the
+        * end of the current map or find the current map invalid, grab a new
+        * one.
+        */
+       for (i = 0, file_offset = page_offset(page);
+            i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
+            i++, file_offset += len) {
+               if (iop && !test_bit(i, iop->uptodate))
+                       continue;
+
+               error = wpc->ops->map_blocks(wpc, inode, file_offset);
+               if (error)
+                       break;
+               if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
+                       continue;
+               if (wpc->iomap.type == IOMAP_HOLE)
+                       continue;
+               iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
+                                &submit_list);
+               count++;
+       }
+
+       WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
+       WARN_ON_ONCE(!PageLocked(page));
+       WARN_ON_ONCE(PageWriteback(page));
+
+       /*
+        * We cannot cancel the ioend directly here on error.  We may have
+        * already set other pages under writeback and hence we have to run I/O
+        * completion to mark the error state of the pages under writeback
+        * appropriately.
+        */
+       if (unlikely(error)) {
+               if (!count) {
+                       /*
+                        * If the current page hasn't been added to ioend, it
+                        * won't be affected by I/O completions and we must
+                        * discard and unlock it right here.
+                        */
+                       if (wpc->ops->discard_page)
+                               wpc->ops->discard_page(page);
+                       ClearPageUptodate(page);
+                       unlock_page(page);
+                       goto done;
+               }
+
+               /*
+                * If the page was not fully cleaned, we need to ensure that the
+                * higher layers come back to it correctly.  That means we need
+                * to keep the page dirty, and for WB_SYNC_ALL writeback we need
+                * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
+                * so another attempt to write this page in this writeback sweep
+                * will be made.
+                */
+               set_page_writeback_keepwrite(page);
+       } else {
+               clear_page_dirty_for_io(page);
+               set_page_writeback(page);
+       }
+
+       unlock_page(page);
+
+       /*
+        * Preserve the original error if there was one, otherwise catch
+        * submission errors here and propagate into subsequent ioend
+        * submissions.
+        */
+       list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
+               int error2;
+
+               list_del_init(&ioend->io_list);
+               error2 = iomap_submit_ioend(wpc, ioend, error);
+               if (error2 && !error)
+                       error = error2;
+       }
+
+       /*
+        * We can end up here with no error and nothing to write only if we race
+        * with a partial page truncate on a sub-page block sized filesystem.
+        */
+       if (!count)
+               end_page_writeback(page);
+done:
+       mapping_set_error(page->mapping, error);
+       return error;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ */
+static int
+iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
+{
+       struct iomap_writepage_ctx *wpc = data;
+       struct inode *inode = page->mapping->host;
+       pgoff_t end_index;
+       u64 end_offset;
+       loff_t offset;
+
+       trace_iomap_writepage(inode, page, 0, 0);
+
+       /*
+        * Refuse to write the page out if we are called from reclaim context.
+        *
+        * This avoids stack overflows when called from deeply used stacks in
+        * random callers for direct reclaim or memcg reclaim.  We explicitly
+        * allow reclaim from kswapd as the stack usage there is relatively low.
+        *
+        * This should never happen except in the case of a VM regression so
+        * warn about it.
+        */
+       if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
+                       PF_MEMALLOC))
+               goto redirty;
+
+       /*
+        * Given that we do not allow direct reclaim to call us, we should
+        * never be called in a recursive filesystem reclaim context.
+        */
+       if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
+               goto redirty;
+
+       /*
+        * Is this page beyond the end of the file?
+        *
+        * The page index is less than the end_index, adjust the end_offset
+        * to the highest offset that this page should represent.
+        * -----------------------------------------------------
+        * |                    file mapping           | <EOF> |
+        * -----------------------------------------------------
+        * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
+        * ^--------------------------------^----------|--------
+        * |     desired writeback range    |      see else    |
+        * ---------------------------------^------------------|
+        */
+       offset = i_size_read(inode);
+       end_index = offset >> PAGE_SHIFT;
+       if (page->index < end_index)
+               end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
+       else {
+               /*
+                * Check whether the page to write out is beyond or straddles
+                * i_size or not.
+                * -------------------------------------------------------
+                * |            file mapping                    | <EOF>  |
+                * -------------------------------------------------------
+                * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
+                * ^--------------------------------^-----------|---------
+                * |                                |      Straddles     |
+                * ---------------------------------^-----------|--------|
+                */
+               unsigned offset_into_page = offset & (PAGE_SIZE - 1);
+
+               /*
+                * Skip the page if it is fully outside i_size, e.g. due to a
+                * truncate operation that is in progress. We must redirty the
+                * page so that reclaim stops reclaiming it. Otherwise
+                * iomap_vm_releasepage() is called on it and gets confused.
+                *
+                * Note that the end_index is unsigned long, it would overflow
+                * if the given offset is greater than 16TB on 32-bit system
+                * and if we do check the page is fully outside i_size or not
+                * via "if (page->index >= end_index + 1)" as "end_index + 1"
+                * will be evaluated to 0.  Hence this page will be redirtied
+                * and be written out repeatedly which would result in an
+                * infinite loop, the user program that performs this operation
+                * will hang.  Instead, we can verify this situation by checking
+                * if the page to write is totally beyond the i_size or if its
+                * offset is just equal to the EOF.
+                */
+               if (page->index > end_index ||
+                   (page->index == end_index && offset_into_page == 0))
+                       goto redirty;
+
+               /*
+                * The page straddles i_size.  It must be zeroed out on each
+                * and every writepage invocation because it may be mmapped.
+                * "A file is mapped in multiples of the page size.  For a file
+                * that is not a multiple of the page size, the remaining
+                * memory is zeroed when mapped, and writes to that region are
+                * not written out to the file."
+                */
+               zero_user_segment(page, offset_into_page, PAGE_SIZE);
+
+               /* Adjust the end_offset to the end of file */
+               end_offset = offset;
+       }
+
+       return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
+
+redirty:
+       redirty_page_for_writepage(wbc, page);
+       unlock_page(page);
+       return 0;
+}
+
+int
+iomap_writepage(struct page *page, struct writeback_control *wbc,
+               struct iomap_writepage_ctx *wpc,
+               const struct iomap_writeback_ops *ops)
+{
+       int ret;
+
+       wpc->ops = ops;
+       ret = iomap_do_writepage(page, wbc, wpc);
+       if (!wpc->ioend)
+               return ret;
+       return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepage);
+
+int
+iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
+               struct iomap_writepage_ctx *wpc,
+               const struct iomap_writeback_ops *ops)
+{
+       int                     ret;
+
+       wpc->ops = ops;
+       ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
+       if (!wpc->ioend)
+               return ret;
+       return iomap_submit_ioend(wpc, wpc->ioend, ret);
+}
+EXPORT_SYMBOL_GPL(iomap_writepages);
+
+static int __init iomap_init(void)
+{
+       return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+                          offsetof(struct iomap_ioend, io_inline_bio),
+                          BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c

index 1fc28c2da2790fc6ed9d1f167b1f17d577223d88..2f88d64c2a4dbb8cecdacd9a29ce2730b9f28ba4 100644 (file)
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -358,7 +358,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
  
  static loff_t
  iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
-               void *data, struct iomap *iomap)
+               void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         struct iomap_dio *dio = data;
  
@@ -392,7 +392,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
   */
  ssize_t
  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-               const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+               bool wait_for_completion)
  {
         struct address_space *mapping = iocb->ki_filp->f_mapping;
         struct inode *inode = file_inode(iocb->ki_filp);
@@ -400,7 +401,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         loff_t pos = iocb->ki_pos, start = pos;
         loff_t end = iocb->ki_pos + count - 1, ret = 0;
         unsigned int flags = IOMAP_DIRECT;
-       bool wait_for_completion = is_sync_kiocb(iocb);
         struct blk_plug plug;
         struct iomap_dio *dio;
  
@@ -409,6 +409,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
         if (!count)
                 return 0;
  
+       if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
+               return -EIO;
+
         dio = kmalloc(sizeof(*dio), GFP_KERNEL);
         if (!dio)
                 return -ENOMEM;
@@ -430,7 +433,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
                 if (pos >= dio->i_size)
                         goto out_free_dio;
  
-               if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
+               if (iter_is_iovec(iter))
                         dio->flags |= IOMAP_DIO_DIRTY;
         } else {
                 flags |= IOMAP_WRITE;
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c

index f26fdd36e3832b22cbc06a2248b3105e880f20bd..690ef2d7c6c803e802dc4f992eb4687e5994ddd7 100644 (file)
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
  
  static loff_t
  iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
-               struct iomap *iomap)
+               struct iomap *iomap, struct iomap *srcmap)
  {
         struct fiemap_ctx *ctx = data;
         loff_t ret = length;
@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
  
  static loff_t
  iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
-               void *data, struct iomap *iomap)
+               void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         sector_t *bno = data, addr;
  
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c

index c04bad4b2b43f9fbdfc3a3788e36e8dff95ba241..89f61d93c0bcfddad22c6dda6385bdaa4e8da893 100644 (file)
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -119,7 +119,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
  
  static loff_t
  iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
-                     void *data, struct iomap *iomap)
+                     void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         switch (iomap->type) {
         case IOMAP_UNWRITTEN:
@@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole);
  
  static loff_t
  iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
-                     void *data, struct iomap *iomap)
+                     void *data, struct iomap *iomap, struct iomap *srcmap)
  {
         switch (iomap->type) {
         case IOMAP_HOLE:
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c

index 152a230f668d47256f3f27933f757da7380a0c8a..a648dbf6991e4e5307574b7bb207269d7f650321 100644 (file)
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
   * distinction between written and unwritten extents.
   */
  static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
-               loff_t count, void *data, struct iomap *iomap)
+               loff_t count, void *data, struct iomap *iomap,
+               struct iomap *srcmap)
  {
         struct iomap_swapfile_info *isi = data;
         int error;
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c

new file mode 100644 (file)

index 0000000..da21724
--- /dev/null
+++ b/fs/iomap/trace.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Christoph Hellwig
+ */
+#include <linux/iomap.h>
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h

new file mode 100644 (file)

index 0000000..4ca1aa2
--- /dev/null
+++ b/fs/iomap/trace.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2009-2019 Christoph Hellwig
+ *
+ * NOTE: none of these tracepoints shall be considered a stable kernel ABI
+ * as they can change at any time.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iomap
+
+#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _IOMAP_TRACE_H
+
+#include <linux/tracepoint.h>
+
+struct inode;
+
+DECLARE_EVENT_CLASS(iomap_readpage_class,
+       TP_PROTO(struct inode *inode, int nr_pages),
+       TP_ARGS(inode, nr_pages),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(u64, ino)
+               __field(int, nr_pages)
+       ),
+       TP_fast_assign(
+               __entry->dev = inode->i_sb->s_dev;
+               __entry->ino = inode->i_ino;
+               __entry->nr_pages = nr_pages;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name)            \
+DEFINE_EVENT(iomap_readpage_class, name,       \
+       TP_PROTO(struct inode *inode, int nr_pages), \
+       TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(iomap_readpage);
+DEFINE_READPAGE_EVENT(iomap_readpages);
+
+DECLARE_EVENT_CLASS(iomap_page_class,
+       TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
+                unsigned int len),
+       TP_ARGS(inode, page, off, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(u64, ino)
+               __field(pgoff_t, pgoff)
+               __field(loff_t, size)
+               __field(unsigned long, offset)
+               __field(unsigned int, length)
+       ),
+       TP_fast_assign(
+               __entry->dev = inode->i_sb->s_dev;
+               __entry->ino = inode->i_ino;
+               __entry->pgoff = page_offset(page);
+               __entry->size = i_size_read(inode);
+               __entry->offset = off;
+               __entry->length = len;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+                 "length %x",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->pgoff,
+                 __entry->size,
+                 __entry->offset,
+                 __entry->length)
+)
+
+#define DEFINE_PAGE_EVENT(name)                \
+DEFINE_EVENT(iomap_page_class, name,   \
+       TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
+                unsigned int len),     \
+       TP_ARGS(inode, page, off, len))
+DEFINE_PAGE_EVENT(iomap_writepage);
+DEFINE_PAGE_EVENT(iomap_releasepage);
+DEFINE_PAGE_EVENT(iomap_invalidatepage);
+
+#endif /* _IOMAP_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c

index 02469d59c7879d5a6e039d929df11e558ab186f3..ef75e223cb703ca1a16f6995a3e4e046f312941d 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,6 +34,7 @@
  #include "xfs_ag_resv.h"
  #include "xfs_refcount.h"
  #include "xfs_icache.h"
+#include "xfs_iomap.h"
  
  
  kmem_zone_t            *xfs_bmap_free_item_zone;
@@ -4456,16 +4457,21 @@ int
  xfs_bmapi_convert_delalloc(
         struct xfs_inode        *ip,
         int                     whichfork,
-       xfs_fileoff_t           offset_fsb,
-       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset,
+       struct iomap            *iomap,
         unsigned int            *seq)
  {
         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
         struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
         struct xfs_bmalloca     bma = { NULL };
+       u16                     flags = 0;
         struct xfs_trans        *tp;
         int                     error;
  
+       if (whichfork == XFS_COW_FORK)
+               flags |= IOMAP_F_SHARED;
+
         /*
          * Space for the extent and indirect blocks was reserved when the
          * delalloc extent was created so there's no need to do so here.
@@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc(
          * the extent.  Just return the real extent at this offset.
          */
         if (!isnullstartblock(bma.got.br_startblock)) {
-               *imap = bma.got;
+               xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
                 *seq = READ_ONCE(ifp->if_seq);
                 goto out_trans_cancel;
         }
@@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc(
         XFS_STATS_INC(mp, xs_xstrat_quick);
  
         ASSERT(!isnullstartblock(bma.got.br_startblock));
-       *imap = bma.got;
+       xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
         *seq = READ_ONCE(ifp->if_seq);
  
         if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h

index e2798c6f3a5f350f655ec02ef6e439212d0b45d8..14d25e0b7d9c8b3968ae092fd399e775936dec4c 100644 (file)
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
                 struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
                 int eof);
  int    xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
-               xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
-               unsigned int *seq);
+               xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
  int    xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
                 struct xfs_inode *ip, int whichfork,
                 struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c

index f16d5f196c6b15cfb7cd84807984a76cf10b52dd..5936507c6f50d954041aceb186e45149d08475f0 100644 (file)
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -18,17 +18,18 @@
  #include "xfs_bmap_util.h"
  #include "xfs_reflink.h"
  
-/*
- * structure owned by writepages passed to individual writepage calls
- */
  struct xfs_writepage_ctx {
-       struct xfs_bmbt_irec    imap;
-       int                     fork;
+       struct iomap_writepage_ctx ctx;
         unsigned int            data_seq;
         unsigned int            cow_seq;
-       struct xfs_ioend        *ioend;
  };
  
+static inline struct xfs_writepage_ctx *
+XFS_WPC(struct iomap_writepage_ctx *ctx)
+{
+       return container_of(ctx, struct xfs_writepage_ctx, ctx);
+}
+
  struct block_device *
  xfs_find_bdev_for_inode(
         struct inode            *inode)
@@ -55,71 +56,10 @@ xfs_find_daxdev_for_inode(
                 return mp->m_ddev_targp->bt_daxdev;
  }
  
-static void
-xfs_finish_page_writeback(
-       struct inode            *inode,
-       struct bio_vec  *bvec,
-       int                     error)
-{
-       struct iomap_page       *iop = to_iomap_page(bvec->bv_page);
-
-       if (error) {
-               SetPageError(bvec->bv_page);
-               mapping_set_error(inode->i_mapping, -EIO);
-       }
-
-       ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
-       ASSERT(!iop || atomic_read(&iop->write_count) > 0);
-
-       if (!iop || atomic_dec_and_test(&iop->write_count))
-               end_page_writeback(bvec->bv_page);
-}
-
-/*
- * We're now finished for good with this ioend structure.  Update the page
- * state, release holds on bios, and finally free up memory.  Do not use the
- * ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
-       struct xfs_ioend        *ioend,
-       int                     error)
-{
-       struct inode            *inode = ioend->io_inode;
-       struct bio              *bio = &ioend->io_inline_bio;
-       struct bio              *last = ioend->io_bio, *next;
-       u64                     start = bio->bi_iter.bi_sector;
-       bool                    quiet = bio_flagged(bio, BIO_QUIET);
-
-       for (bio = &ioend->io_inline_bio; bio; bio = next) {
-               struct bio_vec  *bvec;
-               struct bvec_iter_all iter_all;
-
-               /*
-                * For the last bio, bi_private points to the ioend, so we
-                * need to explicitly end the iteration here.
-                */
-               if (bio == last)
-                       next = NULL;
-               else
-                       next = bio->bi_private;
-
-               /* walk each page on bio, ending page IO on them */
-               bio_for_each_segment_all(bvec, bio, iter_all)
-                       xfs_finish_page_writeback(inode, bvec, error);
-               bio_put(bio);
-       }
-
-       if (unlikely(error && !quiet)) {
-               xfs_err_ratelimited(XFS_I(inode)->i_mount,
-                       "writeback error on sector %llu", start);
-       }
-}
-
  /*
   * Fast and loose check if this write could update the on-disk inode size.
   */
-static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
+static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
  {
         return ioend->io_offset + ioend->io_size >
                 XFS_I(ioend->io_inode)->i_d.di_size;
@@ -127,7 +67,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
  
  STATIC int
  xfs_setfilesize_trans_alloc(
-       struct xfs_ioend        *ioend)
+       struct iomap_ioend      *ioend)
  {
         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
         struct xfs_trans        *tp;
@@ -137,7 +77,7 @@ xfs_setfilesize_trans_alloc(
         if (error)
                 return error;
  
-       ioend->io_append_trans = tp;
+       ioend->io_private = tp;
  
         /*
          * We may pass freeze protection with a transaction.  So tell lockdep
@@ -200,11 +140,11 @@ xfs_setfilesize(
  
  STATIC int
  xfs_setfilesize_ioend(
-       struct xfs_ioend        *ioend,
+       struct iomap_ioend      *ioend,
         int                     error)
  {
         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
-       struct xfs_trans        *tp = ioend->io_append_trans;
+       struct xfs_trans        *tp = ioend->io_private;
  
         /*
          * The transaction may have been allocated in the I/O submission thread,
@@ -228,9 +168,8 @@ xfs_setfilesize_ioend(
   */
  STATIC void
  xfs_end_ioend(
-       struct xfs_ioend        *ioend)
+       struct iomap_ioend      *ioend)
  {
-       struct list_head        ioend_list;
         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
         xfs_off_t               offset = ioend->io_offset;
         size_t                  size = ioend->io_size;
@@ -257,7 +196,7 @@ xfs_end_ioend(
          */
         error = blk_status_to_errno(ioend->io_bio->bi_status);
         if (unlikely(error)) {
-               if (ioend->io_fork == XFS_COW_FORK)
+               if (ioend->io_flags & IOMAP_F_SHARED)
                         xfs_reflink_cancel_cow_range(ip, offset, size, true);
                 goto done;
         }
@@ -265,49 +204,20 @@ xfs_end_ioend(
         /*
          * Success: commit the COW or unwritten blocks if needed.
          */
-       if (ioend->io_fork == XFS_COW_FORK)
+       if (ioend->io_flags & IOMAP_F_SHARED)
                 error = xfs_reflink_end_cow(ip, offset, size);
-       else if (ioend->io_state == XFS_EXT_UNWRITTEN)
+       else if (ioend->io_type == IOMAP_UNWRITTEN)
                 error = xfs_iomap_write_unwritten(ip, offset, size, false);
         else
-               ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
+               ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
  
  done:
-       if (ioend->io_append_trans)
+       if (ioend->io_private)
                 error = xfs_setfilesize_ioend(ioend, error);
-       list_replace_init(&ioend->io_list, &ioend_list);
-       xfs_destroy_ioend(ioend, error);
-
-       while (!list_empty(&ioend_list)) {
-               ioend = list_first_entry(&ioend_list, struct xfs_ioend,
-                               io_list);
-               list_del_init(&ioend->io_list);
-               xfs_destroy_ioend(ioend, error);
-       }
-
+       iomap_finish_ioends(ioend, error);
         memalloc_nofs_restore(nofs_flag);
  }
  
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-xfs_ioend_can_merge(
-       struct xfs_ioend        *ioend,
-       struct xfs_ioend        *next)
-{
-       if (ioend->io_bio->bi_status != next->io_bio->bi_status)
-               return false;
-       if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
-               return false;
-       if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
-           (next->io_state == XFS_EXT_UNWRITTEN))
-               return false;
-       if (ioend->io_offset + ioend->io_size != next->io_offset)
-               return false;
-       return true;
-}
-
  /*
   * If the to be merged ioend has a preallocated transaction for file
   * size updates we need to ensure the ioend it is merged into also
@@ -315,104 +225,65 @@ xfs_ioend_can_merge(
   * as it is guaranteed to be clean.
   */
  static void
-xfs_ioend_merge_append_transactions(
-       struct xfs_ioend        *ioend,
-       struct xfs_ioend        *next)
+xfs_ioend_merge_private(
+       struct iomap_ioend      *ioend,
+       struct iomap_ioend      *next)
  {
-       if (!ioend->io_append_trans) {
-               ioend->io_append_trans = next->io_append_trans;
-               next->io_append_trans = NULL;
+       if (!ioend->io_private) {
+               ioend->io_private = next->io_private;
+               next->io_private = NULL;
         } else {
                 xfs_setfilesize_ioend(next, -ECANCELED);
         }
  }
  
-/* Try to merge adjacent completions. */
-STATIC void
-xfs_ioend_try_merge(
-       struct xfs_ioend        *ioend,
-       struct list_head        *more_ioends)
-{
-       struct xfs_ioend        *next_ioend;
-
-       while (!list_empty(more_ioends)) {
-               next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
-                               io_list);
-               if (!xfs_ioend_can_merge(ioend, next_ioend))
-                       break;
-               list_move_tail(&next_ioend->io_list, &ioend->io_list);
-               ioend->io_size += next_ioend->io_size;
-               if (next_ioend->io_append_trans)
-                       xfs_ioend_merge_append_transactions(ioend, next_ioend);
-       }
-}
-
-/* list_sort compare function for ioends */
-static int
-xfs_ioend_compare(
-       void                    *priv,
-       struct list_head        *a,
-       struct list_head        *b)
-{
-       struct xfs_ioend        *ia;
-       struct xfs_ioend        *ib;
-
-       ia = container_of(a, struct xfs_ioend, io_list);
-       ib = container_of(b, struct xfs_ioend, io_list);
-       if (ia->io_offset < ib->io_offset)
-               return -1;
-       else if (ia->io_offset > ib->io_offset)
-               return 1;
-       return 0;
-}
-
  /* Finish all pending io completions. */
  void
  xfs_end_io(
         struct work_struct      *work)
  {
-       struct xfs_inode        *ip;
-       struct xfs_ioend        *ioend;
-       struct list_head        completion_list;
+       struct xfs_inode        *ip =
+               container_of(work, struct xfs_inode, i_ioend_work);
+       struct iomap_ioend      *ioend;
+       struct list_head        tmp;
         unsigned long           flags;
  
-       ip = container_of(work, struct xfs_inode, i_ioend_work);
-
         spin_lock_irqsave(&ip->i_ioend_lock, flags);
-       list_replace_init(&ip->i_ioend_list, &completion_list);
+       list_replace_init(&ip->i_ioend_list, &tmp);
         spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
  
-       list_sort(NULL, &completion_list, xfs_ioend_compare);
-
-       while (!list_empty(&completion_list)) {
-               ioend = list_first_entry(&completion_list, struct xfs_ioend,
-                               io_list);
+       iomap_sort_ioends(&tmp);
+       while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
+                       io_list))) {
                 list_del_init(&ioend->io_list);
-               xfs_ioend_try_merge(ioend, &completion_list);
+               iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
                 xfs_end_ioend(ioend);
         }
  }
  
+static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
+{
+       return ioend->io_private ||
+               ioend->io_type == IOMAP_UNWRITTEN ||
+               (ioend->io_flags & IOMAP_F_SHARED);
+}
+
  STATIC void
  xfs_end_bio(
         struct bio              *bio)
  {
-       struct xfs_ioend        *ioend = bio->bi_private;
+       struct iomap_ioend      *ioend = bio->bi_private;
         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
-       struct xfs_mount        *mp = ip->i_mount;
         unsigned long           flags;
  
-       if (ioend->io_fork == XFS_COW_FORK ||
-           ioend->io_state == XFS_EXT_UNWRITTEN ||
-           ioend->io_append_trans != NULL) {
-               spin_lock_irqsave(&ip->i_ioend_lock, flags);
-               if (list_empty(&ip->i_ioend_list))
-                       WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
-                                                &ip->i_ioend_work));
-               list_add_tail(&ioend->io_list, &ip->i_ioend_list);
-               spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
-       } else
-               xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
+       ASSERT(xfs_ioend_needs_workqueue(ioend));
+
+       spin_lock_irqsave(&ip->i_ioend_lock, flags);
+       if (list_empty(&ip->i_ioend_list))
+               WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+                                        &ip->i_ioend_work));
+       list_add_tail(&ioend->io_list, &ip->i_ioend_list);
+       spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
  }
  
  /*
@@ -421,19 +292,19 @@ xfs_end_bio(
   */
  static bool
  xfs_imap_valid(
-       struct xfs_writepage_ctx        *wpc,
+       struct iomap_writepage_ctx      *wpc,
         struct xfs_inode                *ip,
-       xfs_fileoff_t                   offset_fsb)
+       loff_t                          offset)
  {
-       if (offset_fsb < wpc->imap.br_startoff ||
-           offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
+       if (offset < wpc->iomap.offset ||
+           offset >= wpc->iomap.offset + wpc->iomap.length)
                 return false;
         /*
          * If this is a COW mapping, it is sufficient to check that the mapping
          * covers the offset. Be careful to check this first because the caller
          * can revalidate a COW mapping without updating the data seqno.
          */
-       if (wpc->fork == XFS_COW_FORK)
+       if (wpc->iomap.flags & IOMAP_F_SHARED)
                 return true;
  
         /*
@@ -443,17 +314,17 @@ xfs_imap_valid(
          * checked (and found nothing at this offset) could have added
          * overlapping blocks.
          */
-       if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
+       if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
                 return false;
         if (xfs_inode_has_cow_data(ip) &&
-           wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
+           XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
                 return false;
         return true;
  }
  
  /*
   * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->imap.
+ * extent that maps offset in wpc->iomap.
   *
   * The current page is held locked so nothing could have removed the block
   * backing offset_fsb, although it could have moved from the COW to the data
@@ -461,32 +332,38 @@ xfs_imap_valid(
   */
  static int
  xfs_convert_blocks(
-       struct xfs_writepage_ctx *wpc,
+       struct iomap_writepage_ctx *wpc,
         struct xfs_inode        *ip,
-       xfs_fileoff_t           offset_fsb)
+       int                     whichfork,
+       loff_t                  offset)
  {
         int                     error;
+       unsigned                *seq;
+
+       if (whichfork == XFS_COW_FORK)
+               seq = &XFS_WPC(wpc)->cow_seq;
+       else
+               seq = &XFS_WPC(wpc)->data_seq;
  
         /*
-        * Attempt to allocate whatever delalloc extent currently backs
-        * offset_fsb and put the result into wpc->imap.  Allocate in a loop
-        * because it may take several attempts to allocate real blocks for a
-        * contiguous delalloc extent if free space is sufficiently fragmented.
+        * Attempt to allocate whatever delalloc extent currently backs offset
+        * and put the result into wpc->iomap.  Allocate in a loop because it
+        * may take several attempts to allocate real blocks for a contiguous
+        * delalloc extent if free space is sufficiently fragmented.
          */
         do {
-               error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
-                               &wpc->imap, wpc->fork == XFS_COW_FORK ?
-                                       &wpc->cow_seq : &wpc->data_seq);
+               error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+                               &wpc->iomap, seq);
                 if (error)
                         return error;
-       } while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
+       } while (wpc->iomap.offset + wpc->iomap.length <= offset);
  
         return 0;
  }
  
-STATIC int
+static int
  xfs_map_blocks(
-       struct xfs_writepage_ctx *wpc,
+       struct iomap_writepage_ctx *wpc,
         struct inode            *inode,
         loff_t                  offset)
  {
@@ -496,6 +373,7 @@ xfs_map_blocks(
         xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset);
         xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + count);
         xfs_fileoff_t           cow_fsb = NULLFILEOFF;
+       int                     whichfork = XFS_DATA_FORK;
         struct xfs_bmbt_irec    imap;
         struct xfs_iext_cursor  icur;
         int                     retries = 0;
@@ -519,7 +397,7 @@ xfs_map_blocks(
          * against concurrent updates and provides a memory barrier on the way
          * out that ensures that we always see the current value.
          */
-       if (xfs_imap_valid(wpc, ip, offset_fsb))
+       if (xfs_imap_valid(wpc, ip, offset))
                 return 0;
  
         /*
@@ -541,10 +419,10 @@ xfs_map_blocks(
             xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
                 cow_fsb = imap.br_startoff;
         if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
-               wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
+               XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
-               wpc->fork = XFS_COW_FORK;
+               whichfork = XFS_COW_FORK;
                 goto allocate_blocks;
         }
  
@@ -552,7 +430,7 @@ xfs_map_blocks(
          * No COW extent overlap. Revalidate now that we may have updated
          * ->cow_seq. If the data mapping is still valid, we're done.
          */
-       if (xfs_imap_valid(wpc, ip, offset_fsb)) {
+       if (xfs_imap_valid(wpc, ip, offset)) {
                 xfs_iunlock(ip, XFS_ILOCK_SHARED);
                 return 0;
         }
@@ -564,11 +442,9 @@ xfs_map_blocks(
          */
         if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                 imap.br_startoff = end_fsb;     /* fake a hole past EOF */
-       wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
+       XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
         xfs_iunlock(ip, XFS_ILOCK_SHARED);
  
-       wpc->fork = XFS_DATA_FORK;
-
         /* landed in a hole or beyond EOF? */
         if (imap.br_startoff > offset_fsb) {
                 imap.br_blockcount = imap.br_startoff - offset_fsb;
@@ -592,11 +468,11 @@ xfs_map_blocks(
             isnullstartblock(imap.br_startblock))
                 goto allocate_blocks;
  
-       wpc->imap = imap;
-       trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
+       xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
+       trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
         return 0;
  allocate_blocks:
-       error = xfs_convert_blocks(wpc, ip, offset_fsb);
+       error = xfs_convert_blocks(wpc, ip, whichfork, offset);
         if (error) {
                 /*
                  * If we failed to find the extent in the COW fork we might have
@@ -605,7 +481,7 @@ xfs_map_blocks(
                  * the former case, but prevent additional retries to avoid
                  * looping forever for the latter case.
                  */
-               if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
+               if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
                         goto retry;
                 ASSERT(error != -EAGAIN);
                 return error;
@@ -616,34 +492,22 @@ xfs_map_blocks(
          * original delalloc one.  Trim the return extent to the next COW
          * boundary again to force a re-lookup.
          */
-       if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
-           cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
-               wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
+       if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
+               loff_t          cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
+
+               if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
+                       wpc->iomap.length = cow_offset - wpc->iomap.offset;
+       }
  
-       ASSERT(wpc->imap.br_startoff <= offset_fsb);
-       ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
-       trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
+       ASSERT(wpc->iomap.offset <= offset);
+       ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
+       trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
         return 0;
  }
  
-/*
- * Submit the bio for an ioend. We are passed an ioend with a bio attached to
- * it, and we submit that bio. The ioend may be used for multiple bio
- * submissions, so we only want to allocate an append transaction for the ioend
- * once. In the case of multiple bio submission, each bio will take an IO
- * reference to the ioend to ensure that the ioend completion is only done once
- * all bios have been submitted and the ioend is really done.
- *
- * If @status is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we have marked paged for writeback
- * and unlocked them. In this situation, we need to fail the bio and ioend
- * rather than submit it to IO. This typically only happens on a filesystem
- * shutdown.
- */
-STATIC int
-xfs_submit_ioend(
-       struct writeback_control *wbc,
-       struct xfs_ioend        *ioend,
+static int
+xfs_prepare_ioend(
+       struct iomap_ioend      *ioend,
         int                     status)
  {
         unsigned int            nofs_flag;
@@ -656,157 +520,24 @@ xfs_submit_ioend(
         nofs_flag = memalloc_nofs_save();
  
         /* Convert CoW extents to regular */
-       if (!status && ioend->io_fork == XFS_COW_FORK) {
+       if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
                 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
                                 ioend->io_offset, ioend->io_size);
         }
  
         /* Reserve log space if we might write beyond the on-disk inode size. */
         if (!status &&
-           (ioend->io_fork == XFS_COW_FORK ||
-            ioend->io_state != XFS_EXT_UNWRITTEN) &&
+           ((ioend->io_flags & IOMAP_F_SHARED) ||
+            ioend->io_type != IOMAP_UNWRITTEN) &&
             xfs_ioend_is_append(ioend) &&
-           !ioend->io_append_trans)
+           !ioend->io_private)
                 status = xfs_setfilesize_trans_alloc(ioend);
  
         memalloc_nofs_restore(nofs_flag);
  
-       ioend->io_bio->bi_private = ioend;
-       ioend->io_bio->bi_end_io = xfs_end_bio;
-
-       /*
-        * If we are failing the IO now, just mark the ioend with an
-        * error and finish it. This will run IO completion immediately
-        * as there is only one reference to the ioend at this point in
-        * time.
-        */
-       if (status) {
-               ioend->io_bio->bi_status = errno_to_blk_status(status);
-               bio_endio(ioend->io_bio);
-               return status;
-       }
-
-       submit_bio(ioend->io_bio);
-       return 0;
-}
-
-static struct xfs_ioend *
-xfs_alloc_ioend(
-       struct inode            *inode,
-       int                     fork,
-       xfs_exntst_t            state,
-       xfs_off_t               offset,
-       struct block_device     *bdev,
-       sector_t                sector,
-       struct writeback_control *wbc)
-{
-       struct xfs_ioend        *ioend;
-       struct bio              *bio;
-
-       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
-       bio_set_dev(bio, bdev);
-       bio->bi_iter.bi_sector = sector;
-       bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
-       bio->bi_write_hint = inode->i_write_hint;
-       wbc_init_bio(wbc, bio);
-
-       ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
-       INIT_LIST_HEAD(&ioend->io_list);
-       ioend->io_fork = fork;
-       ioend->io_state = state;
-       ioend->io_inode = inode;
-       ioend->io_size = 0;
-       ioend->io_offset = offset;
-       ioend->io_append_trans = NULL;
-       ioend->io_bio = bio;
-       return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to do perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in xfs_destroy_ioend().
- */
-static struct bio *
-xfs_chain_bio(
-       struct bio              *prev)
-{
-       struct bio *new;
-
-       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
-       bio_copy_dev(new, prev);/* also copies over blkcg information */
-       new->bi_iter.bi_sector = bio_end_sector(prev);
-       new->bi_opf = prev->bi_opf;
-       new->bi_write_hint = prev->bi_write_hint;
-
-       bio_chain(prev, new);
-       bio_get(prev);          /* for xfs_destroy_ioend */
-       submit_bio(prev);
-       return new;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first, otherwise finish off the current ioend and start another.
- */
-STATIC void
-xfs_add_to_ioend(
-       struct inode            *inode,
-       xfs_off_t               offset,
-       struct page             *page,
-       struct iomap_page       *iop,
-       struct xfs_writepage_ctx *wpc,
-       struct writeback_control *wbc,
-       struct list_head        *iolist)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
-       unsigned                len = i_blocksize(inode);
-       unsigned                poff = offset & (PAGE_SIZE - 1);
-       bool                    merged, same_page = false;
-       sector_t                sector;
-
-       sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
-               ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
-
-       if (!wpc->ioend ||
-           wpc->fork != wpc->ioend->io_fork ||
-           wpc->imap.br_state != wpc->ioend->io_state ||
-           sector != bio_end_sector(wpc->ioend->io_bio) ||
-           offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
-               if (wpc->ioend)
-                       list_add(&wpc->ioend->io_list, iolist);
-               wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
-                               wpc->imap.br_state, offset, bdev, sector, wbc);
-       }
-
-       merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
-                       &same_page);
-
-       if (iop && !same_page)
-               atomic_inc(&iop->write_count);
-
-       if (!merged) {
-               if (bio_full(wpc->ioend->io_bio, len))
-                       wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
-               bio_add_page(wpc->ioend->io_bio, page, len, poff);
-       }
-
-       wpc->ioend->io_size += len;
-       wbc_account_cgroup_owner(wbc, page, len);
-}
-
-STATIC void
-xfs_vm_invalidatepage(
-       struct page             *page,
-       unsigned int            offset,
-       unsigned int            length)
-{
-       trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
-       iomap_invalidatepage(page, offset, length);
+       if (xfs_ioend_needs_workqueue(ioend))
+               ioend->io_bio->bi_end_io = xfs_end_bio;
+       return status;
  }
  
  /*
@@ -820,8 +551,8 @@ xfs_vm_invalidatepage(
   * transaction as there is no space left for block reservation (typically why we
   * see a ENOSPC in writeback).
   */
-STATIC void
-xfs_aops_discard_page(
+static void
+xfs_discard_page(
         struct page             *page)
  {
         struct inode            *inode = page->mapping->host;
@@ -843,246 +574,14 @@ xfs_aops_discard_page(
         if (error && !XFS_FORCED_SHUTDOWN(mp))
                 xfs_alert(mp, "page discard unable to remove delalloc mapping.");
  out_invalidate:
-       xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
+       iomap_invalidatepage(page, 0, PAGE_SIZE);
  }
  
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * forward progress guarantees we need to provide. The current ioend we are
- * adding blocks to is cached on the writepage context, and if the new block
- * does not append to the cached ioend it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected.  While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-xfs_writepage_map(
-       struct xfs_writepage_ctx *wpc,
-       struct writeback_control *wbc,
-       struct inode            *inode,
-       struct page             *page,
-       uint64_t                end_offset)
-{
-       LIST_HEAD(submit_list);
-       struct iomap_page       *iop = to_iomap_page(page);
-       unsigned                len = i_blocksize(inode);
-       struct xfs_ioend        *ioend, *next;
-       uint64_t                file_offset;    /* file offset of page */
-       int                     error = 0, count = 0, i;
-
-       ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
-       ASSERT(!iop || atomic_read(&iop->write_count) == 0);
-
-       /*
-        * Walk through the page to find areas to write back. If we run off the
-        * end of the current map or find the current map invalid, grab a new
-        * one.
-        */
-       for (i = 0, file_offset = page_offset(page);
-            i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
-            i++, file_offset += len) {
-               if (iop && !test_bit(i, iop->uptodate))
-                       continue;
-
-               error = xfs_map_blocks(wpc, inode, file_offset);
-               if (error)
-                       break;
-               if (wpc->imap.br_startblock == HOLESTARTBLOCK)
-                       continue;
-               xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
-                                &submit_list);
-               count++;
-       }
-
-       ASSERT(wpc->ioend || list_empty(&submit_list));
-       ASSERT(PageLocked(page));
-       ASSERT(!PageWriteback(page));
-
-       /*
-        * On error, we have to fail the ioend here because we may have set
-        * pages under writeback, we have to make sure we run IO completion to
-        * mark the error state of the IO appropriately, so we can't cancel the
-        * ioend directly here.  That means we have to mark this page as under
-        * writeback if we included any blocks from it in the ioend chain so
-        * that completion treats it correctly.
-        *
-        * If we didn't include the page in the ioend, the on error we can
-        * simply discard and unlock it as there are no other users of the page
-        * now.  The caller will still need to trigger submission of outstanding
-        * ioends on the writepage context so they are treated correctly on
-        * error.
-        */
-       if (unlikely(error)) {
-               if (!count) {
-                       xfs_aops_discard_page(page);
-                       ClearPageUptodate(page);
-                       unlock_page(page);
-                       goto done;
-               }
-
-               /*
-                * If the page was not fully cleaned, we need to ensure that the
-                * higher layers come back to it correctly.  That means we need
-                * to keep the page dirty, and for WB_SYNC_ALL writeback we need
-                * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
-                * so another attempt to write this page in this writeback sweep
-                * will be made.
-                */
-               set_page_writeback_keepwrite(page);
-       } else {
-               clear_page_dirty_for_io(page);
-               set_page_writeback(page);
-       }
-
-       unlock_page(page);
-
-       /*
-        * Preserve the original error if there was one, otherwise catch
-        * submission errors here and propagate into subsequent ioend
-        * submissions.
-        */
-       list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-               int error2;
-
-               list_del_init(&ioend->io_list);
-               error2 = xfs_submit_ioend(wbc, ioend, error);
-               if (error2 && !error)
-                       error = error2;
-       }
-
-       /*
-        * We can end up here with no error and nothing to write only if we race
-        * with a partial page truncate on a sub-page block sized filesystem.
-        */
-       if (!count)
-               end_page_writeback(page);
-done:
-       mapping_set_error(page->mapping, error);
-       return error;
-}
-
-/*
- * Write out a dirty page.
- *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
- * regular allocated space.
- */
-STATIC int
-xfs_do_writepage(
-       struct page             *page,
-       struct writeback_control *wbc,
-       void                    *data)
-{
-       struct xfs_writepage_ctx *wpc = data;
-       struct inode            *inode = page->mapping->host;
-       loff_t                  offset;
-       uint64_t              end_offset;
-       pgoff_t                 end_index;
-
-       trace_xfs_writepage(inode, page, 0, 0);
-
-       /*
-        * Refuse to write the page out if we are called from reclaim context.
-        *
-        * This avoids stack overflows when called from deeply used stacks in
-        * random callers for direct reclaim or memcg reclaim.  We explicitly
-        * allow reclaim from kswapd as the stack usage there is relatively low.
-        *
-        * This should never happen except in the case of a VM regression so
-        * warn about it.
-        */
-       if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-                       PF_MEMALLOC))
-               goto redirty;
-
-       /*
-        * Given that we do not allow direct reclaim to call us, we should
-        * never be called while in a filesystem transaction.
-        */
-       if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
-               goto redirty;
-
-       /*
-        * Is this page beyond the end of the file?
-        *
-        * The page index is less than the end_index, adjust the end_offset
-        * to the highest offset that this page should represent.
-        * -----------------------------------------------------
-        * |                    file mapping           | <EOF> |
-        * -----------------------------------------------------
-        * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
-        * ^--------------------------------^----------|--------
-        * |     desired writeback range    |      see else    |
-        * ---------------------------------^------------------|
-        */
-       offset = i_size_read(inode);
-       end_index = offset >> PAGE_SHIFT;
-       if (page->index < end_index)
-               end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
-       else {
-               /*
-                * Check whether the page to write out is beyond or straddles
-                * i_size or not.
-                * -------------------------------------------------------
-                * |            file mapping                    | <EOF>  |
-                * -------------------------------------------------------
-                * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
-                * ^--------------------------------^-----------|---------
-                * |                                |      Straddles     |
-                * ---------------------------------^-----------|--------|
-                */
-               unsigned offset_into_page = offset & (PAGE_SIZE - 1);
-
-               /*
-                * Skip the page if it is fully outside i_size, e.g. due to a
-                * truncate operation that is in progress. We must redirty the
-                * page so that reclaim stops reclaiming it. Otherwise
-                * xfs_vm_releasepage() is called on it and gets confused.
-                *
-                * Note that the end_index is unsigned long, it would overflow
-                * if the given offset is greater than 16TB on 32-bit system
-                * and if we do check the page is fully outside i_size or not
-                * via "if (page->index >= end_index + 1)" as "end_index + 1"
-                * will be evaluated to 0.  Hence this page will be redirtied
-                * and be written out repeatedly which would result in an
-                * infinite loop, the user program that perform this operation
-                * will hang.  Instead, we can verify this situation by checking
-                * if the page to write is totally beyond the i_size or if it's
-                * offset is just equal to the EOF.
-                */
-               if (page->index > end_index ||
-                   (page->index == end_index && offset_into_page == 0))
-                       goto redirty;
-
-               /*
-                * The page straddles i_size.  It must be zeroed out on each
-                * and every writepage invocation because it may be mmapped.
-                * "A file is mapped in multiples of the page size.  For a file
-                * that is not a multiple of the page size, the remaining
-                * memory is zeroed when mapped, and writes to that region are
-                * not written out to the file."
-                */
-               zero_user_segment(page, offset_into_page, PAGE_SIZE);
-
-               /* Adjust the end_offset to the end of file */
-               end_offset = offset;
-       }
-
-       return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
-
-redirty:
-       redirty_page_for_writepage(wbc, page);
-       unlock_page(page);
-       return 0;
-}
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+       .map_blocks             = xfs_map_blocks,
+       .prepare_ioend          = xfs_prepare_ioend,
+       .discard_page           = xfs_discard_page,
+};
  
  STATIC int
  xfs_vm_writepage(
@@ -1090,12 +589,8 @@ xfs_vm_writepage(
         struct writeback_control *wbc)
  {
         struct xfs_writepage_ctx wpc = { };
-       int                     ret;
  
-       ret = xfs_do_writepage(page, wbc, &wpc);
-       if (wpc.ioend)
-               ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
-       return ret;
+       return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
  }
  
  STATIC int
@@ -1104,13 +599,9 @@ xfs_vm_writepages(
         struct writeback_control *wbc)
  {
         struct xfs_writepage_ctx wpc = { };
-       int                     ret;
  
         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-       ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
-       if (wpc.ioend)
-               ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
-       return ret;
+       return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
  }
  
  STATIC int
@@ -1123,15 +614,6 @@ xfs_dax_writepages(
                         xfs_find_bdev_for_inode(mapping->host), wbc);
  }
  
-STATIC int
-xfs_vm_releasepage(
-       struct page             *page,
-       gfp_t                   gfp_mask)
-{
-       trace_xfs_releasepage(page->mapping->host, page, 0, 0);
-       return iomap_releasepage(page, gfp_mask);
-}
-
  STATIC sector_t
  xfs_vm_bmap(
         struct address_space    *mapping,
@@ -1160,7 +642,6 @@ xfs_vm_readpage(
         struct file             *unused,
         struct page             *page)
  {
-       trace_xfs_vm_readpage(page->mapping->host, 1);
         return iomap_readpage(page, &xfs_iomap_ops);
  }
  
@@ -1171,7 +652,6 @@ xfs_vm_readpages(
         struct list_head        *pages,
         unsigned                nr_pages)
  {
-       trace_xfs_vm_readpages(mapping->host, nr_pages);
         return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
  }
  
@@ -1191,8 +671,8 @@ const struct address_space_operations xfs_address_space_operations = {
         .writepage              = xfs_vm_writepage,
         .writepages             = xfs_vm_writepages,
         .set_page_dirty         = iomap_set_page_dirty,
-       .releasepage            = xfs_vm_releasepage,
-       .invalidatepage         = xfs_vm_invalidatepage,
+       .releasepage            = iomap_releasepage,
+       .invalidatepage         = iomap_invalidatepage,
         .bmap                   = xfs_vm_bmap,
         .direct_IO              = noop_direct_IO,
         .migratepage            = iomap_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h

index 45a1ea240cbbb0a0b3ded8fcd982169b11789c72..687b11f34fa2ac6d64b3913a9153c9d4924f39ac 100644 (file)
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -6,23 +6,6 @@
  #ifndef __XFS_AOPS_H__
  #define __XFS_AOPS_H__
  
-extern struct bio_set xfs_ioend_bioset;
-
-/*
- * Structure for buffered I/O completions.
- */
-struct xfs_ioend {
-       struct list_head        io_list;        /* next ioend in chain */
-       int                     io_fork;        /* inode fork written back */
-       xfs_exntst_t            io_state;       /* extent state */
-       struct inode            *io_inode;      /* file being written to */
-       size_t                  io_size;        /* size of the extent */
-       xfs_off_t               io_offset;      /* offset in the file */
-       struct xfs_trans        *io_append_trans;/* xact. for size update */
-       struct bio              *io_bio;        /* bio being built */
-       struct bio              io_inline_bio;  /* MUST BE LAST! */
-};
-
  extern const struct address_space_operations xfs_address_space_operations;
  extern const struct address_space_operations xfs_dax_aops;
  
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 1ffb179f35d23ab5ae3bb12d98106430f6381034..c0620135a279d8533f7c088066dbc8997c2e1b3b 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -188,7 +188,7 @@ xfs_file_dio_aio_read(
         file_accessed(iocb->ki_filp);
  
         xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
+       ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb));
         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
  
         return ret;
@@ -547,15 +547,12 @@ xfs_file_dio_aio_write(
         }
  
         trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-       ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
-
         /*
-        * If unaligned, this is the only IO in-flight. If it has not yet
-        * completed, wait on it before we release the iolock to prevent
-        * subsequent overlapping IO.
+        * If unaligned, this is the only IO in-flight. Wait on it before we
+        * release the iolock to prevent subsequent overlapping IO.
          */
-       if (ret == -EIOCBQUEUED && unaligned_io)
-               inode_dio_wait(inode);
+       ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops,
+                          is_sync_kiocb(iocb) || unaligned_io);
  out:
         xfs_iunlock(ip, iolock);
  
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c

index f780e223b11852cca45aa6b1093facc4734a9b79..95719e161286c74a4552c60bc7a1353eb2115173 100644 (file)
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -54,7 +54,7 @@ xfs_bmbt_to_iomap(
         struct xfs_inode        *ip,
         struct iomap            *iomap,
         struct xfs_bmbt_irec    *imap,
-       bool                    shared)
+       u16                     flags)
  {
         struct xfs_mount        *mp = ip->i_mount;
  
@@ -79,12 +79,11 @@ xfs_bmbt_to_iomap(
         iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
         iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
         iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+       iomap->flags = flags;
  
         if (xfs_ipincount(ip) &&
             (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                 iomap->flags |= IOMAP_F_DIRTY;
-       if (shared)
-               iomap->flags |= IOMAP_F_SHARED;
         return 0;
  }
  
@@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay(
         struct xfs_iext_cursor  icur, ccur;
         xfs_fsblock_t           prealloc_blocks = 0;
         bool                    eof = false, cow_eof = false, shared = false;
+       u16                     iomap_flags = 0;
         int                     whichfork = XFS_DATA_FORK;
         int                     error = 0;
  
@@ -707,22 +707,28 @@ xfs_file_iomap_begin_delay(
          * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
          * them out if the write happens to fail.
          */
-       iomap->flags |= IOMAP_F_NEW;
-       trace_xfs_iomap_alloc(ip, offset, count, whichfork,
-                       whichfork == XFS_DATA_FORK ? &imap : &cmap);
+       if (whichfork == XFS_DATA_FORK) {
+               iomap_flags |= IOMAP_F_NEW;
+               trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
+       } else {
+               trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
+       }
  done:
         if (whichfork == XFS_COW_FORK) {
                 if (imap.br_startoff > offset_fsb) {
                         xfs_trim_extent(&cmap, offset_fsb,
                                         imap.br_startoff - offset_fsb);
-                       error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+                       error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
+                                       IOMAP_F_SHARED);
                         goto out_unlock;
                 }
                 /* ensure we only report blocks we have a reservation for */
                 xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
                 shared = true;
         }
-       error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+       if (shared)
+               iomap_flags |= IOMAP_F_SHARED;
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
  out_unlock:
         xfs_iunlock(ip, XFS_ILOCK_EXCL);
         return error;
@@ -922,7 +928,8 @@ xfs_file_iomap_begin(
         loff_t                  offset,
         loff_t                  length,
         unsigned                flags,
-       struct iomap            *iomap)
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -930,6 +937,7 @@ xfs_file_iomap_begin(
         xfs_fileoff_t           offset_fsb, end_fsb;
         int                     nimaps = 1, error = 0;
         bool                    shared = false;
+       u16                     iomap_flags = 0;
         unsigned                lockmode;
  
         if (XFS_FORCED_SHUTDOWN(mp))
@@ -1045,11 +1053,20 @@ xfs_file_iomap_begin(
         if (error)
                 return error;
  
-       iomap->flags |= IOMAP_F_NEW;
+       iomap_flags |= IOMAP_F_NEW;
         trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
  
  out_finish:
-       return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
+       /*
+        * Writes that span EOF might trigger an IO size update on completion,
+        * so consider them to be dirty for the purposes of O_DSYNC even if
+        * no other metadata changes are pending or have been made here.
+        */
+       if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
+               iomap_flags |= IOMAP_F_DIRTY;
+       if (shared)
+               iomap_flags |= IOMAP_F_SHARED;
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
  
  out_found:
         ASSERT(nimaps);
@@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin(
         loff_t                  offset,
         loff_t                  length,
         unsigned                flags,
-       struct iomap            *iomap)
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin(
                 if (data_fsb < cow_fsb + cmap.br_blockcount)
                         end_fsb = min(end_fsb, data_fsb);
                 xfs_trim_extent(&cmap, offset_fsb, end_fsb);
-               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
+               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
                 /*
                  * This is a COW extent, so we must probe the page cache
                  * because there could be dirty page cache being backed
@@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin(
         imap.br_state = XFS_EXT_NORM;
  done:
         xfs_trim_extent(&imap, offset_fsb, end_fsb);
-       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
  out_unlock:
         xfs_iunlock(ip, lockmode);
         return error;
@@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin(
         loff_t                  offset,
         loff_t                  length,
         unsigned                flags,
-       struct iomap            *iomap)
+       struct iomap            *iomap,
+       struct iomap            *srcmap)
  {
         struct xfs_inode        *ip = XFS_I(inode);
         struct xfs_mount        *mp = ip->i_mount;
@@ -1261,7 +1280,7 @@ xfs_xattr_iomap_begin(
         if (error)
                 return error;
         ASSERT(nimaps);
-       return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
  }
  
  const struct iomap_ops xfs_xattr_iomap_ops = {
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h

index 5c2f6aa6d78ffa810bdaeae1ed06cb85f465d1b6..71d0ae460c44021a791537800331f3b4751ec166 100644 (file)
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -16,7 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
  int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
  
  int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
-               struct xfs_bmbt_irec *, bool shared);
+               struct xfs_bmbt_irec *, u16);
  xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
  
  static inline xfs_filblks_t
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c

index a339bd5fa260471428e27809808899c39ff949e0..9c96493be9e0e09ef9527b0e65c6a652b6b6242d 100644 (file)
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -178,7 +178,7 @@ xfs_fs_map_blocks(
         }
         xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  
-       error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
         *device_generation = mp->m_generation;
         return error;
  out_unlock:
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c

index 0f08153b4994190fcd21d8b4fd9f6914ae908cb4..a9634110c78306a6dee9a67cb0b0c70048489c12 100644 (file)
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents(
                         flen = XFS_FSB_TO_B(mp, rlen);
                         if (fpos + flen > isize)
                                 flen = isize - fpos;
-                       error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+                       error = iomap_file_unshare(VFS_I(ip), fpos, flen,
                                         &xfs_iomap_ops);
                         xfs_ilock(ip, XFS_ILOCK_EXCL);
                         if (error)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c

index 8d1df9f8be071db3be45ac169e9382f740ffed7e..0a8cf6b87a214469f1c80c7a298c39607abe291f 100644 (file)
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -40,7 +40,6 @@
  #include <linux/parser.h>
  
  static const struct super_operations xfs_super_operations;
-struct bio_set xfs_ioend_bioset;
  
  static struct kset *xfs_kset;          /* top-level xfs sysfs dir */
  #ifdef DEBUG
@@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs");
  STATIC int __init
  xfs_init_zones(void)
  {
-       if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
-                       offsetof(struct xfs_ioend, io_inline_bio),
-                       BIOSET_NEED_BVECS))
-               goto out;
-
         xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
                                                 "xfs_log_ticket");
         if (!xfs_log_ticket_zone)
-               goto out_free_ioend_bioset;
+               goto out;
  
         xfs_bmap_free_item_zone = kmem_zone_init(
                         sizeof(struct xfs_extent_free_item),
@@ -1996,8 +1990,6 @@ xfs_init_zones(void)
         kmem_zone_destroy(xfs_bmap_free_item_zone);
   out_destroy_log_ticket_zone:
         kmem_zone_destroy(xfs_log_ticket_zone);
- out_free_ioend_bioset:
-       bioset_exit(&xfs_ioend_bioset);
   out:
         return -ENOMEM;
  }
@@ -2028,7 +2020,6 @@ xfs_destroy_zones(void)
         kmem_zone_destroy(xfs_btree_cur_zone);
         kmem_zone_destroy(xfs_bmap_free_item_zone);
         kmem_zone_destroy(xfs_log_ticket_zone);
-       bioset_exit(&xfs_ioend_bioset);
  }
  
  STATIC int __init
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h

index eaae275ed43084dc6ba14715cd854871341698b4..cbb23d7a3554ab568710d91d4002af05b9cd9f43 100644 (file)
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write);
  DEFINE_RW_EVENT(xfs_file_direct_write);
  DEFINE_RW_EVENT(xfs_file_dax_write);
  
-DECLARE_EVENT_CLASS(xfs_page_class,
-       TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
-                unsigned int len),
-       TP_ARGS(inode, page, off, len),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(pgoff_t, pgoff)
-               __field(loff_t, size)
-               __field(unsigned long, offset)
-               __field(unsigned int, length)
-       ),
-       TP_fast_assign(
-               __entry->dev = inode->i_sb->s_dev;
-               __entry->ino = XFS_I(inode)->i_ino;
-               __entry->pgoff = page_offset(page);
-               __entry->size = i_size_read(inode);
-               __entry->offset = off;
-               __entry->length = len;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-                 "length %x",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->pgoff,
-                 __entry->size,
-                 __entry->offset,
-                 __entry->length)
-)
-
-#define DEFINE_PAGE_EVENT(name)                \
-DEFINE_EVENT(xfs_page_class, name,     \
-       TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
-                unsigned int len),     \
-       TP_ARGS(inode, page, off, len))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_readpage_class,
-       TP_PROTO(struct inode *inode, int nr_pages),
-       TP_ARGS(inode, nr_pages),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(int, nr_pages)
-       ),
-       TP_fast_assign(
-               __entry->dev = inode->i_sb->s_dev;
-               __entry->ino = inode->i_ino;
-               __entry->nr_pages = nr_pages;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->nr_pages)
-)
-
-#define DEFINE_READPAGE_EVENT(name)            \
-DEFINE_EVENT(xfs_readpage_class, name, \
-       TP_PROTO(struct inode *inode, int nr_pages), \
-       TP_ARGS(inode, nr_pages))
-DEFINE_READPAGE_EVENT(xfs_vm_readpage);
-DEFINE_READPAGE_EVENT(xfs_vm_readpages);
-
  DECLARE_EVENT_CLASS(xfs_imap_class,
         TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
                  int whichfork, struct xfs_bmbt_irec *irec),
diff --git a/include/linux/iomap.h b/include/linux/iomap.h

index 7aa5d61179361d3bbdd3e949645568160d25ae15..8b09463dae0dba2a4a60ce2372863ba0e8fd16b9 100644 (file)
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -4,6 +4,7 @@
  
  #include <linux/atomic.h>
  #include <linux/bitmap.h>
+#include <linux/blk_types.h>
  #include <linux/mm.h>
  #include <linux/types.h>
  #include <linux/mm_types.h>
@@ -12,6 +13,7 @@
  struct address_space;
  struct fiemap_extent_info;
  struct inode;
+struct iomap_writepage_ctx;
  struct iov_iter;
  struct kiocb;
  struct page;
@@ -21,28 +23,45 @@ struct vm_fault;
  /*
   * Types of block ranges for iomap mappings:
   */
-#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
-#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
-#define IOMAP_MAPPED   0x03    /* blocks allocated at @addr */
-#define IOMAP_UNWRITTEN        0x04    /* blocks allocated at @addr in unwritten state */
-#define IOMAP_INLINE   0x05    /* data inline in the inode */
+#define IOMAP_HOLE     0       /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 1       /* delayed allocation blocks */
+#define IOMAP_MAPPED   2       /* blocks allocated at @addr */
+#define IOMAP_UNWRITTEN        3       /* blocks allocated at @addr in unwritten state */
+#define IOMAP_INLINE   4       /* data inline in the inode */
  
  /*
- * Flags for all iomap mappings:
+ * Flags reported by the file system from iomap_begin:
+ *
+ * IOMAP_F_NEW indicates that the blocks have been newly allocated and need
+ * zeroing for areas that no data is copied to.
   *
   * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
   * written data and requires fdatasync to commit them to persistent storage.
+ * This needs to take into account metadata changes that *may* be made at IO
+ * completion, such as file size updates from direct IO.
+ *
+ * IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
+ * unshared as part of a write.
+ *
+ * IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
+ * mappings.
+ *
+ * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
+ * buffer heads for this mapping.
   */
-#define IOMAP_F_NEW            0x01    /* blocks have been newly allocated */
-#define IOMAP_F_DIRTY          0x02    /* uncommitted metadata */
-#define IOMAP_F_BUFFER_HEAD    0x04    /* file system requires buffer heads */
-#define IOMAP_F_SIZE_CHANGED   0x08    /* file size has changed */
+#define IOMAP_F_NEW            0x01
+#define IOMAP_F_DIRTY          0x02
+#define IOMAP_F_SHARED         0x04
+#define IOMAP_F_MERGED         0x08
+#define IOMAP_F_BUFFER_HEAD    0x10
  
  /*
- * Flags that only need to be reported for IOMAP_REPORT requests:
+ * Flags set by the core iomap code during operations:
+ *
+ * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
+ * has changed as the result of this write operation.
   */
-#define IOMAP_F_MERGED         0x10    /* contains multiple blocks/extents */
-#define IOMAP_F_SHARED         0x20    /* block shared with another file */
+#define IOMAP_F_SIZE_CHANGED   0x100
  
  /*
   * Flags from 0x1000 up are for file system specific usage:
@@ -110,7 +129,8 @@ struct iomap_ops {
          * The actual length is returned in iomap->length.
          */
         int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
-                       unsigned flags, struct iomap *iomap);
+                       unsigned flags, struct iomap *iomap,
+                       struct iomap *srcmap);
  
         /*
          * Commit and/or unreserve space previous allocated using iomap_begin.
@@ -126,29 +146,12 @@ struct iomap_ops {
   * Main iomap iterator function.
   */
  typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
-               void *data, struct iomap *iomap);
+               void *data, struct iomap *iomap, struct iomap *srcmap);
  
  loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
                 unsigned flags, const struct iomap_ops *ops, void *data,
                 iomap_actor_t actor);
  
-/*
- * Structure allocate for each page when block size < PAGE_SIZE to track
- * sub-page uptodate status and I/O completions.
- */
-struct iomap_page {
-       atomic_t                read_count;
-       atomic_t                write_count;
-       DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
-};
-
-static inline struct iomap_page *to_iomap_page(struct page *page)
-{
-       if (page_has_private(page))
-               return (struct iomap_page *)page_private(page);
-       return NULL;
-}
-
  ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
                 const struct iomap_ops *ops);
  int iomap_readpage(struct page *page, const struct iomap_ops *ops);
@@ -166,7 +169,7 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
  #else
  #define iomap_migrate_page NULL
  #endif
-int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
+int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                 const struct iomap_ops *ops);
  int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
                 bool *did_zero, const struct iomap_ops *ops);
@@ -183,6 +186,63 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
  sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
                 const struct iomap_ops *ops);
  
+/*
+ * Structure for writeback I/O completions.
+ */
+struct iomap_ioend {
+       struct list_head        io_list;        /* next ioend in chain */
+       u16                     io_type;
+       u16                     io_flags;       /* IOMAP_F_* */
+       struct inode            *io_inode;      /* file being written to */
+       size_t                  io_size;        /* size of the extent */
+       loff_t                  io_offset;      /* offset in the file */
+       void                    *io_private;    /* file system private data */
+       struct bio              *io_bio;        /* bio being built */
+       struct bio              io_inline_bio;  /* MUST BE LAST! */
+};
+
+struct iomap_writeback_ops {
+       /*
+        * Required, maps the blocks so that writeback can be performed on
+        * the range starting at offset.
+        */
+       int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
+                               loff_t offset);
+
+       /*
+        * Optional, allows the file system to perform actions just before
+        * submitting the bio and/or override the bio end_io handler for complex
+        * operations like copy on write extent manipulation or unwritten extent
+        * conversions.
+        */
+       int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+
+       /*
+        * Optional, allows the file system to discard state on a page where
+        * we failed to submit any I/O.
+        */
+       void (*discard_page)(struct page *page);
+};
+
+struct iomap_writepage_ctx {
+       struct iomap            iomap;
+       struct iomap_ioend      *ioend;
+       const struct iomap_writeback_ops *ops;
+};
+
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+               struct list_head *more_ioends,
+               void (*merge_private)(struct iomap_ioend *ioend,
+                               struct iomap_ioend *next));
+void iomap_sort_ioends(struct list_head *ioend_list);
+int iomap_writepage(struct page *page, struct writeback_control *wbc,
+               struct iomap_writepage_ctx *wpc,
+               const struct iomap_writeback_ops *ops);
+int iomap_writepages(struct address_space *mapping,
+               struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
+               const struct iomap_writeback_ops *ops);
+
  /*
   * Flags for direct I/O ->end_io:
   */
@@ -195,7 +255,8 @@ struct iomap_dio_ops {
  };
  
  ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
-               const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+               bool wait_for_completion);
  int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
  
  #ifdef CONFIG_SWAP
author	Theodore Ts'o <tytso@mit.edu>
	Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)
committer	Theodore Ts'o <tytso@mit.edu>
	Tue, 5 Nov 2019 16:31:32 +0000 (11:31 -0500)
fs/dax.c		patch \| blob \| history
fs/ext2/inode.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/gfs2/bmap.c		patch \| blob \| history
fs/gfs2/file.c		patch \| blob \| history
fs/iomap/Makefile		patch \| blob \| history
fs/iomap/apply.c		patch \| blob \| history
fs/iomap/buffered-io.c		patch \| blob \| history
fs/iomap/direct-io.c		patch \| blob \| history
fs/iomap/fiemap.c		patch \| blob \| history
fs/iomap/seek.c		patch \| blob \| history
fs/iomap/swapfile.c		patch \| blob \| history
fs/iomap/trace.c	[new file with mode: 0644]	patch \| blob
fs/iomap/trace.h	[new file with mode: 0644]	patch \| blob
fs/xfs/libxfs/xfs_bmap.c		patch \| blob \| history
fs/xfs/libxfs/xfs_bmap.h		patch \| blob \| history
fs/xfs/xfs_aops.c		patch \| blob \| history
fs/xfs/xfs_aops.h		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_iomap.c		patch \| blob \| history
fs/xfs/xfs_iomap.h		patch \| blob \| history
fs/xfs/xfs_pnfs.c		patch \| blob \| history
fs/xfs/xfs_reflink.c		patch \| blob \| history
fs/xfs/xfs_super.c		patch \| blob \| history
fs/xfs/xfs_trace.h		patch \| blob \| history
include/linux/iomap.h		patch \| blob \| history