]> asedeno.scripts.mit.edu Git - linux.git/blobdiff - fs/ext4/file.c
Merge tag 'pwm/for-5.5-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/thierry...
[linux.git] / fs / ext4 / file.c
index 8d2bbcc2d8133d6e43544e08fc4bd16f05a449fe..6a7293a5cda2d6f95fdb7dae8de3865bfc5d4f22 100644 (file)
 #include <linux/pagevec.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
+#include <linux/backing-dev.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "truncate.h"
+
+static bool ext4_dio_supported(struct inode *inode)
+{
+       if (IS_ENABLED(CONFIG_FS_ENCRYPTION) && IS_ENCRYPTED(inode))
+               return false;
+       if (fsverity_active(inode))
+               return false;
+       if (ext4_should_journal_data(inode))
+               return false;
+       if (ext4_has_inline_data(inode))
+               return false;
+       return true;
+}
+
/*
 * Direct I/O read path.
 *
 * Holds the inode lock shared across the transfer. If the inode's state
 * rules out direct I/O (see ext4_dio_supported()), the request is retried
 * through the buffered path with IOCB_DIRECT cleared.
 */
static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	/* Honour IOCB_NOWAIT: never sleep waiting for the inode lock. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	if (!ext4_dio_supported(inode)) {
		inode_unlock_shared(inode);
		/*
		 * Fallback to buffered I/O if the operation being performed on
		 * the inode is not supported by direct I/O. The IOCB_DIRECT
		 * flag needs to be cleared here in order to ensure that the
		 * direct I/O path within generic_file_read_iter() is not
		 * taken.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
		return generic_file_read_iter(iocb, to);
	}

	/* No custom completion ops are needed for reads, hence NULL dops. */
	ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL,
			   is_sync_kiocb(iocb));
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}
 
 #ifdef CONFIG_FS_DAX
 static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -64,16 +112,21 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
/*
 * Top-level ->read_iter(): reject I/O after a forced shutdown, then route
 * the request to the DAX, direct I/O or buffered read path. The check
 * order matters — DAX takes precedence over IOCB_DIRECT.
 */
static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	if (!iov_iter_count(to))
		return 0; /* skip atime */

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_read_iter(iocb, to);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_read_iter(iocb, to);

	return generic_file_read_iter(iocb, to);
}
 
@@ -103,13 +156,6 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
 
-static void ext4_unwritten_wait(struct inode *inode)
-{
-       wait_queue_head_t *wq = ext4_ioend_wq(inode);
-
-       wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
-}
-
 /*
  * This tests whether the IO in question is block-aligned or not.
  * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
@@ -162,13 +208,13 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;
 
+       if (unlikely(IS_IMMUTABLE(inode)))
+               return -EPERM;
+
        ret = generic_write_checks(iocb, from);
        if (ret <= 0)
                return ret;
 
-       if (unlikely(IS_IMMUTABLE(inode)))
-               return -EPERM;
-
        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
@@ -180,56 +226,266 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
                        return -EFBIG;
                iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos);
        }
+
+       ret = file_modified(iocb->ki_filp);
+       if (ret)
+               return ret;
+
        return iov_iter_count(from);
 }
 
-#ifdef CONFIG_FS_DAX
-static ssize_t
-ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
/*
 * Buffered write path.
 *
 * IOCB_NOWAIT is not supported for buffered writes, so such requests are
 * rejected with -EOPNOTSUPP before any locking. On success, ki_pos is
 * advanced here (generic_perform_write() itself does not move it) before
 * generic_write_sync() handles any O_SYNC/O_DSYNC semantics.
 */
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
					struct iov_iter *from)
{
	ssize_t ret;
	struct inode *inode = file_inode(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

	inode_lock(inode);
	ret = ext4_write_checks(iocb, from);
	if (ret <= 0)
		goto out;

	/* Point writeback throttling at this inode's backing device. */
	current->backing_dev_info = inode_to_bdi(inode);
	ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
	current->backing_dev_info = NULL;

out:
	inode_unlock(inode);
	if (likely(ret > 0)) {
		iocb->ki_pos += ret;
		ret = generic_write_sync(iocb, ret);
	}

	return ret;
}
-#endif
 
-static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
/*
 * Finish off a write that may have extended the inode past i_disksize.
 *
 * @offset:  file position the write started at
 * @written: bytes actually written, or a negative errno
 * @count:   bytes that were requested
 *
 * Updates the on-disk inode size, removes the inode from the orphan list
 * added by the caller prior to the I/O, and truncates any blocks that were
 * allocated beyond what was actually written. Returns @written (possibly
 * replaced by an errno from journal start failure).
 */
static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
					   ssize_t written, size_t count)
{
	handle_t *handle;
	bool truncate = false;
	u8 blkbits = inode->i_blkbits;
	ext4_lblk_t written_blk, end_blk;

	/*
	 * Note that EXT4_I(inode)->i_disksize can get extended up to
	 * inode->i_size while the I/O was running due to writeback of delalloc
	 * blocks. But, the code in ext4_iomap_alloc() is careful to use
	 * zeroed/unwritten extents if this is possible; thus we won't leave
	 * uninitialized blocks in a file even if we didn't succeed in writing
	 * as much as we intended.
	 */
	WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
	if (offset + count <= EXT4_I(inode)->i_disksize) {
		/*
		 * We need to ensure that the inode is removed from the orphan
		 * list if it has been added prematurely, due to writeback of
		 * delalloc blocks.
		 */
		if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) {
			handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

			if (IS_ERR(handle)) {
				ext4_orphan_del(NULL, inode);
				return PTR_ERR(handle);
			}

			ext4_orphan_del(handle, inode);
			ext4_journal_stop(handle);
		}

		return written;
	}

	if (written < 0)
		goto truncate;

	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
	if (IS_ERR(handle)) {
		written = PTR_ERR(handle);
		goto truncate;
	}

	if (ext4_update_inode_size(inode, offset + written))
		ext4_mark_inode_dirty(handle, inode);

	/*
	 * We may need to truncate allocated but not written blocks beyond EOF.
	 */
	written_blk = ALIGN(offset + written, 1 << blkbits);
	end_blk = ALIGN(offset + count, 1 << blkbits);
	if (written_blk < end_blk && ext4_can_truncate(inode))
		truncate = true;

	/*
	 * Remove the inode from the orphan list if it has been extended and
	 * everything went OK.
	 */
	if (!truncate && inode->i_nlink)
		ext4_orphan_del(handle, inode);
	ext4_journal_stop(handle);

	/*
	 * NOTE: the "truncate:" label below is deliberately inside the if,
	 * so the two error gotos above share the cleanup with the
	 * short-write case selected via the truncate flag.
	 */
	if (truncate) {
truncate:
		ext4_truncate_failed_write(inode);
		/*
		 * If the truncate operation failed early, then the inode may
		 * still be on the orphan list. In that case, we need to try
		 * remove the inode from the in-memory linked list.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	return written;
}
+
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
+                                int error, unsigned int flags)
+{
+       loff_t offset = iocb->ki_pos;
        struct inode *inode = file_inode(iocb->ki_filp);
-       int o_direct = iocb->ki_flags & IOCB_DIRECT;
-       int unaligned_aio = 0;
-       int overwrite = 0;
+
+       if (error)
+               return error;
+
+       if (size && flags & IOMAP_DIO_UNWRITTEN)
+               return ext4_convert_unwritten_extents(NULL, inode,
+                                                     offset, size);
+
+       return 0;
+}
+
/* Completion callbacks handed to iomap_dio_rw() on the write path. */
static const struct iomap_dio_ops ext4_dio_write_ops = {
	.end_io = ext4_dio_write_end_io,
};
+
/*
 * Direct I/O write path.
 *
 * Takes the inode lock exclusively, then downgrades it to shared for pure
 * overwrites of initialized blocks when dioread_nolock permits. Writes
 * that extend past i_disksize put the inode on the orphan list first so a
 * crash mid-I/O cannot leave stale blocks exposed, and are finished via
 * ext4_handle_inode_extension(). A short or refused direct write is
 * completed through the buffered path, followed by writeback+invalidate
 * of the affected page-cache range to preserve direct I/O semantics.
 */
static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;
	size_t count;
	loff_t offset;
	handle_t *handle;
	struct inode *inode = file_inode(iocb->ki_filp);
	bool extend = false, overwrite = false, unaligned_aio = false;

	/* Honour IOCB_NOWAIT: never sleep waiting for the inode lock. */
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	if (!ext4_dio_supported(inode)) {
		inode_unlock(inode);
		/*
		 * Fallback to buffered I/O if the inode does not support
		 * direct I/O.
		 */
		return ext4_buffered_write_iter(iocb, from);
	}

	ret = ext4_write_checks(iocb, from);
	if (ret <= 0) {
		inode_unlock(inode);
		return ret;
	}

	/*
	 * Unaligned asynchronous direct I/O must be serialized among each
	 * other as the zeroing of partial blocks of two competing unaligned
	 * asynchronous direct I/O writes can result in data corruption.
	 */
	offset = iocb->ki_pos;
	count = iov_iter_count(from);
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
	    !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
		unaligned_aio = true;
		inode_dio_wait(inode);
	}

	/*
	 * Determine whether the I/O will overwrite allocated and initialized
	 * blocks. If so, check to see whether it is possible to take the
	 * dioread_nolock path.
	 */
	if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
	    ext4_should_dioread_nolock(inode)) {
		overwrite = true;
		/* Overwrites need no allocation; shared lock suffices. */
		downgrade_write(&inode->i_rwsem);
	}

	/*
	 * Extending write: orphan-list the inode before the I/O so blocks
	 * allocated past i_disksize get cleaned up after a crash.
	 */
	if (offset + count > EXT4_I(inode)->i_disksize) {
		handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		ret = ext4_orphan_add(handle, inode);
		if (ret) {
			ext4_journal_stop(handle);
			goto out;
		}

		extend = true;
		ext4_journal_stop(handle);
	}

	/*
	 * Force synchronous completion for unaligned or extending I/O:
	 * neither may overlap with other in-flight I/O on this inode.
	 */
	ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, &ext4_dio_write_ops,
			   is_sync_kiocb(iocb) || unaligned_aio || extend);

	if (extend)
		ret = ext4_handle_inode_extension(inode, offset, ret, count);

out:
	/* Release whichever mode the lock was left in after the downgrade. */
	if (overwrite)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);

	if (ret >= 0 && iov_iter_count(from)) {
		ssize_t err;
		loff_t endbyte;

		offset = iocb->ki_pos;
		err = ext4_buffered_write_iter(iocb, from);
		if (err < 0)
			return err;

		/*
		 * We need to ensure that the pages within the page cache for
		 * the range covered by this I/O are written to disk and
		 * invalidated. This is in attempt to preserve the expected
		 * direct I/O semantics in the case we fallback to buffered I/O
		 * to complete off the I/O request.
		 */
		ret += err;
		endbyte = offset + err - 1;
		err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping,
						   offset, endbyte);
		if (!err)
			invalidate_mapping_pages(iocb->ki_filp->f_mapping,
						 offset >> PAGE_SHIFT,
						 endbyte >> PAGE_SHIFT);
	}

	return ret;
}
 
 #ifdef CONFIG_FS_DAX
-       if (IS_DAX(inode))
-               return ext4_dax_write_iter(iocb, from);
-#endif
+static ssize_t
+ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       ssize_t ret;
+       size_t count;
+       loff_t offset;
+       handle_t *handle;
+       bool extend = false;
+       struct inode *inode = file_inode(iocb->ki_filp);
 
        if (!inode_trylock(inode)) {
                if (iocb->ki_flags & IOCB_NOWAIT)
@@ -241,49 +497,55 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        if (ret <= 0)
                goto out;
 
-       /*
-        * Unaligned direct AIO must be serialized among each other as zeroing
-        * of partial blocks of two competing unaligned AIOs can result in data
-        * corruption.
-        */
-       if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
-           !is_sync_kiocb(iocb) &&
-           ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
-               unaligned_aio = 1;
-               ext4_unwritten_wait(inode);
-       }
+       offset = iocb->ki_pos;
+       count = iov_iter_count(from);
 
-       iocb->private = &overwrite;
-       /* Check whether we do a DIO overwrite or not */
-       if (o_direct && !unaligned_aio) {
-               if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-                       if (ext4_should_dioread_nolock(inode))
-                               overwrite = 1;
-               } else if (iocb->ki_flags & IOCB_NOWAIT) {
-                       ret = -EAGAIN;
+       if (offset + count > EXT4_I(inode)->i_disksize) {
+               handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
                        goto out;
                }
-       }
 
-       ret = __generic_file_write_iter(iocb, from);
-       /*
-        * Unaligned direct AIO must be the only IO in flight. Otherwise
-        * overlapping aligned IO after unaligned might result in data
-        * corruption.
-        */
-       if (ret == -EIOCBQUEUED && unaligned_aio)
-               ext4_unwritten_wait(inode);
-       inode_unlock(inode);
+               ret = ext4_orphan_add(handle, inode);
+               if (ret) {
+                       ext4_journal_stop(handle);
+                       goto out;
+               }
 
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
+               extend = true;
+               ext4_journal_stop(handle);
+       }
 
-       return ret;
+       ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 
+       if (extend)
+               ret = ext4_handle_inode_extension(inode, offset, ret, count);
 out:
        inode_unlock(inode);
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
        return ret;
 }
+#endif
+
/*
 * Top-level ->write_iter(): reject I/O after a forced shutdown, then route
 * the request to the DAX, direct I/O or buffered write path. As on the
 * read side, DAX takes precedence over IOCB_DIRECT.
 */
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

#ifdef CONFIG_FS_DAX
	if (IS_DAX(inode))
		return ext4_dax_write_iter(iocb, from);
#endif
	if (iocb->ki_flags & IOCB_DIRECT)
		return ext4_dio_write_iter(iocb, from);

	return ext4_buffered_write_iter(iocb, from);
}
 
 #ifdef CONFIG_FS_DAX
 static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
@@ -494,12 +756,14 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
                                                maxbytes, i_size_read(inode));
        case SEEK_HOLE:
                inode_lock_shared(inode);
-               offset = iomap_seek_hole(inode, offset, &ext4_iomap_ops);
+               offset = iomap_seek_hole(inode, offset,
+                                        &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        case SEEK_DATA:
                inode_lock_shared(inode);
-               offset = iomap_seek_data(inode, offset, &ext4_iomap_ops);
+               offset = iomap_seek_data(inode, offset,
+                                        &ext4_iomap_report_ops);
                inode_unlock_shared(inode);
                break;
        }