]> asedeno.scripts.mit.edu Git - linux.git/blobdiff - fs/btrfs/inode.c
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/josef/btrfs...
[linux.git] / fs / btrfs / inode.c
index 16d9e8e191e6ac34d92da08fe90b46e2de42ebf8..1aa98be54ce02aefbc105ba5f8579d824a17aad6 100644 (file)
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "ordered-data.h"
 #include "xattr.h"
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
 };
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
@@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
        if (list_empty(&async_cow->extents))
                return 0;
 
-
+again:
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
                                          struct async_extent, list);
@@ -648,6 +648,8 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                                  async_extent->ram_size - 1,
                                                  btrfs_get_extent,
                                                  WB_SYNC_ALL);
+                       else if (ret)
+                               unlock_page(async_cow->locked_page);
                        kfree(async_extent);
                        cond_resched();
                        continue;
@@ -672,6 +674,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
 
                if (ret) {
                        int i;
+
                        for (i = 0; i < async_extent->nr_pages; i++) {
                                WARN_ON(async_extent->pages[i]->mapping);
                                page_cache_release(async_extent->pages[i]);
@@ -679,12 +682,10 @@ static noinline int submit_compressed_extents(struct inode *inode,
                        kfree(async_extent->pages);
                        async_extent->nr_pages = 0;
                        async_extent->pages = NULL;
-                       unlock_extent(io_tree, async_extent->start,
-                                     async_extent->start +
-                                     async_extent->ram_size - 1);
+
                        if (ret == -ENOSPC)
                                goto retry;
-                       goto out_free; /* JDM: Requeue? */
+                       goto out_free;
                }
 
                /*
@@ -696,10 +697,13 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                        async_extent->ram_size - 1, 0);
 
                em = alloc_extent_map();
-               BUG_ON(!em); /* -ENOMEM */
+               if (!em)
+                       goto out_free_reserve;
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
                em->orig_start = em->start;
+               em->mod_start = em->start;
+               em->mod_len = em->len;
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
@@ -726,6 +730,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                                async_extent->ram_size - 1, 0);
                }
 
+               if (ret)
+                       goto out_free_reserve;
+
                ret = btrfs_add_ordered_extent_compress(inode,
                                                async_extent->start,
                                                ins.objectid,
@@ -733,7 +740,8 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                                ins.offset,
                                                BTRFS_ORDERED_COMPRESSED,
                                                async_extent->compress_type);
-               BUG_ON(ret); /* -ENOMEM */
+               if (ret)
+                       goto out_free_reserve;
 
                /*
                 * clear dirty, set writeback and unlock the pages.
@@ -754,18 +762,30 @@ static noinline int submit_compressed_extents(struct inode *inode,
                                    ins.objectid,
                                    ins.offset, async_extent->pages,
                                    async_extent->nr_pages);
-
-               BUG_ON(ret); /* -ENOMEM */
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
+               if (ret)
+                       goto out;
                cond_resched();
        }
        ret = 0;
 out:
        return ret;
+out_free_reserve:
+       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_free:
+       extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
+                                    async_extent->start,
+                                    async_extent->start +
+                                    async_extent->ram_size - 1,
+                                    NULL, EXTENT_CLEAR_UNLOCK_PAGE |
+                                    EXTENT_CLEAR_UNLOCK |
+                                    EXTENT_CLEAR_DELALLOC |
+                                    EXTENT_CLEAR_DIRTY |
+                                    EXTENT_SET_WRITEBACK |
+                                    EXTENT_END_WRITEBACK);
        kfree(async_extent);
-       goto out;
+       goto again;
 }
 
 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
                em->orig_start = em->start;
                ram_size = ins.offset;
                em->len = ins.offset;
+               em->mod_start = em->start;
+               em->mod_len = em->len;
 
                em->block_start = ins.objectid;
                em->block_len = ins.offset;
@@ -1338,6 +1360,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
                        em->block_start = disk_bytenr;
                        em->orig_block_len = disk_num_bytes;
                        em->bdev = root->fs_info->fs_devices->latest_bdev;
+                       em->mod_start = em->start;
+                       em->mod_len = em->len;
                        set_bit(EXTENT_FLAG_PINNED, &em->flags);
                        set_bit(EXTENT_FLAG_FILLING, &em->flags);
                        em->generation = -1;
@@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode,
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
 
-               spin_lock(&root->fs_info->delalloc_lock);
+               __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+                                    root->fs_info->delalloc_batch);
+               spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
-               root->fs_info->delalloc_bytes += len;
-               if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                       list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-                                     &root->fs_info->delalloc_inodes);
+               if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                        &BTRFS_I(inode)->runtime_flags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+                               list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+                                             &root->fs_info->delalloc_inodes);
+                               set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                       &BTRFS_I(inode)->runtime_flags);
+                       }
+                       spin_unlock(&root->fs_info->delalloc_lock);
                }
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
 
@@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                    && do_list)
                        btrfs_free_reserved_data_space(inode, len);
 
-               spin_lock(&root->fs_info->delalloc_lock);
-               root->fs_info->delalloc_bytes -= len;
+               __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+                                    root->fs_info->delalloc_batch);
+               spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes -= len;
-
                if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-                   !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-                       list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+                   test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                            &BTRFS_I(inode)->runtime_flags)) {
+                       spin_lock(&root->fs_info->delalloc_lock);
+                       if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+                               list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+                               clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                         &BTRFS_I(inode)->runtime_flags);
+                       }
+                       spin_unlock(&root->fs_info->delalloc_lock);
                }
-               spin_unlock(&root->fs_info->delalloc_lock);
+               spin_unlock(&BTRFS_I(inode)->lock);
        }
 }
 
@@ -2001,11 +2040,23 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
        if (trans)
                btrfs_end_transaction(trans, root);
 
-       if (ret)
+       if (ret) {
                clear_extent_uptodate(io_tree, ordered_extent->file_offset,
                                      ordered_extent->file_offset +
                                      ordered_extent->len - 1, NULL, GFP_NOFS);
 
+               /*
+                * If the ordered extent had an IOERR or something else went
+                * wrong we need to return the space for this ordered extent
+                * back to the allocator.
+                */
+               if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+                   !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+                       btrfs_free_reserved_extent(root, ordered_extent->start,
+                                                  ordered_extent->disk_len);
+       }
+
+
        /*
         * This needs to be done to make sure anybody waiting knows we are done
         * updating everything for this ordered extent.
@@ -2062,7 +2113,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                               struct extent_state *state, int mirror)
 {
-       size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+       size_t offset = start - page_offset(page);
        struct inode *inode = page->mapping->host;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        char *kaddr;
@@ -2167,11 +2218,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
        }
 }
 
-enum btrfs_orphan_cleanup_state {
-       ORPHAN_CLEANUP_STARTED  = 1,
-       ORPHAN_CLEANUP_DONE     = 2,
-};
-
 /*
  * This is called in transaction commit time. If there are no orphan
  * files in the subvolume, it removes orphan item and frees block_rsv
@@ -2469,6 +2515,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 */
                set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
                        &BTRFS_I(inode)->runtime_flags);
+               atomic_inc(&root->orphan_inodes);
 
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
@@ -2478,7 +2525,21 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                continue;
                        }
                        nr_truncate++;
+
+                       /* 1 for the orphan item deletion. */
+                       trans = btrfs_start_transaction(root, 1);
+                       if (IS_ERR(trans)) {
+                               ret = PTR_ERR(trans);
+                               goto out;
+                       }
+                       ret = btrfs_orphan_add(trans, inode);
+                       btrfs_end_transaction(trans, root);
+                       if (ret)
+                               goto out;
+
                        ret = btrfs_truncate(inode);
+                       if (ret)
+                               btrfs_orphan_del(NULL, inode);
                } else {
                        nr_unlink++;
                }
@@ -2697,34 +2758,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                            struct btrfs_inode_item *item,
                            struct inode *inode)
 {
-       btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
-       btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
-       btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
-       btrfs_set_inode_mode(leaf, item, inode->i_mode);
-       btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+       struct btrfs_map_token token;
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
-                              inode->i_atime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
-                               inode->i_atime.tv_nsec);
+       btrfs_init_map_token(&token);
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
-                              inode->i_mtime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
-                               inode->i_mtime.tv_nsec);
+       btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
+       btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
+       btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+                                  &token);
+       btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+       btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
 
-       btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
-                              inode->i_ctime.tv_sec);
-       btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
-                               inode->i_ctime.tv_nsec);
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+                                    inode->i_atime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+                                     inode->i_atime.tv_nsec, &token);
 
-       btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-       btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-       btrfs_set_inode_sequence(leaf, item, inode->i_version);
-       btrfs_set_inode_transid(leaf, item, trans->transid);
-       btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
-       btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-       btrfs_set_inode_block_group(leaf, item, 0);
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+                                    inode->i_mtime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+                                     inode->i_mtime.tv_nsec, &token);
+
+       btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+                                    inode->i_ctime.tv_sec, &token);
+       btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+                                     inode->i_ctime.tv_nsec, &token);
+
+       btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+                                    &token);
+       btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+                                        &token);
+       btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+       btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+       btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+       btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+       btrfs_set_token_inode_block_group(leaf, item, 0, &token);
 }
 
 /*
@@ -3665,6 +3733,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                block_end - cur_offset, 0);
                if (IS_ERR(em)) {
                        err = PTR_ERR(em);
+                       em = NULL;
                        break;
                }
                last_byte = min(extent_map_end(em), block_end);
@@ -3748,16 +3817,27 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
        return err;
 }
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        loff_t oldsize = i_size_read(inode);
+       loff_t newsize = attr->ia_size;
+       int mask = attr->ia_valid;
        int ret;
 
        if (newsize == oldsize)
                return 0;
 
+       /*
+        * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+        * special case where we need to update the times despite not having
+        * these flags set.  For all other operations the VFS set these flags
+        * explicitly if it wants a timestamp update.
+        */
+       if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+               inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
        if (newsize > oldsize) {
                truncate_pagecache(inode, oldsize, newsize);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +3863,40 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);
 
+               /*
+                * 1 for the orphan item we're going to add
+                * 1 for the orphan item deletion.
+                */
+               trans = btrfs_start_transaction(root, 2);
+               if (IS_ERR(trans))
+                       return PTR_ERR(trans);
+
+               /*
+                * We need to do this in case we fail at _any_ point during the
+                * actual truncate.  Once we do the truncate_setsize we could
+                * invalidate pages which forces any outstanding ordered io to
+                * be instantly completed which will give us extents that need
+                * to be truncated.  If we fail to get an orphan inode down we
+                * could have left over extents that were never meant to live,
+                * so we need to garuntee from this point on that everything
+                * will be consistent.
+                */
+               ret = btrfs_orphan_add(trans, inode);
+               btrfs_end_transaction(trans, root);
+               if (ret)
+                       return ret;
+
                /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
+
+               /* Disable nonlocked read DIO to avoid the end less truncate */
+               btrfs_inode_block_unlocked_dio(inode);
+               inode_dio_wait(inode);
+               btrfs_inode_resume_unlocked_dio(inode);
+
                ret = btrfs_truncate(inode);
+               if (ret && inode->i_nlink)
+                       btrfs_orphan_del(NULL, inode);
        }
 
        return ret;
@@ -3805,7 +3916,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                return err;
 
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-               err = btrfs_setsize(inode, attr->ia_size);
+               err = btrfs_setsize(inode, attr);
                if (err)
                        return err;
        }
@@ -3855,6 +3966,12 @@ void btrfs_evict_inode(struct inode *inode)
                goto no_delete;
        }
 
+       ret = btrfs_commit_inode_delayed_inode(inode);
+       if (ret) {
+               btrfs_orphan_del(NULL, inode);
+               goto no_delete;
+       }
+
        rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv) {
                btrfs_orphan_del(NULL, inode);
@@ -3892,7 +4009,7 @@ void btrfs_evict_inode(struct inode *inode)
                        goto no_delete;
                }
 
-               trans = btrfs_start_transaction_lflush(root, 1);
+               trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        btrfs_orphan_del(NULL, inode);
                        btrfs_free_block_rsv(root, rsv);
@@ -3906,9 +4023,6 @@ void btrfs_evict_inode(struct inode *inode)
                        break;
 
                trans->block_rsv = &root->fs_info->trans_block_rsv;
-               ret = btrfs_update_inode(trans, root, inode);
-               BUG_ON(ret);
-
                btrfs_end_transaction(trans, root);
                trans = NULL;
                btrfs_btree_balance_dirty(root);
@@ -4957,12 +5071,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                goto out_unlock;
        }
 
-       err = btrfs_update_inode(trans, root, inode);
-       if (err) {
-               drop_inode = 1;
-               goto out_unlock;
-       }
-
        /*
        * If the active LSM wants to access the inode during
        * d_instantiate it needs these. Smack checks to see
@@ -5572,10 +5680,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
                return em;
        if (em) {
                /*
-                * if our em maps to a hole, there might
-                * actually be delalloc bytes behind it
+                * if our em maps to
+                * -  a hole or
+                * -  a pre-alloc extent,
+                * there might actually be delalloc bytes behind it.
                 */
-               if (em->block_start != EXTENT_MAP_HOLE)
+               if (em->block_start != EXTENT_MAP_HOLE &&
+                   !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        return em;
                else
                        hole_em = em;
@@ -5657,6 +5768,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
                         */
                        em->block_start = hole_em->block_start;
                        em->block_len = hole_len;
+                       if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+                               set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                } else {
                        em->start = range_start;
                        em->len = found;
@@ -5895,6 +6008,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 
        em->start = start;
        em->orig_start = orig_start;
+       em->mod_start = start;
+       em->mod_len = len;
        em->len = len;
        em->block_len = block_len;
        em->block_start = block_start;
@@ -5936,16 +6051,15 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        u64 len = bh_result->b_size;
        struct btrfs_trans_handle *trans;
        int unlock_bits = EXTENT_LOCKED;
-       int ret;
+       int ret = 0;
 
        if (create) {
-               ret = btrfs_delalloc_reserve_space(inode, len);
-               if (ret)
-                       return ret;
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->outstanding_extents++;
+               spin_unlock(&BTRFS_I(inode)->lock);
                unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
-       } else {
+       } else
                len = min_t(u64, len, root->sectorsize);
-       }
 
        lockstart = start;
        lockend = start + len - 1;
@@ -5957,14 +6071,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
                return -ENOTBLK;
 
-       if (create) {
-               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, EXTENT_DELALLOC, NULL,
-                                    &cached_state, GFP_NOFS);
-               if (ret)
-                       goto unlock_err;
-       }
-
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em)) {
                ret = PTR_ERR(em);
@@ -5996,7 +6102,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
                free_extent_map(em);
-               ret = 0;
                goto unlock_err;
        }
 
@@ -6094,6 +6199,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 */
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
+
+               ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockstart + len - 1, EXTENT_DELALLOC, NULL,
+                                    &cached_state, GFP_NOFS);
+               BUG_ON(ret);
        }
 
        /*
@@ -6102,24 +6212,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         * aren't using if there is any left over space.
         */
        if (lockstart < lockend) {
-               if (create && len < lockend - lockstart) {
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockstart + len - 1,
-                                        unlock_bits | EXTENT_DEFRAG, 1, 0,
-                                        &cached_state, GFP_NOFS);
-                       /*
-                        * Beside unlock, we also need to cleanup reserved space
-                        * for the left range by attaching EXTENT_DO_ACCOUNTING.
-                        */
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree,
-                                        lockstart + len, lockend,
-                                        unlock_bits | EXTENT_DO_ACCOUNTING |
-                                        EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS);
-               } else {
-                       clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                        lockend, unlock_bits, 1, 0,
-                                        &cached_state, GFP_NOFS);
-               }
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                lockend, unlock_bits, 1, 0,
+                                &cached_state, GFP_NOFS);
        } else {
                free_extent_state(cached_state);
        }
@@ -6129,9 +6224,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        return 0;
 
 unlock_err:
-       if (create)
-               unlock_bits |= EXTENT_DO_ACCOUNTING;
-
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
        return ret;
@@ -6569,15 +6661,63 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+       size_t count = 0;
+       int flags = 0;
+       bool wakeup = true;
+       bool relock = false;
+       ssize_t ret;
 
        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
                            offset, nr_segs))
                return 0;
 
-       return __blockdev_direct_IO(rw, iocb, inode,
-                  BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-                  iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-                  btrfs_submit_direct, 0);
+       atomic_inc(&inode->i_dio_count);
+       smp_mb__after_atomic_inc();
+
+       if (rw & WRITE) {
+               count = iov_length(iov, nr_segs);
+               /*
+                * If the write DIO is beyond the EOF, we need update
+                * the isize, but it is protected by i_mutex. So we can
+                * not unlock the i_mutex at this case.
+                */
+               if (offset + count <= inode->i_size) {
+                       mutex_unlock(&inode->i_mutex);
+                       relock = true;
+               }
+               ret = btrfs_delalloc_reserve_space(inode, count);
+               if (ret)
+                       goto out;
+       } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                                    &BTRFS_I(inode)->runtime_flags))) {
+               inode_dio_done(inode);
+               flags = DIO_LOCKING | DIO_SKIP_HOLES;
+               wakeup = false;
+       }
+
+       ret = __blockdev_direct_IO(rw, iocb, inode,
+                       BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                       iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                       btrfs_submit_direct, flags);
+       if (rw & WRITE) {
+               if (ret < 0 && ret != -EIOCBQUEUED)
+                       btrfs_delalloc_release_space(inode, count);
+               else if (ret > 0 && (size_t)ret < count) {
+                       spin_lock(&BTRFS_I(inode)->lock);
+                       BTRFS_I(inode)->outstanding_extents++;
+                       spin_unlock(&BTRFS_I(inode)->lock);
+                       btrfs_delalloc_release_space(inode,
+                                                    count - (size_t)ret);
+               }
+               btrfs_delalloc_release_metadata(inode, 0);
+       }
+out:
+       if (wakeup)
+               inode_dio_done(inode);
+       if (relock)
+               mutex_lock(&inode->i_mutex);
+
+       return ret;
 }
 
 #define BTRFS_FIEMAP_FLAGS     (FIEMAP_FLAG_SYNC)
@@ -6681,8 +6821,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                return;
        }
        lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_extent(inode,
-                                          page_offset(page));
+       ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
        if (ordered) {
                /*
                 * IO on this page will never be started, so we need
@@ -6915,11 +7054,9 @@ static int btrfs_truncate(struct inode *inode)
 
        /*
         * 1 for the truncate slack space
-        * 1 for the orphan item we're going to add
-        * 1 for the orphan item deletion
         * 1 for updating the inode.
         */
-       trans = btrfs_start_transaction(root, 4);
+       trans = btrfs_start_transaction(root, 2);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out;
@@ -6930,12 +7067,6 @@ static int btrfs_truncate(struct inode *inode)
                                      min_size);
        BUG_ON(ret);
 
-       ret = btrfs_orphan_add(trans, inode);
-       if (ret) {
-               btrfs_end_transaction(trans, root);
-               goto out;
-       }
-
        /*
         * setattr is responsible for setting the ordered_data_close flag,
         * but that is only tested during the last file release.  That
@@ -7004,12 +7135,6 @@ static int btrfs_truncate(struct inode *inode)
                ret = btrfs_orphan_del(trans, inode);
                if (ret)
                        err = ret;
-       } else if (ret && inode->i_nlink > 0) {
-               /*
-                * Failed to do the truncate, remove us from the in memory
-                * orphan list.
-                */
-               ret = btrfs_orphan_del(NULL, inode);
        }
 
        if (trans) {
@@ -7176,8 +7301,9 @@ int btrfs_drop_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
 
+       /* the snap/subvol tree is on deleting */
        if (btrfs_root_refs(&root->root_item) == 0 &&
-           !btrfs_is_free_space_inode(inode))
+           root != root->fs_info->tree_root)
                return 1;
        else
                return generic_drop_inode(inode);
@@ -7259,14 +7385,19 @@ int btrfs_init_cachep(void)
 static int btrfs_getattr(struct vfsmount *mnt,
                         struct dentry *dentry, struct kstat *stat)
 {
+       u64 delalloc_bytes;
        struct inode *inode = dentry->d_inode;
        u32 blocksize = inode->i_sb->s_blocksize;
 
        generic_fillattr(inode, stat);
        stat->dev = BTRFS_I(inode)->root->anon_dev;
        stat->blksize = PAGE_CACHE_SIZE;
+
+       spin_lock(&BTRFS_I(inode)->lock);
+       delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+       spin_unlock(&BTRFS_I(inode)->lock);
        stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-               ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+                       ALIGN(delalloc_bytes, blocksize)) >> 9;
        return 0;
 }
 
@@ -7531,41 +7662,57 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-       struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
        struct inode *inode;
        struct btrfs_delalloc_work *work, *next;
        struct list_head works;
+       struct list_head splice;
        int ret = 0;
 
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
        INIT_LIST_HEAD(&works);
+       INIT_LIST_HEAD(&splice);
 
        spin_lock(&root->fs_info->delalloc_lock);
-       while (!list_empty(head)) {
-               binode = list_entry(head->next, struct btrfs_inode,
+       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+       while (!list_empty(&splice)) {
+               binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
+
+               list_del_init(&binode->delalloc_inodes);
+
                inode = igrab(&binode->vfs_inode);
-               if (!inode)
-                       list_del_init(&binode->delalloc_inodes);
+               if (!inode) {
+                       clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+                                 &binode->runtime_flags);
+                       continue;
+               }
+
+               list_add_tail(&binode->delalloc_inodes,
+                             &root->fs_info->delalloc_inodes);
                spin_unlock(&root->fs_info->delalloc_lock);
-               if (inode) {
-                       work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-                       if (!work) {
-                               ret = -ENOMEM;
-                               goto out;
-                       }
-                       list_add_tail(&work->list, &works);
-                       btrfs_queue_worker(&root->fs_info->flush_workers,
-                                          &work->work);
+
+               work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+               if (unlikely(!work)) {
+                       ret = -ENOMEM;
+                       goto out;
                }
+               list_add_tail(&work->list, &works);
+               btrfs_queue_worker(&root->fs_info->flush_workers,
+                                  &work->work);
+
                cond_resched();
                spin_lock(&root->fs_info->delalloc_lock);
        }
        spin_unlock(&root->fs_info->delalloc_lock);
 
+       list_for_each_entry_safe(work, next, &works, list) {
+               list_del_init(&work->list);
+               btrfs_wait_and_free_delalloc_work(work);
+       }
+
        /* the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
@@ -7578,11 +7725,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
+       return 0;
 out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+
+       if (!list_empty_careful(&splice)) {
+               spin_lock(&root->fs_info->delalloc_lock);
+               list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+               spin_unlock(&root->fs_info->delalloc_lock);
+       }
        return ret;
 }