btrfs: Always trim all unallocated space in btrfs_trim_free_extents
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c5880329ae37c661b4e87b3cafc0599e776f242d..c7adff343ba9a1ec3feeae0c3e044c3bc65e1749 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -643,7 +643,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 
        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                mutex_lock(&caching_ctl->mutex);
-               ret = load_free_space_cache(fs_info, cache);
+               ret = load_free_space_cache(cache);
 
                spin_lock(&cache->lock);
                if (ret == 1) {
@@ -756,14 +756,17 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        return NULL;
 }
 
-static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
-                            bool metadata, u64 root_objectid)
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
+                            struct btrfs_ref *ref, int sign)
 {
        struct btrfs_space_info *space_info;
+       s64 num_bytes;
        u64 flags;
 
-       if (metadata) {
-               if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+       ASSERT(sign == 1 || sign == -1);
+       num_bytes = sign * ref->len;
+       if (ref->type == BTRFS_REF_METADATA) {
+               if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
                        flags = BTRFS_BLOCK_GROUP_SYSTEM;
                else
                        flags = BTRFS_BLOCK_GROUP_METADATA;
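
The add_pinned_bytes() rewrite above is the first appearance of the btrfs_ref refactor that runs through the rest of this diff: instead of threading (num_bytes, metadata, root_objectid) scalars around, callers build one generic reference and helpers derive what they need from it. An abridged sketch of the pieces this function relies on (the real definitions live in delayed-ref.h; this is a reading aid, not the full struct, and the exact field layout may differ):

	struct btrfs_ref {
		enum btrfs_ref_type type;  /* BTRFS_REF_METADATA or BTRFS_REF_DATA */
		int action;                /* BTRFS_ADD_DELAYED_REF, BTRFS_DROP_DELAYED_REF, ... */
		u64 bytenr;                /* logical address of the extent */
		u64 len;                   /* length, consumed above as sign * ref->len */
		u64 parent;                /* non-zero for shared (full backref) references */
		union {
			struct btrfs_data_ref data_ref; /* .ref_root, .ino, .offset */
			struct btrfs_tree_ref tree_ref; /* .root, .level */
		};
	};
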
@@ -1704,7 +1707,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
        type = extent_ref_type(parent, owner);
        size = btrfs_extent_inline_ref_size(type);
 
-       btrfs_extend_item(fs_info, path, size);
+       btrfs_extend_item(path, size);
 
        ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
@@ -1779,7 +1782,6 @@ void update_inline_extent_backref(struct btrfs_path *path,
                                  int *last_ref)
 {
        struct extent_buffer *leaf = path->nodes[0];
-       struct btrfs_fs_info *fs_info = leaf->fs_info;
        struct btrfs_extent_item *ei;
        struct btrfs_extent_data_ref *dref = NULL;
        struct btrfs_shared_data_ref *sref = NULL;
@@ -1834,7 +1836,7 @@ void update_inline_extent_backref(struct btrfs_path *path,
                        memmove_extent_buffer(leaf, ptr, ptr + size,
                                              end - ptr - size);
                item_size -= size;
-               btrfs_truncate_item(fs_info, path, item_size, 1);
+               btrfs_truncate_item(path, item_size, 1);
        }
        btrfs_mark_buffer_dirty(leaf);
 }
@@ -1905,7 +1907,6 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
                               u64 *discarded_bytes)
 {
@@ -2043,39 +2044,28 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
 
 /* Can return -ENOMEM */
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root,
-                        u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset)
+                        struct btrfs_ref *generic_ref)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
 
-       BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
-              root_objectid == BTRFS_TREE_LOG_OBJECTID);
+       ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
+              generic_ref->action);
+       BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
+              generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
 
-       btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
-                          owner, offset, BTRFS_ADD_DELAYED_REF);
-
-       if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr,
-                                                num_bytes, parent,
-                                                root_objectid, (int)owner,
-                                                BTRFS_ADD_DELAYED_REF, NULL,
-                                                &old_ref_mod, &new_ref_mod);
-       } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr,
-                                                num_bytes, parent,
-                                                root_objectid, owner, offset,
-                                                0, BTRFS_ADD_DELAYED_REF,
+       if (generic_ref->type == BTRFS_REF_METADATA)
+               ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
+                               NULL, &old_ref_mod, &new_ref_mod);
+       else
+               ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
                                                 &old_ref_mod, &new_ref_mod);
-       }
 
-       if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
-               bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+       btrfs_ref_tree_mod(fs_info, generic_ref);
 
-               add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
-       }
+       if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
+               add_pinned_bytes(fs_info, generic_ref, -1);
 
        return ret;
 }
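
For call sites converted by this hunk, the before/after shape is worth spelling out. A minimal sketch of the new calling convention, assuming the btrfs_init_* helpers from delayed-ref.h (bytenr, num_bytes, parent, owner and offset stand in for whatever the caller already tracked):

	struct btrfs_ref ref = { 0 };

	btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, parent);
	btrfs_init_data_ref(&ref, root->root_key.objectid, owner, offset);
	ret = btrfs_inc_extent_ref(trans, &ref);

The old prototype forced every caller to pass six loose scalars; packing them into one struct lets helpers like add_pinned_bytes() and btrfs_ref_tree_mod() consume the same description, which is also why btrfs_ref_tree_mod() now takes fs_info rather than a root.
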
@@ -2877,97 +2867,6 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
        return btrfs_check_space_for_delayed_refs(trans->fs_info);
 }
 
-struct async_delayed_refs {
-       struct btrfs_root *root;
-       u64 transid;
-       int count;
-       int error;
-       int sync;
-       struct completion wait;
-       struct btrfs_work work;
-};
-
-static inline struct async_delayed_refs *
-to_async_delayed_refs(struct btrfs_work *work)
-{
-       return container_of(work, struct async_delayed_refs, work);
-}
-
-static void delayed_ref_async_start(struct btrfs_work *work)
-{
-       struct async_delayed_refs *async = to_async_delayed_refs(work);
-       struct btrfs_trans_handle *trans;
-       struct btrfs_fs_info *fs_info = async->root->fs_info;
-       int ret;
-
-       /* if the commit is already started, we don't need to wait here */
-       if (btrfs_transaction_blocked(fs_info))
-               goto done;
-
-       trans = btrfs_join_transaction(async->root);
-       if (IS_ERR(trans)) {
-               async->error = PTR_ERR(trans);
-               goto done;
-       }
-
-       /*
-        * trans->sync means that when we call end_transaction, we won't
-        * wait on delayed refs
-        */
-       trans->sync = true;
-
-       /* Don't bother flushing if we got into a different transaction */
-       if (trans->transid > async->transid)
-               goto end;
-
-       ret = btrfs_run_delayed_refs(trans, async->count);
-       if (ret)
-               async->error = ret;
-end:
-       ret = btrfs_end_transaction(trans);
-       if (ret && !async->error)
-               async->error = ret;
-done:
-       if (async->sync)
-               complete(&async->wait);
-       else
-               kfree(async);
-}
-
-int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
-                                unsigned long count, u64 transid, int wait)
-{
-       struct async_delayed_refs *async;
-       int ret;
-
-       async = kmalloc(sizeof(*async), GFP_NOFS);
-       if (!async)
-               return -ENOMEM;
-
-       async->root = fs_info->tree_root;
-       async->count = count;
-       async->error = 0;
-       async->transid = transid;
-       if (wait)
-               async->sync = 1;
-       else
-               async->sync = 0;
-       init_completion(&async->wait);
-
-       btrfs_init_work(&async->work, btrfs_extent_refs_helper,
-                       delayed_ref_async_start, NULL, NULL);
-
-       btrfs_queue_work(fs_info->extent_workers, &async->work);
-
-       if (wait) {
-               wait_for_completion(&async->wait);
-               ret = async->error;
-               kfree(async);
-               return ret;
-       }
-       return 0;
-}
-
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -3036,7 +2935,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 }
 
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-                               struct btrfs_fs_info *fs_info,
                                u64 bytenr, u64 num_bytes, u64 flags,
                                int level, int is_data)
 {
@@ -3053,8 +2951,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
        extent_op->is_data = is_data ? true : false;
        extent_op->level = level;
 
-       ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
-                                         num_bytes, extent_op);
+       ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
        if (ret)
                btrfs_free_delayed_extent_op(extent_op);
        return ret;
@@ -3246,13 +3143,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        u32 nritems;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
+       struct btrfs_ref generic_ref = { 0 };
+       bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
        int i;
+       int action;
        int level;
        int ret = 0;
-       int (*process_func)(struct btrfs_trans_handle *,
-                           struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64);
-
 
        if (btrfs_is_testing(fs_info))
                return 0;
@@ -3264,15 +3160,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
                return 0;
 
-       if (inc)
-               process_func = btrfs_inc_extent_ref;
-       else
-               process_func = btrfs_free_extent;
-
        if (full_backref)
                parent = buf->start;
        else
                parent = 0;
+       if (inc)
+               action = BTRFS_ADD_DELAYED_REF;
+       else
+               action = BTRFS_DROP_DELAYED_REF;
 
        for (i = 0; i < nritems; i++) {
                if (level == 0) {
@@ -3290,16 +3185,30 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 
                        num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
                        key.offset -= btrfs_file_extent_offset(buf, fi);
-                       ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, key.objectid,
-                                          key.offset);
+                       btrfs_init_generic_ref(&generic_ref, action, bytenr,
+                                              num_bytes, parent);
+                       generic_ref.real_root = root->root_key.objectid;
+                       btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
+                                           key.offset);
+                       generic_ref.skip_qgroup = for_reloc;
+                       if (inc)
+                               ret = btrfs_inc_extent_ref(trans, &generic_ref);
+                       else
+                               ret = btrfs_free_extent(trans, &generic_ref);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = fs_info->nodesize;
-                       ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0);
+                       btrfs_init_generic_ref(&generic_ref, action, bytenr,
+                                              num_bytes, parent);
+                       generic_ref.real_root = root->root_key.objectid;
+                       btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
+                       generic_ref.skip_qgroup = for_reloc;
+                       if (inc)
+                               ret = btrfs_inc_extent_ref(trans, &generic_ref);
+                       else
+                               ret = btrfs_free_extent(trans, &generic_ref);
                        if (ret)
                                goto fail;
                }
@@ -3322,10 +3231,10 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
-                                struct btrfs_fs_info *fs_info,
                                 struct btrfs_path *path,
                                 struct btrfs_block_group_cache *cache)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        int ret;
        struct btrfs_root *extent_root = fs_info->extent_root;
        unsigned long bi;
@@ -3348,10 +3257,10 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
 
 }
 
-static struct btrfs_block_group_cache *
-next_block_group(struct btrfs_fs_info *fs_info,
-                struct btrfs_block_group_cache *cache)
+static struct btrfs_block_group_cache *next_block_group(
+               struct btrfs_block_group_cache *cache)
 {
+       struct btrfs_fs_info *fs_info = cache->fs_info;
        struct rb_node *node;
 
        spin_lock(&fs_info->block_group_cache_lock);
@@ -3404,7 +3313,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        if (trans->aborted)
                return 0;
 again:
-       inode = lookup_free_space_inode(fs_info, block_group, path);
+       inode = lookup_free_space_inode(block_group, path);
        if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
                ret = PTR_ERR(inode);
                btrfs_release_path(path);
@@ -3418,8 +3327,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
                if (block_group->ro)
                        goto out_free;
 
-               ret = create_free_space_inode(fs_info, trans, block_group,
-                                             path);
+               ret = create_free_space_inode(trans, block_group, path);
                if (ret)
                        goto out_free;
                goto again;
@@ -3538,9 +3446,9 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
        return ret;
 }
 
-int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
-                           struct btrfs_fs_info *fs_info)
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache, *tmp;
        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_path *path;
@@ -3652,8 +3560,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
 
                if (cache->disk_cache_state == BTRFS_DC_SETUP) {
                        cache->io_ctl.inode = NULL;
-                       ret = btrfs_write_out_cache(fs_info, trans,
-                                                   cache, path);
+                       ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                num_started++;
                                should_put = 0;
@@ -3673,8 +3580,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
                        }
                }
                if (!ret) {
-                       ret = write_one_cache_group(trans, fs_info,
-                                                   path, cache);
+                       ret = write_one_cache_group(trans, path, cache);
                        /*
                         * Our block group might still be attached to the list
                         * of new block groups in the transaction handle of some
@@ -3744,9 +3650,9 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
        return ret;
 }
 
-int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-                                  struct btrfs_fs_info *fs_info)
+int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 {
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_block_group_cache *cache;
        struct btrfs_transaction *cur_trans = trans->transaction;
        int ret = 0;
@@ -3809,8 +3715,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 
                if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
                        cache->io_ctl.inode = NULL;
-                       ret = btrfs_write_out_cache(fs_info, trans,
-                                                   cache, path);
+                       ret = btrfs_write_out_cache(trans, cache, path);
                        if (ret == 0 && cache->io_ctl.inode) {
                                num_started++;
                                should_put = 0;
@@ -3824,8 +3729,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                        }
                }
                if (!ret) {
-                       ret = write_one_cache_group(trans, fs_info,
-                                                   path, cache);
+                       ret = write_one_cache_group(trans, path, cache);
                        /*
                         * One of the free space endio workers might have
                         * created a new block group while updating a free space
@@ -3842,8 +3746,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                        if (ret == -ENOENT) {
                                wait_event(cur_trans->writer_wait,
                                   atomic_read(&cur_trans->num_writers) == 1);
-                               ret = write_one_cache_group(trans, fs_info,
-                                                           path, cache);
+                               ret = write_one_cache_group(trans, path, cache);
                        }
                        if (ret)
                                btrfs_abort_transaction(trans, ret);
@@ -3981,8 +3884,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
                                    info->space_info_kobj, "%s",
                                    alloc_name(space_info->flags));
        if (ret) {
-               percpu_counter_destroy(&space_info->total_bytes_pinned);
-               kfree(space_info);
+               kobject_put(&space_info->kobj);
                return ret;
        }
 
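
The sysfs error path above is a classic kobject pattern: once kobject_init_and_add() has been called, the object's lifetime belongs to the kobject core even on failure, so freeing the counter and struct by hand risks a double free when the release callback eventually runs. kobject_put() drops the last reference and lets that callback do the teardown. A hypothetical sketch of the release side this depends on (name and exact contents assumed, not taken from this diff):

	static void space_info_release(struct kobject *kobj)
	{
		struct btrfs_space_info *sinfo;

		sinfo = container_of(kobj, struct btrfs_space_info, kobj);
		percpu_counter_destroy(&sinfo->total_bytes_pinned);
		kfree(sinfo);
	}
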
@@ -4732,6 +4634,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
+       u64 dio_bytes;
        u64 async_pages;
        u64 items;
        long time_left;
@@ -4747,7 +4650,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
 
        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
-       if (delalloc_bytes == 0) {
+       dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
+       if (delalloc_bytes == 0 && dio_bytes == 0) {
                if (trans)
                        return;
                if (wait_ordered)
@@ -4755,8 +4659,16 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                return;
        }
 
+       /*
+        * If we are doing more ordered than delalloc we need to just wait on
+        * ordered extents, otherwise we'll waste time trying to flush delalloc
+        * that likely won't give us the space back we need.
+        */
+       if (dio_bytes > delalloc_bytes)
+               wait_ordered = true;
+
        loops = 0;
-       while (delalloc_bytes && loops < 3) {
+       while ((delalloc_bytes || dio_bytes) && loops < 3) {
                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
 
                /*
@@ -4806,6 +4718,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                }
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
+               dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        }
 }
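
shrink_delalloc() only reads the new counter; the increments and decrements live in the direct IO path (roughly: add when a DIO write is submitted, subtract when its ordered extent completes). For readers unfamiliar with the primitive, a sketch of the percpu_counter lifecycle being relied on here, using the stock <linux/percpu_counter.h> API:

	struct percpu_counter dio_bytes;
	s64 total;

	percpu_counter_init(&dio_bytes, 0, GFP_KERNEL);   /* at mount */
	percpu_counter_add(&dio_bytes, len);              /* DIO write submitted */
	percpu_counter_sub(&dio_bytes, len);              /* ordered extent finished */
	total = percpu_counter_sum_positive(&dio_bytes);  /* exact (slow) read, as above */
	percpu_counter_destroy(&dio_bytes);               /* at unmount */

percpu_counter_sum_positive() walks every CPU's slot, which is why the function re-reads it once per flush loop rather than in a tight inner loop.
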
 
@@ -5803,85 +5716,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
        return ret;
 }
 
-static void calc_refill_bytes(struct btrfs_block_rsv *block_rsv,
-                               u64 *metadata_bytes, u64 *qgroup_bytes)
-{
-       *metadata_bytes = 0;
-       *qgroup_bytes = 0;
-
-       spin_lock(&block_rsv->lock);
-       if (block_rsv->reserved < block_rsv->size)
-               *metadata_bytes = block_rsv->size - block_rsv->reserved;
-       if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
-               *qgroup_bytes = block_rsv->qgroup_rsv_size -
-                       block_rsv->qgroup_rsv_reserved;
-       spin_unlock(&block_rsv->lock);
-}
-
-/**
- * btrfs_inode_rsv_refill - refill the inode block rsv.
- * @inode - the inode we are refilling.
- * @flush - the flushing restriction.
- *
- * Essentially the same as btrfs_block_rsv_refill, except it uses the
- * block_rsv->size as the minimum size.  We'll either refill the missing amount
- * or return if we already have enough space.  This will also handle the reserve
- * tracepoint for the reserved amount.
- */
-static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
-                                 enum btrfs_reserve_flush_enum flush)
-{
-       struct btrfs_root *root = inode->root;
-       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
-       u64 num_bytes, last = 0;
-       u64 qgroup_num_bytes;
-       int ret = -ENOSPC;
-
-       calc_refill_bytes(block_rsv, &num_bytes, &qgroup_num_bytes);
-       if (num_bytes == 0)
-               return 0;
-
-       do {
-               ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes,
-                                                        true);
-               if (ret)
-                       return ret;
-               ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
-               if (ret) {
-                       btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
-                       last = num_bytes;
-                       /*
-                        * If we are fragmented we can end up with a lot of
-                        * outstanding extents which will make our size be much
-                        * larger than our reserved amount.
-                        *
-                        * If the reservation happens here, it might be very
-                        * big though not needed in the end, if the delalloc
-                        * flushing happens.
-                        *
-                        * If this is the case try and do the reserve again.
-                        */
-                       if (flush == BTRFS_RESERVE_FLUSH_ALL)
-                               calc_refill_bytes(block_rsv, &num_bytes,
-                                                  &qgroup_num_bytes);
-                       if (num_bytes == 0)
-                               return 0;
-               }
-       } while (ret && last != num_bytes);
-
-       if (!ret) {
-               block_rsv_add_bytes(block_rsv, num_bytes, false);
-               trace_btrfs_space_reservation(root->fs_info, "delalloc",
-                                             btrfs_ino(inode), num_bytes, 1);
-
-               /* Don't forget to increase qgroup_rsv_reserved */
-               spin_lock(&block_rsv->lock);
-               block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
-               spin_unlock(&block_rsv->lock);
-       }
-       return ret;
-}
-
 static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
                                     struct btrfs_block_rsv *block_rsv,
                                     u64 num_bytes, u64 *qgroup_to_release)
@@ -6182,9 +6016,25 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
        spin_unlock(&block_rsv->lock);
 }
 
+static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+                                   u64 num_bytes, u64 *meta_reserve,
+                                   u64 *qgroup_reserve)
+{
+       u64 nr_extents = count_max_extents(num_bytes);
+       u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes);
+
+       /* We add one for the inode update at finish ordered time */
+       *meta_reserve = btrfs_calc_trans_metadata_size(fs_info,
+                                               nr_extents + csum_leaves + 1);
+       *qgroup_reserve = nr_extents * fs_info->nodesize;
+}
+
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 {
-       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+       u64 meta_reserve, qgroup_reserve;
        unsigned nr_extents;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
        int ret = 0;
@@ -6214,7 +6064,31 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
 
        num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
 
-       /* Add our new extents and calculate the new rsv size. */
+       /*
+        * We always want to do it this way, every other way is wrong and ends
+        * in tears.  Pre-reserving the amount we are going to add will always
+        * be the right way, because otherwise if we have enough parallelism we
+        * could end up with thousands of inodes all holding little bits of
+        * reservations they were able to make previously and the only way to
+        * reclaim that space is to ENOSPC out the operations and clear
+        * everything out and try again, which is bad.  This way we just
+        * over-reserve slightly, and clean up the mess when we are done.
+        */
+       calc_inode_reservations(fs_info, num_bytes, &meta_reserve,
+                               &qgroup_reserve);
+       ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
+       if (ret)
+               goto out_fail;
+       ret = reserve_metadata_bytes(root, block_rsv, meta_reserve, flush);
+       if (ret)
+               goto out_qgroup;
+
+       /*
+        * Now we need to update our outstanding extents and csum bytes _first_
+        * and then add the reservation to the block_rsv.  This keeps us from
+        * racing with an ordered completion or some such that would think it
+        * needs to free the reservation we just made.
+        */
        spin_lock(&inode->lock);
        nr_extents = count_max_extents(num_bytes);
        btrfs_mod_outstanding_extents(inode, nr_extents);
@@ -6222,22 +6096,21 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
        btrfs_calculate_inode_block_rsv_size(fs_info, inode);
        spin_unlock(&inode->lock);
 
-       ret = btrfs_inode_rsv_refill(inode, flush);
-       if (unlikely(ret))
-               goto out_fail;
+       /* Now we can safely add our space to our block rsv */
+       block_rsv_add_bytes(block_rsv, meta_reserve, false);
+       trace_btrfs_space_reservation(root->fs_info, "delalloc",
+                                     btrfs_ino(inode), meta_reserve, 1);
+
+       spin_lock(&block_rsv->lock);
+       block_rsv->qgroup_rsv_reserved += qgroup_reserve;
+       spin_unlock(&block_rsv->lock);
 
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
        return 0;
-
+out_qgroup:
+       btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
 out_fail:
-       spin_lock(&inode->lock);
-       nr_extents = count_max_extents(num_bytes);
-       btrfs_mod_outstanding_extents(inode, -nr_extents);
-       inode->csum_bytes -= num_bytes;
-       btrfs_calculate_inode_block_rsv_size(fs_info, inode);
-       spin_unlock(&inode->lock);
-
        btrfs_inode_rsv_release(inode, true);
        if (delalloc_lock)
                mutex_unlock(&inode->delalloc_mutex);
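
To make the pre-reserve arithmetic concrete, a rough worked example (assumptions: 16KiB nodesize, BTRFS_MAX_LEVEL of 8, and btrfs_calc_trans_metadata_size() being nodesize * BTRFS_MAX_LEVEL * 2 per item, as in kernels of this vintage): reserving for 1MiB of dirty data gives nr_extents = 1, since count_max_extents() divides by the 128MiB maximum extent size and rounds up, plus about one csum leaf, so meta_reserve comes to roughly 16KiB * 8 * 2 * (1 + 1 + 1) = 768KiB and qgroup_reserve to 1 * 16KiB. Deliberately generous, but the surplus flows back through btrfs_inode_rsv_release() once the delalloc completes, which is the trade the long comment above argues for.
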
@@ -6361,9 +6234,9 @@ void btrfs_delalloc_release_space(struct inode *inode,
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *info, u64 bytenr,
-                             u64 num_bytes, int alloc)
+                             u64 bytenr, u64 num_bytes, int alloc)
 {
+       struct btrfs_fs_info *info = trans->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
        u64 total = num_bytes;
        u64 old_val;
@@ -6444,7 +6317,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                if (list_empty(&cache->dirty_list)) {
                        list_add_tail(&cache->dirty_list,
                                      &trans->transaction->dirty_bgs);
-                       trans->transaction->num_dirty_bgs++;
                        trans->delayed_ref_updates++;
                        btrfs_get_block_group(cache);
                }
@@ -6491,10 +6363,11 @@ static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
        return bytenr;
 }
 
-static int pin_down_extent(struct btrfs_fs_info *fs_info,
-                          struct btrfs_block_group_cache *cache,
+static int pin_down_extent(struct btrfs_block_group_cache *cache,
                           u64 bytenr, u64 num_bytes, int reserved)
 {
+       struct btrfs_fs_info *fs_info = cache->fs_info;
+
        spin_lock(&cache->space_info->lock);
        spin_lock(&cache->lock);
        cache->pinned += num_bytes;
@@ -6526,7 +6399,7 @@ int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
        cache = btrfs_lookup_block_group(fs_info, bytenr);
        BUG_ON(!cache); /* Logic error */
 
-       pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
+       pin_down_extent(cache, bytenr, num_bytes, reserved);
 
        btrfs_put_block_group(cache);
        return 0;
@@ -6553,7 +6426,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
         */
        cache_block_group(cache, 1);
 
-       pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
+       pin_down_extent(cache, bytenr, num_bytes, 0);
 
        /* remove us from the free space cache (if we're there at all) */
        ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
@@ -6607,9 +6480,9 @@ static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
-int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
-                                struct extent_buffer *eb)
+int btrfs_exclude_logged_extents(struct extent_buffer *eb)
 {
+       struct btrfs_fs_info *fs_info = eb->fs_info;
        struct btrfs_file_extent_item *item;
        struct btrfs_key key;
        int found_type;
@@ -7198,7 +7071,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        goto out;
                }
 
-               ret = update_block_group(trans, info, bytenr, num_bytes, 0);
+               ret = update_block_group(trans, bytenr, num_bytes, 0);
                if (ret) {
                        btrfs_abort_transaction(trans, ret);
                        goto out;
@@ -7272,21 +7145,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           u64 parent, int last_ref)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_ref generic_ref = { 0 };
        int pin = 1;
        int ret;
 
+       btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
+                              buf->start, buf->len, parent);
+       btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
+                           root->root_key.objectid);
+
        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
                int old_ref_mod, new_ref_mod;
 
-               btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
-                                  root->root_key.objectid,
-                                  btrfs_header_level(buf), 0,
-                                  BTRFS_DROP_DELAYED_REF);
-               ret = btrfs_add_delayed_tree_ref(trans, buf->start,
-                                                buf->len, parent,
-                                                root->root_key.objectid,
-                                                btrfs_header_level(buf),
-                                                BTRFS_DROP_DELAYED_REF, NULL,
+               btrfs_ref_tree_mod(fs_info, &generic_ref);
+               ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
                                                 &old_ref_mod, &new_ref_mod);
                BUG_ON(ret); /* -ENOMEM */
                pin = old_ref_mod >= 0 && new_ref_mod < 0;
@@ -7305,8 +7177,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                cache = btrfs_lookup_block_group(fs_info, buf->start);
 
                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                       pin_down_extent(fs_info, cache, buf->start,
-                                       buf->len, 1);
+                       pin_down_extent(cache, buf->start, buf->len, 1);
                        btrfs_put_block_group(cache);
                        goto out;
                }
@@ -7320,8 +7191,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
        }
 out:
        if (pin)
-               add_pinned_bytes(fs_info, buf->len, true,
-                                root->root_key.objectid);
+               add_pinned_bytes(fs_info, &generic_ref, 1);
 
        if (last_ref) {
                /*
@@ -7333,52 +7203,43 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 }
 
 /* Can return -ENOMEM */
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset)
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
 {
-       struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_fs_info *fs_info = trans->fs_info;
        int old_ref_mod, new_ref_mod;
        int ret;
 
        if (btrfs_is_testing(fs_info))
                return 0;
 
-       if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
-               btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
-                                  root_objectid, owner, offset,
-                                  BTRFS_DROP_DELAYED_REF);
-
        /*
         * tree log blocks never actually go into the extent allocation
         * tree, just update pinning info and exit early.
         */
-       if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
-               WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
+       if ((ref->type == BTRFS_REF_METADATA &&
+            ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+           (ref->type == BTRFS_REF_DATA &&
+            ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
                /* unlocks the pinned mutex */
-               btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
+               btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
                old_ref_mod = new_ref_mod = 0;
                ret = 0;
-       } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-               ret = btrfs_add_delayed_tree_ref(trans, bytenr,
-                                                num_bytes, parent,
-                                                root_objectid, (int)owner,
-                                                BTRFS_DROP_DELAYED_REF, NULL,
+       } else if (ref->type == BTRFS_REF_METADATA) {
+               ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
                                                 &old_ref_mod, &new_ref_mod);
        } else {
-               ret = btrfs_add_delayed_data_ref(trans, bytenr,
-                                                num_bytes, parent,
-                                                root_objectid, owner, offset,
-                                                0, BTRFS_DROP_DELAYED_REF,
+               ret = btrfs_add_delayed_data_ref(trans, ref, 0,
                                                 &old_ref_mod, &new_ref_mod);
        }
 
-       if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
-               bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
+       if (!((ref->type == BTRFS_REF_METADATA &&
+              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
+             (ref->type == BTRFS_REF_DATA &&
+              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
+               btrfs_ref_tree_mod(fs_info, ref);
 
-               add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
-       }
+       if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
+               add_pinned_bytes(fs_info, ref, 1);
 
        return ret;
 }
@@ -7569,7 +7430,6 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
                struct find_free_extent_ctl *ffe_ctl,
                struct btrfs_block_group_cache **cluster_bg_ret)
 {
-       struct btrfs_fs_info *fs_info = bg->fs_info;
        struct btrfs_block_group_cache *cluster_bg;
        u64 aligned_cluster;
        u64 offset;
@@ -7629,9 +7489,8 @@ static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
        aligned_cluster = max_t(u64,
                        ffe_ctl->empty_cluster + ffe_ctl->empty_size,
                        bg->full_stripe_len);
-       ret = btrfs_find_space_cluster(fs_info, bg, last_ptr,
-                       ffe_ctl->search_start, ffe_ctl->num_bytes,
-                       aligned_cluster);
+       ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
+                       ffe_ctl->num_bytes, aligned_cluster);
        if (ret == 0) {
                /* Now pull our allocation out of this cluster */
                offset = btrfs_alloc_from_cluster(bg, last_ptr,
@@ -8281,7 +8140,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
        }
 
        if (pin)
-               pin_down_extent(fs_info, cache, start, len, 1);
+               pin_down_extent(cache, start, len, 1);
        else {
                if (btrfs_test_opt(fs_info, DISCARD))
                        ret = btrfs_discard_extent(fs_info, start, len, NULL);
@@ -8370,7 +8229,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
+       ret = update_block_group(trans, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
                        ins->objectid, ins->offset);
@@ -8460,7 +8319,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
-       ret = update_block_group(trans, fs_info, extent_key.objectid,
+       ret = update_block_group(trans, extent_key.objectid,
                                 fs_info->nodesize, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -8478,19 +8337,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     u64 offset, u64 ram_bytes,
                                     struct btrfs_key *ins)
 {
+       struct btrfs_ref generic_ref = { 0 };
        int ret;
 
        BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
 
-       btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
-                          root->root_key.objectid, owner, offset,
-                          BTRFS_ADD_DELAYED_EXTENT);
-
-       ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
-                                        ins->offset, 0,
-                                        root->root_key.objectid, owner,
-                                        offset, ram_bytes,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
+       btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+                              ins->objectid, ins->offset, 0);
+       btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
+       btrfs_ref_tree_mod(root->fs_info, &generic_ref);
+       ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
+                                        ram_bytes, NULL, NULL);
        return ret;
 }
 
@@ -8563,7 +8420,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
        btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
        btrfs_tree_lock(buf);
-       clean_tree_block(fs_info, buf);
+       btrfs_clean_tree_block(buf);
        clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
        btrfs_set_lock_blocking_write(buf);
@@ -8682,6 +8539,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
        struct btrfs_delayed_extent_op *extent_op;
+       struct btrfs_ref generic_ref = { 0 };
        u64 flags = 0;
        int ret;
        u32 blocksize = fs_info->nodesize;
@@ -8736,13 +8594,12 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                extent_op->is_data = false;
                extent_op->level = level;
 
-               btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
-                                  root_objectid, level, 0,
-                                  BTRFS_ADD_DELAYED_EXTENT);
-               ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
-                                                ins.offset, parent,
-                                                root_objectid, level,
-                                                BTRFS_ADD_DELAYED_EXTENT,
+               btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
+                                      ins.objectid, ins.offset, parent);
+               generic_ref.real_root = root->root_key.objectid;
+               btrfs_init_tree_ref(&generic_ref, level, root_objectid);
+               btrfs_ref_tree_mod(fs_info, &generic_ref);
+               ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
                                                 extent_op, NULL, NULL);
                if (ret)
                        goto out_free_delayed;
@@ -8918,7 +8775,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
                BUG_ON(ret); /* -ENOMEM */
                ret = btrfs_dec_ref(trans, root, eb, 0);
                BUG_ON(ret); /* -ENOMEM */
-               ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
+               ret = btrfs_set_disk_extent_flags(trans, eb->start,
                                                  eb->len, flag,
                                                  btrfs_header_level(eb), 0);
                BUG_ON(ret); /* -ENOMEM */
@@ -8987,6 +8844,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
        u64 parent;
        struct btrfs_key key;
        struct btrfs_key first_key;
+       struct btrfs_ref ref = { 0 };
        struct extent_buffer *next;
        int level = wc->level;
        int reada = 0;
@@ -9159,9 +9017,10 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                wc->drop_level = level;
                find_next_key(path, level, &wc->drop_progress);
 
-               ret = btrfs_free_extent(trans, root, bytenr, fs_info->nodesize,
-                                       parent, root->root_key.objectid,
-                                       level - 1, 0);
+               btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
+                                      fs_info->nodesize, parent);
+               btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
+               ret = btrfs_free_extent(trans, &ref);
                if (ret)
                        goto out_unlock;
        }
@@ -9251,21 +9110,23 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        else
                                ret = btrfs_dec_ref(trans, root, eb, 0);
                        BUG_ON(ret); /* -ENOMEM */
-                       ret = btrfs_qgroup_trace_leaf_items(trans, eb);
-                       if (ret) {
-                               btrfs_err_rl(fs_info,
-                                            "error %d accounting leaf items. Quota is out of sync, rescan required.",
+                       if (is_fstree(root->root_key.objectid)) {
+                               ret = btrfs_qgroup_trace_leaf_items(trans, eb);
+                               if (ret) {
+                                       btrfs_err_rl(fs_info,
+       "error %d accounting leaf items, quota is out of sync, rescan required",
                                             ret);
+                               }
                        }
                }
-               /* make block locked assertion in clean_tree_block happy */
+               /* make block locked assertion in btrfs_clean_tree_block happy */
                if (!path->locks[level] &&
                    btrfs_header_generation(eb) == trans->transid) {
                        btrfs_tree_lock(eb);
                        btrfs_set_lock_blocking_write(eb);
                        path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
                }
-               clean_tree_block(fs_info, eb);
+               btrfs_clean_tree_block(eb);
        }
 
        if (eb == root->node) {
@@ -9921,12 +9782,10 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
  */
 int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
-       struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_block_group_cache *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
-       struct btrfs_trans_handle *trans;
        u64 min_free;
        u64 dev_min = 1;
        u64 dev_nr = 0;
@@ -10025,13 +9884,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                min_free = div64_u64(min_free, dev_min);
        }
 
-       /* We need to do this so that we can look at pending chunks */
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans)) {
-               ret = PTR_ERR(trans);
-               goto out;
-       }
-
        mutex_lock(&fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
                u64 dev_offset;
@@ -10042,7 +9894,7 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                 */
                if (device->total_bytes > device->bytes_used + min_free &&
                    !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
-                       ret = find_free_dev_extent(trans, device, min_free,
+                       ret = find_free_dev_extent(device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
                                dev_nr++;
@@ -10058,7 +9910,6 @@ int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
                           "no space to allocate a new chunk for block group %llu",
                           block_group->key.objectid);
        mutex_unlock(&fs_info->chunk_mutex);
-       btrfs_end_transaction(trans);
 out:
        btrfs_put_block_group(block_group);
        return ret;
@@ -10159,7 +10010,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
                        if (block_group->iref)
                                break;
                        spin_unlock(&block_group->lock);
-                       block_group = next_block_group(info, block_group);
+                       block_group = next_block_group(block_group);
                }
                if (!block_group) {
                        if (last == 0)
@@ -10660,7 +10511,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
        struct btrfs_block_group_cache *cache;
        int ret;
 
-       btrfs_set_log_full_commit(fs_info, trans);
+       btrfs_set_log_full_commit(trans);
 
        cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
        if (!cache)
@@ -10808,7 +10659,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * get the inode first so any iput calls done for the io_list
         * aren't the final iput (no unlinks allowed now)
         */
-       inode = lookup_free_space_inode(fs_info, block_group, path);
+       inode = lookup_free_space_inode(block_group, path);
 
        mutex_lock(&trans->transaction->cache_write_mutex);
        /*
@@ -10952,10 +10803,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        memcpy(&key, &block_group->key, sizeof(key));
 
        mutex_lock(&fs_info->chunk_mutex);
-       if (!list_empty(&em->list)) {
-               /* We're in the transaction->pending_chunks list. */
-               free_extent_map(em);
-       }
        spin_lock(&block_group->lock);
        block_group->removed = 1;
        /*
@@ -10982,25 +10829,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
-       /*
-        * Make sure a trimmer task always sees the em in the pinned_chunks list
-        * if it sees block_group->removed == 1 (needs to lock block_group->lock
-        * before checking block_group->removed).
-        */
-       if (!remove_em) {
-               /*
-                * Our em might be in trans->transaction->pending_chunks which
-                * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
-                * and so is the fs_info->pinned_chunks list.
-                *
-                * So at this point we must be holding the chunk_mutex to avoid
-                * any races with chunk allocation (more specifically at
-                * volumes.c:contains_pending_extent()), to ensure it always
-                * sees the em, either in the pending_chunks list or in the
-                * pinned_chunks list.
-                */
-               list_move_tail(&em->list, &fs_info->pinned_chunks);
-       }
        spin_unlock(&block_group->lock);
 
        if (remove_em) {
@@ -11008,11 +10836,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
                em_tree = &fs_info->mapping_tree.map_tree;
                write_lock(&em_tree->lock);
-               /*
-                * The em might be in the pending_chunks list, so make sure the
-                * chunk mutex is locked, since remove_extent_mapping() will
-                * delete us from that list.
-                */
                remove_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
                /* once for the tree */
@@ -11314,10 +11137,9 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
  * it while performing the free space search since we have already
  * held back allocations.
  */
-static int btrfs_trim_free_extents(struct btrfs_device *device,
-                                  u64 minlen, u64 *trimmed)
+static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
 {
-       u64 start = 0, len = 0;
+       u64 start = SZ_1M, len = 0, end = 0;
        int ret;
 
        *trimmed = 0;
@@ -11338,43 +11160,36 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
 
        while (1) {
                struct btrfs_fs_info *fs_info = device->fs_info;
-               struct btrfs_transaction *trans;
                u64 bytes;
 
                ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
                if (ret)
                        break;
 
-               ret = down_read_killable(&fs_info->commit_root_sem);
-               if (ret) {
-                       mutex_unlock(&fs_info->chunk_mutex);
-                       break;
-               }
-
-               spin_lock(&fs_info->trans_lock);
-               trans = fs_info->running_transaction;
-               if (trans)
-                       refcount_inc(&trans->use_count);
-               spin_unlock(&fs_info->trans_lock);
-
-               if (!trans)
-                       up_read(&fs_info->commit_root_sem);
-
-               ret = find_free_dev_extent_start(trans, device, minlen, start,
-                                                &start, &len);
-               if (trans) {
-                       up_read(&fs_info->commit_root_sem);
-                       btrfs_put_transaction(trans);
-               }
+               find_first_clear_extent_bit(&device->alloc_state, start,
+                                           &start, &end,
+                                           CHUNK_TRIMMED | CHUNK_ALLOCATED);
+               /*
+                * If find_first_clear_extent_bit finds a range that spans the
+                * end of the device it will set end to -1; in this case it's up
+                * to the caller to trim the value to the size of the device.
+                */
+               end = min(end, device->total_bytes - 1);
+               len = end - start + 1;
 
-               if (ret) {
+               /* We didn't find any extents */
+               if (!len) {
                        mutex_unlock(&fs_info->chunk_mutex);
-                       if (ret == -ENOSPC)
-                               ret = 0;
+                       ret = 0;
                        break;
                }
 
-               ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+               ret = btrfs_issue_discard(device->bdev, start, len,
+                                         &bytes);
+               if (!ret)
+                       set_extent_bits(&device->alloc_state, start,
+                                       start + bytes - 1,
+                                       CHUNK_TRIMMED);
                mutex_unlock(&fs_info->chunk_mutex);
 
                if (ret)
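
The rewritten loop replaces the transaction-scoped free-extent search with a query against the per-device alloc_state io tree. A condensed sketch of the contract the new code depends on (names as used above, from the extent-io bit helpers):

	u64 start = SZ_1M, end = 0, len, bytes;  /* skip the first 1MiB, which btrfs never allocates */

	/* report the first range at or after start with neither bit set */
	find_first_clear_extent_bit(&device->alloc_state, start, &start, &end,
				    CHUNK_TRIMMED | CHUNK_ALLOCATED);
	len = end - start + 1;
	/* ...btrfs_issue_discard() for [start, start + len - 1], reporting bytes... */
	set_extent_bits(&device->alloc_state, start, start + bytes - 1, CHUNK_TRIMMED);
	start += len;                            /* resume scanning past the handled range */

Because CHUNK_TRIMMED persists until the space is reallocated, repeated fstrim runs skip ranges that were already discarded, and, per the commit subject, the scan now covers all unallocated space rather than only the holes a device-extent search would report.
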
@@ -11419,7 +11234,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
        int ret = 0;
 
        cache = btrfs_lookup_first_block_group(fs_info, range->start);
-       for (; cache; cache = next_block_group(fs_info, cache)) {
+       for (; cache; cache = next_block_group(cache)) {
                if (cache->key.objectid >= (range->start + range->len)) {
                        btrfs_put_block_group(cache);
                        break;
@@ -11466,8 +11281,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
        mutex_lock(&fs_info->fs_devices->device_list_mutex);
        devices = &fs_info->fs_devices->devices;
        list_for_each_entry(device, devices, dev_list) {
-               ret = btrfs_trim_free_extents(device, range->minlen,
-                                             &group_trimmed);
+               ret = btrfs_trim_free_extents(device, &group_trimmed);
                if (ret) {
                        dev_failed++;
                        dev_ret = ret;