diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6934a5b8708fedd0dcd0c3f6bce201f0d795d1d5..404e050ce8eee36c0142706975b5648dd2af72c9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -14,6 +14,8 @@
 #include "sysfs.h"
 #include "tree-log.h"
 #include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
 
 /*
  * Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
        return extended_to_chunk(flags | allowed);
 }
 
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 {
        unsigned seq;
        u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
        return btrfs_reduce_alloc_profile(fs_info, flags);
 }
 
-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
-       return get_alloc_profile(fs_info, orig_flags);
-}
-
 void btrfs_get_block_group(struct btrfs_block_group *cache)
 {
        atomic_inc(&cache->count);
@@ -131,6 +128,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
 
+               /*
+                * A block_group shouldn't be on the discard_list anymore.
+                * Remove the block_group from the discard_list to prevent us
+                * from causing a panic due to NULL pointer dereference.
+                */
+               if (WARN_ON(!list_empty(&cache->discard_list)))
+                       btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+                                                 cache);
+
                /*
                 * If not empty, someone is still holding mutex of
                 * full_stripe_lock, which can only be released by caller.
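
The hunk above adds a defensive check to the final-put path: by the time the last reference to a block group is dropped it should already be off the discard list, and if it is not, the pending discard work is cancelled so nothing keeps walking a freed structure. A minimal userspace model of that "warn loudly, then recover" release pattern; the list helpers and names here are illustrative stand-ins, not the kernel's:

/* Userspace model of "warn and recover" on final put; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *n)        { n->prev = n->next = n; }
static bool list_empty(const struct node *n) { return n->next == n; }
static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

struct block_group {
	int refs;
	struct node discard_link;	/* linked while queued for async discard */
};

static void put_block_group(struct block_group *bg)
{
	if (--bg->refs)
		return;
	/*
	 * Should already be unlinked here; if not, unlink it instead of
	 * freeing memory that a discard-list walker could still touch.
	 */
	if (!list_empty(&bg->discard_link)) {
		fprintf(stderr, "warning: final put with pending discard\n");
		list_del_init(&bg->discard_link);
	}
	free(bg);
}

int main(void)
{
	struct block_group *bg = calloc(1, sizeof(*bg));

	bg->refs = 1;
	list_init(&bg->discard_link);
	put_block_group(bg);	/* clean case: list is empty, no warning */
	return 0;
}

Re-initialising the node after unlinking (list_del_init) keeps a later emptiness check on the same node meaningful, which is what lets the release path treat "still linked" as a recoverable condition.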
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
-                       ret = btrfs_add_free_space(block_group, start,
-                                                  size);
+                       ret = btrfs_add_free_space_async_trimmed(block_group,
+                                                                start, size);
                        BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
        if (start < end) {
                size = end - start;
                total_added += size;
-               ret = btrfs_add_free_space(block_group, start, size);
+               ret = btrfs_add_free_space_async_trimmed(block_group, start,
+                                                        size);
                BUG_ON(ret); /* -ENOMEM or logic error */
        }
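
Both call sites in add_new_free_space() now go through btrfs_add_free_space_async_trimmed() instead of btrfs_add_free_space(), so free space discovered while caching a block group can be tracked as not-yet-trimmed and left to the async discard machinery rather than being assumed clean. The surrounding logic is a plain gap walk: add the hole before each allocated extent, then the tail after the last one. A small userspace model of that walk, using half-open ranges and a hypothetical add_untrimmed_free() sink in place of the btrfs call:

/* Userspace model of the gap walk in add_new_free_space(); not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, end; };	/* allocated, sorted, non-overlapping */

/* Hypothetical sink standing in for btrfs_add_free_space_async_trimmed(). */
static void add_untrimmed_free(uint64_t start, uint64_t size)
{
	printf("free (untrimmed): [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + size));
}

static uint64_t add_new_free_space(uint64_t start, uint64_t end,
				   const struct extent *ext, int nr)
{
	uint64_t total = 0;

	for (int i = 0; i < nr && start < end; i++) {
		if (ext[i].end <= start)
			continue;		/* extent entirely before the window */
		if (ext[i].start >= end)
			break;			/* extent entirely after the window */
		if (ext[i].start > start) {	/* gap before this extent */
			add_untrimmed_free(start, ext[i].start - start);
			total += ext[i].start - start;
		}
		start = ext[i].end;		/* skip over the allocated extent */
	}
	if (start < end) {			/* trailing gap after the last extent */
		add_untrimmed_free(start, end - start);
		total += end - start;
	}
	return total;
}

int main(void)
{
	const struct extent used[] = { { 16, 32 }, { 48, 64 } };

	add_new_free_space(0, 128, used, 2);	/* prints [0,16), [32,48), [64,128) */
	return 0;
}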
 
@@ -1184,22 +1191,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
-       u64 sinfo_used;
-       u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
-       /*
-        * We need some metadata space and system metadata space for
-        * allocating chunks in some corner cases until we force to set
-        * it to be readonly.
-        */
-       if ((sinfo->flags &
-            (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
-           !force)
-               min_allocable_bytes = SZ_1M;
-       else
-               min_allocable_bytes = 0;
-
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
 
@@ -1211,20 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 
        num_bytes = cache->length - cache->reserved - cache->pinned -
                    cache->bytes_super - cache->used;
-       sinfo_used = btrfs_space_info_used(sinfo, true);
 
        /*
-        * sinfo_used + num_bytes should always <= sinfo->total_bytes.
-        *
-        * Here we make sure if we mark this bg RO, we still have enough
-        * free space as buffer (if min_allocable_bytes is not 0).
+        * Data never overcommits, even in mixed mode, so do just the straight
+        * check of left over space in how much we have allocated.
         */
-       if (sinfo_used + num_bytes + min_allocable_bytes <=
-           sinfo->total_bytes) {
+       if (force) {
+               ret = 0;
+       } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
+               u64 sinfo_used = btrfs_space_info_used(sinfo, true);
+
+               /*
+                * Here we make sure if we mark this bg RO, we still have enough
+                * free space as buffer.
+                */
+               if (sinfo_used + num_bytes <= sinfo->total_bytes)
+                       ret = 0;
+       } else {
+               /*
+                * We overcommit metadata, so we need to do the
+                * btrfs_can_overcommit check here, and we need to pass in
+                * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
+                * leeway to allow us to mark this block group as read only.
+                */
+               if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
+                                        BTRFS_RESERVE_NO_FLUSH))
+                       ret = 0;
+       }
+
+       if (!ret) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
-               ret = 0;
        }
 out:
        spin_unlock(&cache->lock);
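
The rewritten inc_block_group_ro() drops the old min_allocable_bytes heuristic and decides per space type instead: force always succeeds, data (which never overcommits) only needs the plain "already used plus this group's free bytes still fits in total_bytes" check, and metadata/system rely on btrfs_can_overcommit() with BTRFS_RESERVE_NO_FLUSH for the most leeway. Here num_bytes is the group's currently unused space, which becomes bytes_readonly on success. A standalone sketch of that decision tree; the overcommit predicate below is a deliberately crude stand-in, not the real btrfs_can_overcommit():

/* Userspace model of the inc_block_group_ro() decision; not btrfs code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum space_type { SPACE_DATA, SPACE_METADATA, SPACE_SYSTEM };

struct space_info {
	enum space_type type;
	uint64_t total_bytes;
	uint64_t used_bytes;	/* stands in for btrfs_space_info_used() */
};

/* Crude stand-in for btrfs_can_overcommit(..., BTRFS_RESERVE_NO_FLUSH). */
static bool can_overcommit(const struct space_info *sinfo, uint64_t bytes)
{
	/* Pretend metadata may overcommit up to 2x of what is allocated. */
	return sinfo->used_bytes + bytes <= 2 * sinfo->total_bytes;
}

static int try_make_ro(const struct space_info *sinfo, uint64_t num_bytes,
		       bool force)
{
	if (force)
		return 0;
	if (sinfo->type == SPACE_DATA) {
		/* Data never overcommits: a straight leftover-space check. */
		return sinfo->used_bytes + num_bytes <= sinfo->total_bytes ?
		       0 : -1 /* -ENOSPC */;
	}
	/* Metadata/system: lean on the (modeled) overcommit logic instead. */
	return can_overcommit(sinfo, num_bytes) ? 0 : -1 /* -ENOSPC */;
}

int main(void)
{
	struct space_info meta = { SPACE_METADATA, 1024, 1000 };

	/* 100 bytes does not fit plainly, but the overcommit model allows it. */
	printf("metadata ro: %d\n", try_make_ro(&meta, 100, false));
	return 0;
}

In this toy run the metadata group fails the plain fit check (1000 + 100 > 1024) but the modeled overcommit allowance still lets it go read-only, which is why the data and metadata branches had to diverge.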
@@ -1232,9 +1243,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
        if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
                btrfs_info(cache->fs_info,
                        "unable to make block group %llu ro", cache->start);
-               btrfs_info(cache->fs_info,
-                       "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
-                       sinfo_used, num_bytes, min_allocable_bytes);
                btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
        }
        return ret;
@@ -1249,6 +1257,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
+       const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        int ret = 0;
 
        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1281,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                }
                spin_unlock(&fs_info->unused_bgs_lock);
 
+               btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
                mutex_lock(&fs_info->delete_unused_bgs_mutex);
 
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
+
+               /*
+                * Async discard moves the final block group discard to be prior
+                * to the unused_bgs code path.  Therefore, if it's not fully
+                * trimmed, punt it back to the async discard lists.
+                */
+               if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+                   !btrfs_is_free_space_trimmed(block_group)) {
+                       trace_btrfs_skip_unused_block_group(block_group);
+                       up_write(&space_info->groups_sem);
+                       /* Requeue if we failed because of async discard */
+                       btrfs_discard_queue_work(&fs_info->discard_ctl,
+                                                block_group);
+                       goto next;
+               }
+
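
With async discard, the final trim now happens before a group reaches the unused_bgs deletion path, so an unused group whose free space is not yet fully trimmed is traced, punted back to the discard lists with btrfs_discard_queue_work(), and skipped for this pass. A compact userspace model of that "requeue if not ready, otherwise delete" loop, with hypothetical helpers standing in for the btrfs queues:

/* Userspace model of "requeue if not fully trimmed"; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>

struct bg { const char *name; bool fully_trimmed; };

/* Hypothetical stand-ins for the btrfs discard/unused-bg handling. */
static void requeue_for_discard(struct bg *b) { printf("requeue %s for discard\n", b->name); }
static void delete_unused(struct bg *b)       { printf("delete %s\n", b->name); }

static void process_unused(struct bg *cands, int nr, bool async_discard)
{
	for (int i = 0; i < nr; i++) {
		struct bg *b = &cands[i];

		/*
		 * Async discard runs the final trim before deletion, so an
		 * untrimmed group is punted back instead of being deleted.
		 */
		if (async_discard && !b->fully_trimmed) {
			requeue_for_discard(b);
			continue;	/* "goto next" in the kernel loop */
		}
		delete_unused(b);
	}
}

int main(void)
{
	struct bg cands[] = { { "bg A", true }, { "bg B", false } };

	process_unused(cands, 2, true);
	return 0;
}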
                spin_lock(&block_group->lock);
                if (block_group->reserved || block_group->pinned ||
                    block_group->used || block_group->ro ||
@@ -1347,6 +1374,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
+               /*
+                * At this point, the block_group is read only and should fail
+                * new allocations.  However, btrfs_finish_extent_commit() can
+                * cause this block_group to be placed back on the discard
+                * lists because now the block_group isn't fully discarded.
+                * Bail here and try again later after discarding everything.
+                */
+               spin_lock(&fs_info->discard_ctl.lock);
+               if (!list_empty(&block_group->discard_list)) {
+                       spin_unlock(&fs_info->discard_ctl.lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       btrfs_discard_queue_work(&fs_info->discard_ctl,
+                                                block_group);
+                       goto end_trans;
+               }
+               spin_unlock(&fs_info->discard_ctl.lock);
+
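
The re-check above exists because btrfs_finish_extent_commit() can put the group back on a discard list between the earlier btrfs_discard_cancel_work() and this point; if that happened, the group's read-only marking is undone, it is requeued for discard, and deletion is retried on a later pass. A minimal pthread sketch of the same "re-check under the lock, undo and back off" idiom; every helper name here is a stand-in:

/* Userspace model of "re-check under the lock, undo, retry later". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t discard_lock = PTHREAD_MUTEX_INITIALIZER;
static bool on_discard_list;	/* written by a concurrent "commit" path */

/* Hypothetical stand-ins for the btrfs undo/requeue/delete steps. */
static void undo_read_only(void)     { puts("drop ro"); }
static void requeue_discard(void)    { puts("requeue for discard"); }
static void delete_block_group(void) { puts("delete block group"); }

static void try_delete(void)
{
	pthread_mutex_lock(&discard_lock);
	if (on_discard_list) {
		/* The group came back between the first check and now. */
		pthread_mutex_unlock(&discard_lock);
		undo_read_only();
		requeue_discard();
		return;		/* try again on a later pass */
	}
	pthread_mutex_unlock(&discard_lock);
	delete_block_group();
}

int main(void)
{
	on_discard_list = true;	/* simulate the race having happened */
	try_delete();
	return 0;
}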
                /* Reset pinned so btrfs_put_block_group doesn't complain */
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);
@@ -1362,8 +1406,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
 
+               /*
+                * The normal path here is an unused block group is passed here,
+                * then trimming is handled in the transaction commit path.
+                * Async discard interposes before this to do the trimming
+                * before coming down the unused block group path as trimming
+                * will no longer be done later in the transaction commit path.
+                */
+               if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+                       goto flip_async;
+
                /* DISCARD can flip during remount */
-               trimming = btrfs_test_opt(fs_info, DISCARD);
+               trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
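
Both lines in this region guard against mount options changing underneath the loop: async_trim_enabled was sampled when btrfs_delete_unused_bgs() started, so "!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)" catches async discard being switched on by a remount mid-pass (the group is then handed off via flip_async), and trimming is re-read from DISCARD_SYNC for the same reason, as the in-line comment notes. A small C11 sketch of the "snapshot at entry, compare with the live value" idiom, with an atomic flag standing in for the mount option:

/* Userspace model of "option sampled at entry vs. live option"; not btrfs code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool discard_async;	/* stands in for the mount option */

static void delete_unused_pass(void)
{
	/* Snapshot at entry, like async_trim_enabled in the kernel loop. */
	bool async_at_entry = atomic_load(&discard_async);

	/* ... long-running work; a remount may flip the option here ... */
	atomic_store(&discard_async, true);	/* simulate the remount */

	if (!async_at_entry && atomic_load(&discard_async)) {
		/* Async discard appeared mid-pass: hand the work over to it. */
		puts("flip_async: punt remaining groups to async discard");
		return;
	}
	puts("finish the pass under the original settings");
}

int main(void)
{
	atomic_store(&discard_async, false);
	delete_unused_pass();
	return 0;
}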
 
                /* Implicit trim during transaction commit. */
                if (trimming)
@@ -1406,6 +1460,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_lock(&fs_info->unused_bgs_lock);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
+       return;
+
+flip_async:
+       btrfs_end_transaction(trans);
+       mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+       btrfs_put_block_group(block_group);
+       btrfs_discard_punt_unused_bgs_list(fs_info);
 }
 
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1577,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        write_sequnlock(&fs_info->profiles_lock);
 }
 
+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start:   logical address of block group
+ * @physical:     physical address to map to logical addresses
+ * @logical:      return array of logical addresses which map to @physical
+ * @naddrs:       length of @logical
+ * @stripe_len:    size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+                    u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+       struct extent_map *em;
+       struct map_lookup *map;
+       u64 *buf;
+       u64 bytenr;
+       u64 data_stripe_length;
+       u64 io_stripe_size;
+       int i, nr = 0;
+       int ret = 0;
+
+       em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+       if (IS_ERR(em))
+               return -EIO;
+
+       map = em->map_lookup;
+       data_stripe_length = em->len;
+       io_stripe_size = map->stripe_len;
+
+       if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+               data_stripe_length = div_u64(data_stripe_length,
+                                            map->num_stripes / map->sub_stripes);
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+               data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               data_stripe_length = div_u64(data_stripe_length,
+                                            nr_data_stripes(map));
+               io_stripe_size = map->stripe_len * nr_data_stripes(map);
+       }
+
+       buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+       if (!buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < map->num_stripes; i++) {
+               bool already_inserted = false;
+               u64 stripe_nr;
+               int j;
+
+               if (!in_range(physical, map->stripes[i].physical,
+                             data_stripe_length))
+                       continue;
+
+               stripe_nr = physical - map->stripes[i].physical;
+               stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+               if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+                       stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+               } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+               }
+               /*
+                * The remaining case would be for RAID56, multiply by
+                * nr_data_stripes().  Alternatively, just use rmap_len below
+                * instead of map->stripe_len
+                */
+
+               bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+               /* Ensure we don't add duplicate addresses */
+               for (j = 0; j < nr; j++) {
+                       if (buf[j] == bytenr) {
+                               already_inserted = true;
+                               break;
+                       }
+               }
+
+               if (!already_inserted)
+                       buf[nr++] = bytenr;
+       }
+
+       *logical = buf;
+       *naddrs = nr;
+       *stripe_len = io_stripe_size;
+out:
+       free_extent_map(em);
+       return ret;
+}
+
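
btrfs_rmap_block() inverts the usual chunk mapping: for every device stripe that contains @physical, the offset inside that stripe is turned into a stripe number, adjusted for RAID0/RAID10 interleaving (RAID56 is covered by the larger io_stripe_size instead), converted back to chunk_start + stripe_nr * io_stripe_size, and de-duplicated, since mirrored copies resolve to the same logical address. exclude_super_stripes() below uses this to find the logical ranges covered by superblock copies. A userspace model of just the RAID0 arithmetic, with a made-up two-device layout:

/* Userspace model of the RAID0 stripe math in btrfs_rmap_block(); not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct stripe_map {
	int num_stripes;
	uint64_t stripe_len;		/* bytes per stripe on one device */
	uint64_t chunk_start;		/* logical start of the chunk */
	uint64_t stripe_physical[4];	/* physical start per device stripe */
};

/* Map a physical address on device stripe i back to a logical address. */
static uint64_t rmap_raid0(const struct stripe_map *m, int i, uint64_t physical)
{
	uint64_t stripe_nr = (physical - m->stripe_physical[i]) / m->stripe_len;

	/* RAID0 interleaves stripes across devices in order. */
	stripe_nr = stripe_nr * m->num_stripes + i;
	return m->chunk_start + stripe_nr * m->stripe_len;
}

int main(void)
{
	/* Two-device RAID0, 64K stripes, chunk logically starting at 1M. */
	struct stripe_map m = {
		.num_stripes = 2,
		.stripe_len = 64 * 1024,
		.chunk_start = 1024 * 1024,
		.stripe_physical = { 0, 0 },	/* both device stripes start at 0 */
	};

	/* Second 64K stripe on device 1 -> logical chunk_start + 3 * 64K. */
	printf("logical %llu\n",
	       (unsigned long long)rmap_raid0(&m, 1, 64 * 1024));
	return 0;
}

In the example, the second 64 KiB stripe on device 1 is logical stripe 1 * 2 + 1 = 3, i.e. chunk_start + 192 KiB, which is the value printed.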
 static int exclude_super_stripes(struct btrfs_block_group *cache)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,6 +1767,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
        set_free_space_tree_thresholds(cache);
 
+       cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
@@ -1617,6 +1776,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
+       INIT_LIST_HEAD(&cache->discard_list);
        INIT_LIST_HEAD(&cache->dirty_list);
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
@@ -1775,7 +1935,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
                inc_block_group_ro(cache, 1);
        } else if (cache->used == 0) {
                ASSERT(list_empty(&cache->bg_list));
-               btrfs_mark_bg_unused(cache);
+               if (btrfs_test_opt(info, DISCARD_ASYNC))
+                       btrfs_discard_queue_work(&info->discard_ctl, cache);
+               else
+                       btrfs_mark_bg_unused(cache);
        }
        return 0;
 error:
@@ -2077,7 +2240,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
                }
        }
 
-       ret = inc_block_group_ro(cache, !do_chunk_alloc);
+       ret = inc_block_group_ro(cache, 0);
        if (!do_chunk_alloc)
                goto unlock_out;
        if (!ret)
@@ -2738,8 +2901,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                 * dirty list to avoid races between cleaner kthread and space
                 * cache writeout.
                 */
-               if (!alloc && old_val == 0)
-                       btrfs_mark_bg_unused(cache);
+               if (!alloc && old_val == 0) {
+                       if (!btrfs_test_opt(info, DISCARD_ASYNC))
+                               btrfs_mark_bg_unused(cache);
+               }
 
                btrfs_put_block_group(cache);
                total -= num_bytes;
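
This mirrors the change in read_one_block_group() above: a block group that drops to zero used bytes only goes straight onto the unused-bg list when async discard is off; with DISCARD_ASYNC the group is expected to flow through the discard queue first and be handed to the unused list once trimming completes. A compact sketch of that routing decision as it appears in read_one_block_group(), with hypothetical queue helpers:

/* Userspace model of routing an empty block group; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the two btrfs destinations. */
static void queue_for_async_discard(const char *bg) { printf("%s -> discard queue\n", bg); }
static void mark_bg_unused(const char *bg)          { printf("%s -> unused list\n", bg); }

static void on_block_group_emptied(const char *bg, bool discard_async)
{
	if (discard_async)
		queue_for_async_discard(bg);	/* unused list comes later, post-trim */
	else
		mark_bg_unused(bg);		/* cleaner deletes it directly */
}

int main(void)
{
	on_block_group_emptied("bg at 1M", true);
	on_block_group_emptied("bg at 2M", false);
	return 0;
}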