diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6934a5b8708fedd0dcd0c3f6bce201f0d795d1d5..404e050ce8eee36c0142706975b5648dd2af72c9 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -14,6 +14,8 @@
 #include "sysfs.h"
 #include "tree-log.h"
 #include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
 
 /*
  * Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
        return extended_to_chunk(flags | allowed);
 }
 
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 {
        unsigned seq;
        u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
        return btrfs_reduce_alloc_profile(fs_info, flags);
 }
 
-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
-       return get_alloc_profile(fs_info, orig_flags);
-}
-
 void btrfs_get_block_group(struct btrfs_block_group *cache)
 {
        atomic_inc(&cache->count);
@@ -131,6 +128,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);
 
+               /*
+                * A block_group shouldn't be on the discard_list anymore.
+                * Remove the block_group from the discard_list to prevent us
+                * from causing a panic due to NULL pointer dereference.
+                */
+               if (WARN_ON(!list_empty(&cache->discard_list)))
+                       btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+                                                 cache);
+
                /*
                 * If not empty, someone is still holding mutex of
                 * full_stripe_lock, which can only be released by caller.
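
The hunk above adds a defensive check to the final-put path: by the time the last reference to a block group is dropped it should already be off the discard list, and if it is not, the pending discard work is cancelled so nothing keeps walking a freed structure. A minimal userspace model of that "warn loudly, then recover" release pattern; the list helpers and names here are illustrative stand-ins, not the kernel's:

/* Userspace model of "warn and recover" on final put; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *n)        { n->prev = n->next = n; }
static bool list_empty(const struct node *n) { return n->next == n; }
static void list_del_init(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

struct block_group {
	int refs;
	struct node discard_link;	/* linked while queued for async discard */
};

static void put_block_group(struct block_group *bg)
{
	if (--bg->refs)
		return;
	/*
	 * Should already be unlinked here; if not, unlink it instead of
	 * freeing memory that a discard-list walker could still touch.
	 */
	if (!list_empty(&bg->discard_link)) {
		fprintf(stderr, "warning: final put with pending discard\n");
		list_del_init(&bg->discard_link);
	}
	free(bg);
}

int main(void)
{
	struct block_group *bg = calloc(1, sizeof(*bg));

	bg->refs = 1;
	list_init(&bg->discard_link);
	put_block_group(bg);	/* clean case: list is empty, no warning */
	return 0;
}

Re-initialising the node after unlinking (list_del_init) keeps a later emptiness check on the same node meaningful, which is what lets the release path treat "still linked" as a recoverable condition.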
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
-                       ret = btrfs_add_free_space(block_group, start,
-                                                  size);
+                       ret = btrfs_add_free_space_async_trimmed(block_group,
+                                                                start, size);
                        BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
        if (start < end) {
                size = end - start;
                total_added += size;
-               ret = btrfs_add_free_space(block_group, start, size);
+               ret = btrfs_add_free_space_async_trimmed(block_group, start,
+                                                        size);
                BUG_ON(ret); /* -ENOMEM or logic error */
        }
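
Both call sites in add_new_free_space() now go through btrfs_add_free_space_async_trimmed() instead of btrfs_add_free_space(), so free space discovered while caching a block group can be tracked as not-yet-trimmed and left to the async discard machinery rather than being assumed clean. The surrounding logic is a plain gap walk: add the hole before each allocated extent, then the tail after the last one. A small userspace model of that walk, using half-open ranges and a hypothetical add_untrimmed_free() sink in place of the btrfs call:

/* Userspace model of the gap walk in add_new_free_space(); not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t start, end; };	/* allocated, sorted, non-overlapping */

/* Hypothetical sink standing in for btrfs_add_free_space_async_trimmed(). */
static void add_untrimmed_free(uint64_t start, uint64_t size)
{
	printf("free (untrimmed): [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + size));
}

static uint64_t add_new_free_space(uint64_t start, uint64_t end,
				   const struct extent *ext, int nr)
{
	uint64_t total = 0;

	for (int i = 0; i < nr && start < end; i++) {
		if (ext[i].end <= start)
			continue;		/* extent entirely before the window */
		if (ext[i].start >= end)
			break;			/* extent entirely after the window */
		if (ext[i].start > start) {	/* gap before this extent */
			add_untrimmed_free(start, ext[i].start - start);
			total += ext[i].start - start;
		}
		start = ext[i].end;		/* skip over the allocated extent */
	}
	if (start < end) {			/* trailing gap after the last extent */
		add_untrimmed_free(start, end - start);
		total += end - start;
	}
	return total;
}

int main(void)
{
	const struct extent used[] = { { 16, 32 }, { 48, 64 } };

	add_new_free_space(0, 128, used, 2);	/* prints [0,16), [32,48), [64,128) */
	return 0;
}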
 
@@ -1184,22 +1191,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
-       u64 sinfo_used;
-       u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
-       /*
-        * We need some metadata space and system metadata space for
-        * allocating chunks in some corner cases until we force to set
-        * it to be readonly.
-        */
-       if ((sinfo->flags &
-            (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
-           !force)
-               min_allocable_bytes = SZ_1M;
-       else
-               min_allocable_bytes = 0;
-
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
 
@@ -1211,20 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
 
        num_bytes = cache->length - cache->reserved - cache->pinned -
                    cache->bytes_super - cache->used;
-       sinfo_used = btrfs_space_info_used(sinfo, true);
 
        /*
-        * sinfo_used + num_bytes should always <= sinfo->total_bytes.
-        *
-        * Here we make sure if we mark this bg RO, we still have enough
-        * free space as buffer (if min_allocable_bytes is not 0).
+        * Data never overcommits, even in mixed mode, so do just the straight
+        * check of left over space in how much we have allocated.
         */
-       if (sinfo_used + num_bytes + min_allocable_bytes <=
-           sinfo->total_bytes) {
+       if (force) {
+               ret = 0;
+       } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
+               u64 sinfo_used = btrfs_space_info_used(sinfo, true);
+
+               /*
+                * Here we make sure if we mark this bg RO, we still have enough
+                * free space as buffer.
+                */
+               if (sinfo_used + num_bytes <= sinfo->total_bytes)
+                       ret = 0;
+       } else {
+               /*
+                * We overcommit metadata, so we need to do the
+                * btrfs_can_overcommit check here, and we need to pass in
+                * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
+                * leeway to allow us to mark this block group as read only.
+                */
+               if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
+                                        BTRFS_RESERVE_NO_FLUSH))
+                       ret = 0;
+       }
+
+       if (!ret) {
                sinfo->bytes_readonly += num_bytes;
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
-               ret = 0;
        }
 out:
        spin_unlock(&cache->lock);
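
The rewritten inc_block_group_ro() drops the old min_allocable_bytes heuristic and decides per space type instead: force always succeeds, data (which never overcommits) only needs the plain "already used plus this group's free bytes still fits in total_bytes" check, and metadata/system rely on btrfs_can_overcommit() with BTRFS_RESERVE_NO_FLUSH for the most leeway. Here num_bytes is the group's currently unused space, which becomes bytes_readonly on success. A standalone sketch of that decision tree; the overcommit predicate below is a deliberately crude stand-in, not the real btrfs_can_overcommit():

/* Userspace model of the inc_block_group_ro() decision; not btrfs code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum space_type { SPACE_DATA, SPACE_METADATA, SPACE_SYSTEM };

struct space_info {
	enum space_type type;
	uint64_t total_bytes;
	uint64_t used_bytes;	/* stands in for btrfs_space_info_used() */
};

/* Crude stand-in for btrfs_can_overcommit(..., BTRFS_RESERVE_NO_FLUSH). */
static bool can_overcommit(const struct space_info *sinfo, uint64_t bytes)
{
	/* Pretend metadata may overcommit up to 2x of what is allocated. */
	return sinfo->used_bytes + bytes <= 2 * sinfo->total_bytes;
}

static int try_make_ro(const struct space_info *sinfo, uint64_t num_bytes,
		       bool force)
{
	if (force)
		return 0;
	if (sinfo->type == SPACE_DATA) {
		/* Data never overcommits: a straight leftover-space check. */
		return sinfo->used_bytes + num_bytes <= sinfo->total_bytes ?
		       0 : -1 /* -ENOSPC */;
	}
	/* Metadata/system: lean on the (modeled) overcommit logic instead. */
	return can_overcommit(sinfo, num_bytes) ? 0 : -1 /* -ENOSPC */;
}

int main(void)
{
	struct space_info meta = { SPACE_METADATA, 1024, 1000 };

	/* 100 bytes does not fit plainly, but the overcommit model allows it. */
	printf("metadata ro: %d\n", try_make_ro(&meta, 100, false));
	return 0;
}

In this toy run the metadata group fails the plain fit check (1000 + 100 > 1024) but the modeled overcommit allowance still lets it go read-only, which is why the data and metadata branches had to diverge.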
@@ -1232,9 +1243,6 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
        if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
                btrfs_info(cache->fs_info,
                        "unable to make block group %llu ro", cache->start);
-               btrfs_info(cache->fs_info,
-                       "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
-                       sinfo_used, num_bytes, min_allocable_bytes);
                btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
        }
        return ret;
@@ -1249,6 +1257,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
+       const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        int ret = 0;
 
        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1281,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                }
                spin_unlock(&fs_info->unused_bgs_lock);
 
+               btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
                mutex_lock(&fs_info->delete_unused_bgs_mutex);
 
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
+
+               /*
+                * Async discard moves the final block group discard to be prior
+                * to the unused_bgs code path.  Therefore, if it's not fully
+                * trimmed, punt it back to the async discard lists.
+                */
+               if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+                   !btrfs_is_free_space_trimmed(block_group)) {
+                       trace_btrfs_skip_unused_block_group(block_group);
+                       up_write(&space_info->groups_sem);
+                       /* Requeue if we failed because of async discard */
+                       btrfs_discard_queue_work(&fs_info->discard_ctl,
+                                                block_group);
+                       goto next;
+               }
+
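
With async discard, the final trim now happens before a group reaches the unused_bgs deletion path, so an unused group whose free space is not yet fully trimmed is traced, punted back to the discard lists with btrfs_discard_queue_work(), and skipped for this pass. A compact userspace model of that "requeue if not ready, otherwise delete" loop, with hypothetical helpers standing in for the btrfs queues:

/* Userspace model of "requeue if not fully trimmed"; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>

struct bg { const char *name; bool fully_trimmed; };

/* Hypothetical stand-ins for the btrfs discard/unused-bg handling. */
static void requeue_for_discard(struct bg *b) { printf("requeue %s for discard\n", b->name); }
static void delete_unused(struct bg *b)       { printf("delete %s\n", b->name); }

static void process_unused(struct bg *cands, int nr, bool async_discard)
{
	for (int i = 0; i < nr; i++) {
		struct bg *b = &cands[i];

		/*
		 * Async discard runs the final trim before deletion, so an
		 * untrimmed group is punted back instead of being deleted.
		 */
		if (async_discard && !b->fully_trimmed) {
			requeue_for_discard(b);
			continue;	/* "goto next" in the kernel loop */
		}
		delete_unused(b);
	}
}

int main(void)
{
	struct bg cands[] = { { "bg A", true }, { "bg B", false } };

	process_unused(cands, 2, true);
	return 0;
}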
                spin_lock(&block_group->lock);
                if (block_group->reserved || block_group->pinned ||
                    block_group->used || block_group->ro ||
@@ -1347,6 +1374,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
 
+               /*
+                * At this point, the block_group is read only and should fail
+                * new allocations.  However, btrfs_finish_extent_commit() can
+                * cause this block_group to be placed back on the discard
+                * lists because now the block_group isn't fully discarded.
+                * Bail here and try again later after discarding everything.
+                */
+               spin_lock(&fs_info->discard_ctl.lock);
+               if (!list_empty(&block_group->discard_list)) {
+                       spin_unlock(&fs_info->discard_ctl.lock);
+                       btrfs_dec_block_group_ro(block_group);
+                       btrfs_discard_queue_work(&fs_info->discard_ctl,
+                                                block_group);
+                       goto end_trans;
+               }
+               spin_unlock(&fs_info->discard_ctl.lock);
+
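
The re-check above exists because btrfs_finish_extent_commit() can put the group back on a discard list between the earlier btrfs_discard_cancel_work() and this point; if that happened, the group's read-only marking is undone, it is requeued for discard, and deletion is retried on a later pass. A minimal pthread sketch of the same "re-check under the lock, undo and back off" idiom; every helper name here is a stand-in:

/* Userspace model of "re-check under the lock, undo, retry later". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t discard_lock = PTHREAD_MUTEX_INITIALIZER;
static bool on_discard_list;	/* written by a concurrent "commit" path */

/* Hypothetical stand-ins for the btrfs undo/requeue/delete steps. */
static void undo_read_only(void)     { puts("drop ro"); }
static void requeue_discard(void)    { puts("requeue for discard"); }
static void delete_block_group(void) { puts("delete block group"); }

static void try_delete(void)
{
	pthread_mutex_lock(&discard_lock);
	if (on_discard_list) {
		/* The group came back between the first check and now. */
		pthread_mutex_unlock(&discard_lock);
		undo_read_only();
		requeue_discard();
		return;		/* try again on a later pass */
	}
	pthread_mutex_unlock(&discard_lock);
	delete_block_group();
}

int main(void)
{
	on_discard_list = true;	/* simulate the race having happened */
	try_delete();
	return 0;
}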
                /* Reset pinned so btrfs_put_block_group doesn't complain */
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);
@@ -1362,8 +1406,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
 
+               /*
+                * The normal path here is an unused block group is passed here,
+                * then trimming is handled in the transaction commit path.
+                * Async discard interposes before this to do the trimming
+                * before coming down the unused block group path as trimming
+                * will no longer be done later in the transaction commit path.
+                */
+               if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+                       goto flip_async;
+
                /* DISCARD can flip during remount */
-               trimming = btrfs_test_opt(fs_info, DISCARD);
+               trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
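
Both lines in this region guard against mount options changing underneath the loop: async_trim_enabled was sampled when btrfs_delete_unused_bgs() started, so "!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)" catches async discard being switched on by a remount mid-pass (the group is then handed off via flip_async), and trimming is re-read from DISCARD_SYNC for the same reason, as the in-line comment notes. A small C11 sketch of the "snapshot at entry, compare with the live value" idiom, with an atomic flag standing in for the mount option:

/* Userspace model of "option sampled at entry vs. live option"; not btrfs code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool discard_async;	/* stands in for the mount option */

static void delete_unused_pass(void)
{
	/* Snapshot at entry, like async_trim_enabled in the kernel loop. */
	bool async_at_entry = atomic_load(&discard_async);

	/* ... long-running work; a remount may flip the option here ... */
	atomic_store(&discard_async, true);	/* simulate the remount */

	if (!async_at_entry && atomic_load(&discard_async)) {
		/* Async discard appeared mid-pass: hand the work over to it. */
		puts("flip_async: punt remaining groups to async discard");
		return;
	}
	puts("finish the pass under the original settings");
}

int main(void)
{
	atomic_store(&discard_async, false);
	delete_unused_pass();
	return 0;
}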
 
                /* Implicit trim during transaction commit. */
                if (trimming)
@@ -1406,6 +1460,13 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_lock(&fs_info->unused_bgs_lock);
        }
        spin_unlock(&fs_info->unused_bgs_lock);
+       return;
+
+flip_async:
+       btrfs_end_transaction(trans);
+       mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+       btrfs_put_block_group(block_group);
+       btrfs_discard_punt_unused_bgs_list(fs_info);
 }
 
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1577,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        write_sequnlock(&fs_info->profiles_lock);
 }
 
+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start:   logical address of block group
+ * @physical:     physical address to map to logical addresses
+ * @logical:      return array of logical addresses which map to @physical
+ * @naddrs:       length of @logical
+ * @stripe_len:    size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+                    u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+       struct extent_map *em;
+       struct map_lookup *map;
+       u64 *buf;
+       u64 bytenr;
+       u64 data_stripe_length;
+       u64 io_stripe_size;
+       int i, nr = 0;
+       int ret = 0;
+
+       em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+       if (IS_ERR(em))
+               return -EIO;
+
+       map = em->map_lookup;
+       data_stripe_length = em->len;
+       io_stripe_size = map->stripe_len;
+
+       if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+               data_stripe_length = div_u64(data_stripe_length,
+                                            map->num_stripes / map->sub_stripes);
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+               data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
+       else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+               data_stripe_length = div_u64(data_stripe_length,
+                                            nr_data_stripes(map));
+               io_stripe_size = map->stripe_len * nr_data_stripes(map);
+       }
+
+       buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+       if (!buf) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       for (i = 0; i < map->num_stripes; i++) {
+               bool already_inserted = false;
+               u64 stripe_nr;
+               int j;
+
+               if (!in_range(physical, map->stripes[i].physical,
+                             data_stripe_length))
+                       continue;
+
+               stripe_nr = physical - map->stripes[i].physical;
+               stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+               if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+                       stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+               } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+                       stripe_nr = stripe_nr * map->num_stripes + i;
+               }
+               /*
+                * The remaining case would be for RAID56, multiply by
+                * nr_data_stripes().  Alternatively, just use rmap_len below
+                * instead of map->stripe_len
+                */
+
+               bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+               /* Ensure we don't add duplicate addresses */
+               for (j = 0; j < nr; j++) {
+                       if (buf[j] == bytenr) {
+                               already_inserted = true;
+                               break;
+                       }
+               }
+
+               if (!already_inserted)
+                       buf[nr++] = bytenr;
+       }
+
+       *logical = buf;
+       *naddrs = nr;
+       *stripe_len = io_stripe_size;
+out:
+       free_extent_map(em);
+       return ret;
+}
+
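
btrfs_rmap_block() inverts the usual chunk mapping: for every device stripe that contains @physical, the offset inside that stripe is turned into a stripe number, adjusted for RAID0/RAID10 interleaving (RAID56 is covered by the larger io_stripe_size instead), converted back to chunk_start + stripe_nr * io_stripe_size, and de-duplicated, since mirrored copies resolve to the same logical address. exclude_super_stripes() below uses this to find the logical ranges covered by superblock copies. A userspace model of just the RAID0 arithmetic, with a made-up two-device layout:

/* Userspace model of the RAID0 stripe math in btrfs_rmap_block(); not btrfs code. */
#include <stdint.h>
#include <stdio.h>

struct stripe_map {
	int num_stripes;
	uint64_t stripe_len;		/* bytes per stripe on one device */
	uint64_t chunk_start;		/* logical start of the chunk */
	uint64_t stripe_physical[4];	/* physical start per device stripe */
};

/* Map a physical address on device stripe i back to a logical address. */
static uint64_t rmap_raid0(const struct stripe_map *m, int i, uint64_t physical)
{
	uint64_t stripe_nr = (physical - m->stripe_physical[i]) / m->stripe_len;

	/* RAID0 interleaves stripes across devices in order. */
	stripe_nr = stripe_nr * m->num_stripes + i;
	return m->chunk_start + stripe_nr * m->stripe_len;
}

int main(void)
{
	/* Two-device RAID0, 64K stripes, chunk logically starting at 1M. */
	struct stripe_map m = {
		.num_stripes = 2,
		.stripe_len = 64 * 1024,
		.chunk_start = 1024 * 1024,
		.stripe_physical = { 0, 0 },	/* both device stripes start at 0 */
	};

	/* Second 64K stripe on device 1 -> logical chunk_start + 3 * 64K. */
	printf("logical %llu\n",
	       (unsigned long long)rmap_raid0(&m, 1, 64 * 1024));
	return 0;
}

In the example, the second 64 KiB stripe on device 1 is logical stripe 1 * 2 + 1 = 3, i.e. chunk_start + 192 KiB, which is the value printed.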
 static int exclude_super_stripes(struct btrfs_block_group *cache)
 {
        struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,6 +1767,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
        set_free_space_tree_thresholds(cache);
 
+       cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);
        init_rwsem(&cache->data_rwsem);
@@ -1617,6 +1776,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
        INIT_LIST_HEAD(&cache->cluster_list);
        INIT_LIST_HEAD(&cache->bg_list);
        INIT_LIST_HEAD(&cache->ro_list);
+       INIT_LIST_HEAD(&cache->discard_list);
        INIT_LIST_HEAD(&cache->dirty_list);
        INIT_LIST_HEAD(&cache->io_list);
        btrfs_init_free_space_ctl(cache);
@@ -1775,7 +1935,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
                inc_block_group_ro(cache, 1);
        } else if (cache->used == 0) {
                ASSERT(list_empty(&cache->bg_list));
-               btrfs_mark_bg_unused(cache);
+               if (btrfs_test_opt(info, DISCARD_ASYNC))
+                       btrfs_discard_queue_work(&info->discard_ctl, cache);
+               else
+                       btrfs_mark_bg_unused(cache);
        }
        return 0;
 error:
@@ -2077,7 +2240,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
                }
        }
 
-       ret = inc_block_group_ro(cache, !do_chunk_alloc);
+       ret = inc_block_group_ro(cache, 0);
        if (!do_chunk_alloc)
                goto unlock_out;
        if (!ret)
@@ -2738,8 +2901,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
                 * dirty list to avoid races between cleaner kthread and space
                 * cache writeout.
                 */
-               if (!alloc && old_val == 0)
-                       btrfs_mark_bg_unused(cache);
+               if (!alloc && old_val == 0) {
+                       if (!btrfs_test_opt(info, DISCARD_ASYNC))
+                               btrfs_mark_bg_unused(cache);
+               }
 
                btrfs_put_block_group(cache);
                total -= num_bytes;
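
This mirrors the change in read_one_block_group() above: a block group that drops to zero used bytes only goes straight onto the unused-bg list when async discard is off; with DISCARD_ASYNC the group is expected to flow through the discard queue first and be handed to the unused list once trimming completes. A compact sketch of that routing decision as it appears in read_one_block_group(), with hypothetical queue helpers:

/* Userspace model of routing an empty block group; not btrfs code. */
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the two btrfs destinations. */
static void queue_for_async_discard(const char *bg) { printf("%s -> discard queue\n", bg); }
static void mark_bg_unused(const char *bg)          { printf("%s -> unused list\n", bg); }

static void on_block_group_emptied(const char *bg, bool discard_async)
{
	if (discard_async)
		queue_for_async_discard(bg);	/* unused list comes later, post-trim */
	else
		mark_bg_unused(bg);		/* cleaner deletes it directly */
}

int main(void)
{
	on_block_group_emptied("bg at 1M", true);
	on_block_group_emptied("bg at 2M", false);
	return 0;
}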