Merge branch 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc60ed7e940ead5ae7d7332ee9f64b9ffe922aca..e5ef40c603ca36d64fbbf3dc965fb4ac63db9ae4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -143,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
-       int freeze_depth;
-
-       freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-       if (freeze_depth == 1) {
+       mutex_lock(&q->mq_freeze_lock);
+       if (++q->mq_freeze_depth == 1) {
                percpu_ref_kill(&q->q_usage_counter);
+               mutex_unlock(&q->mq_freeze_lock);
                if (queue_is_mq(q))
                        blk_mq_run_hw_queues(q, false);
+       } else {
+               mutex_unlock(&q->mq_freeze_lock);
        }
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -198,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-       int freeze_depth;
-
-       freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-       WARN_ON_ONCE(freeze_depth < 0);
-       if (!freeze_depth) {
+       mutex_lock(&q->mq_freeze_lock);
+       q->mq_freeze_depth--;
+       WARN_ON_ONCE(q->mq_freeze_depth < 0);
+       if (!q->mq_freeze_depth) {
                percpu_ref_resurrect(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
+       mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
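For orientation, a minimal userspace sketch (not part of the patch) of the counting pattern the two hunks above switch to: the freeze depth becomes a plain integer serialized by a mutex, so the first freezer and the last unfreezer cannot race on killing and resurrecting the usage counter. kill_resource() and resurrect_resource() are hypothetical stand-ins for percpu_ref_kill() and percpu_ref_resurrect().

#include <pthread.h>
#include <assert.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static int freeze_depth;                     /* plain int, guarded by freeze_lock */

static void kill_resource(void)      { /* stop admitting new users */ }
static void resurrect_resource(void) { /* admit new users again */ }

void freeze_start(void)
{
        pthread_mutex_lock(&freeze_lock);
        if (++freeze_depth == 1)             /* only the first freezer kills */
                kill_resource();
        pthread_mutex_unlock(&freeze_lock);
}

void unfreeze(void)
{
        pthread_mutex_lock(&freeze_lock);
        assert(freeze_depth > 0);            /* mirrors the WARN_ON_ONCE above */
        if (--freeze_depth == 0)             /* only the last unfreezer resurrects */
                resurrect_resource();
        pthread_mutex_unlock(&freeze_lock);
}

In the patch itself, mq_freeze_lock is dropped before blk_mq_run_hw_queues() on the first-freeze path, so queue running does not happen under the lock.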
 
@@ -353,13 +355,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
        struct elevator_queue *e = q->elevator;
        struct request *rq;
        unsigned int tag;
-       bool put_ctx_on_error = false;
+       bool clear_ctx_on_error = false;
 
        blk_queue_enter_live(q);
        data->q = q;
        if (likely(!data->ctx)) {
                data->ctx = blk_mq_get_ctx(q);
-               put_ctx_on_error = true;
+               clear_ctx_on_error = true;
        }
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->cmd_flags,
@@ -385,10 +387,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_TAG_FAIL) {
-               if (put_ctx_on_error) {
-                       blk_mq_put_ctx(data->ctx);
+               if (clear_ctx_on_error)
                        data->ctx = NULL;
-               }
                blk_queue_exit(q);
                return NULL;
        }
@@ -425,8 +425,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
 
-       blk_mq_put_ctx(alloc_data.ctx);
-
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
@@ -1762,9 +1760,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
        }
 }
 
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
+               unsigned int nr_segs)
 {
-       blk_init_request_from_bio(rq, bio);
+       if (bio->bi_opf & REQ_RAHEAD)
+               rq->cmd_flags |= REQ_FAILFAST_MASK;
+
+       rq->__sector = bio->bi_iter.bi_sector;
+       rq->write_hint = bio->bi_write_hint;
+       blk_rq_bio_prep(rq, bio, nr_segs);
 
        blk_account_io_start(rq, true);
 }
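The hunk above open-codes what blk_init_request_from_bio() used to do and consumes the segment count that __blk_queue_split() already computed. A stand-alone sketch of the same shape, using hypothetical stub types and flag values rather than the real struct bio and struct request:

#include <stdbool.h>
#include <stdint.h>

struct bio_s { uint64_t sector; unsigned int opf; unsigned short write_hint; };
struct req_s {
        uint64_t sector;
        unsigned int cmd_flags;
        unsigned short write_hint;
        unsigned int nr_phys_segments;
        struct bio_s *bio;
};

#define REQ_RAHEAD        (1u << 0)          /* hypothetical bit values */
#define REQ_FAILFAST_MASK (0x7u << 1)

static void account_io_start(struct req_s *rq, bool new_io) { (void)rq; (void)new_io; }

void bio_to_request(struct req_s *rq, struct bio_s *bio, unsigned int nr_segs)
{
        /* read-ahead I/O is best-effort: let it fail fast rather than retry */
        if (bio->opf & REQ_RAHEAD)
                rq->cmd_flags |= REQ_FAILFAST_MASK;

        rq->sector = bio->sector;
        rq->write_hint = bio->write_hint;
        rq->nr_phys_segments = nr_segs;      /* segment count computed once at split time */
        rq->bio = bio;

        account_io_start(rq, true);
}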
@@ -1934,20 +1938,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
+       unsigned int nr_segs;
        blk_qc_t cookie;
 
        blk_queue_bounce(q, &bio);
-
-       blk_queue_split(q, &bio);
+       __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                return BLK_QC_T_NONE;
 
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, &same_queue_rq))
+           blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
                return BLK_QC_T_NONE;
 
-       if (blk_mq_sched_bio_merge(q, bio))
+       if (blk_mq_sched_bio_merge(q, bio, nr_segs))
                return BLK_QC_T_NONE;
 
        rq_qos_throttle(q, bio);
@@ -1967,11 +1971,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        cookie = request_to_qc_t(data.hctx, rq);
 
+       blk_mq_bio_to_request(rq, bio, nr_segs);
+
        plug = current->plug;
        if (unlikely(is_flush_fua)) {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
-
                /* bypass scheduler for flush rq */
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(data.hctx, true);
@@ -1983,9 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                unsigned int request_count = plug->rq_count;
                struct request *last = NULL;
 
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
-
                if (!request_count)
                        trace_block_plug(q);
                else
@@ -1999,8 +1999,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
                blk_add_rq_to_plug(plug, rq);
        } else if (plug && !blk_queue_nomerges(q)) {
-               blk_mq_bio_to_request(rq, bio);
-
                /*
                 * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
@@ -2017,8 +2015,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_add_rq_to_plug(plug, rq);
                trace_block_plug(q);
 
-               blk_mq_put_ctx(data.ctx);
-
                if (same_queue_rq) {
                        data.hctx = same_queue_rq->mq_hctx;
                        trace_block_unplug(q, 1, true);
@@ -2027,12 +2023,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
                        !data.hctx->dispatch_busy)) {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
                blk_mq_try_issue_directly(data.hctx, rq, &cookie);
        } else {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
                blk_mq_sched_insert_request(rq, false, true, true);
        }
 
@@ -2062,7 +2054,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                list_del_init(&page->lru);
                /*
                 * Remove kmemleak object previously allocated in
-                * blk_mq_init_rq_map().
+                * blk_mq_alloc_rqs().
                 */
                kmemleak_free(page_address(page));
                __free_pages(page, page->private);
@@ -2267,12 +2259,11 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
-       if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->srcu);
-
        blk_mq_remove_cpuhp(hctx);
-       blk_free_flush_queue(hctx->fq);
-       sbitmap_free(&hctx->ctx_map);
+
+       spin_lock(&q->unused_hctx_lock);
+       list_add(&hctx->hctx_list, &q->unused_hctx_list);
+       spin_unlock(&q->unused_hctx_lock);
 }
 
 static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -2289,15 +2280,65 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
        }
 }
 
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
+{
+       int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
+
+       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
+                          __alignof__(struct blk_mq_hw_ctx)) !=
+                    sizeof(struct blk_mq_hw_ctx));
+
+       if (tag_set->flags & BLK_MQ_F_BLOCKING)
+               hw_ctx_size += sizeof(struct srcu_struct);
+
+       return hw_ctx_size;
+}
+
 static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
-       int node;
+       hctx->queue_num = hctx_idx;
 
-       node = hctx->numa_node;
+       cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
+
+       hctx->tags = set->tags[hctx_idx];
+
+       if (set->ops->init_hctx &&
+           set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
+               goto unregister_cpu_notifier;
+
+       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
+                               hctx->numa_node))
+               goto exit_hctx;
+       return 0;
+
+ exit_hctx:
+       if (set->ops->exit_hctx)
+               set->ops->exit_hctx(hctx, hctx_idx);
+ unregister_cpu_notifier:
+       blk_mq_remove_cpuhp(hctx);
+       return -1;
+}
+
+static struct blk_mq_hw_ctx *
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
+               int node)
+{
+       struct blk_mq_hw_ctx *hctx;
+       gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+       hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+       if (!hctx)
+               goto fail_alloc_hctx;
+
+       if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
+               goto free_hctx;
+
+       atomic_set(&hctx->nr_active, 0);
        if (node == NUMA_NO_NODE)
-               node = hctx->numa_node = set->numa_node;
+               node = set->numa_node;
+       hctx->numa_node = node;
 
        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
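blk_mq_hw_ctx_size() above relies on srcu being the aligned last member of struct blk_mq_hw_ctx, so space for it can simply be appended when BLK_MQ_F_BLOCKING is set and omitted otherwise. A userspace sketch of that sizing trick, with hypothetical type and flag names:

#include <stdalign.h>
#include <stddef.h>
#include <stdlib.h>

struct rcu_like { int state; };              /* stand-in for struct srcu_struct */

struct hw_ctx {
        int flags;
        struct rcu_like srcu[];              /* must remain the last member */
};

#define F_BLOCKING 0x1
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

/* same invariant the BUILD_BUG_ON above enforces */
_Static_assert(ALIGN_UP(offsetof(struct hw_ctx, srcu),
                        alignof(struct hw_ctx)) == sizeof(struct hw_ctx),
               "srcu must sit at the aligned end of struct hw_ctx");

static size_t hw_ctx_size(int flags)
{
        size_t sz = sizeof(struct hw_ctx);

        if (flags & F_BLOCKING)
                sz += sizeof(struct rcu_like);   /* tail space only when needed */
        return sz;
}

struct hw_ctx *alloc_hw_ctx(int flags)
{
        struct hw_ctx *h = calloc(1, hw_ctx_size(flags));

        if (h)
                h->flags = flags;
        return h;
}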
@@ -2305,58 +2346,47 @@ static int blk_mq_init_hctx(struct request_queue *q,
        hctx->queue = q;
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
-       cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
-
-       hctx->tags = set->tags[hctx_idx];
+       INIT_LIST_HEAD(&hctx->hctx_list);
 
        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
+                       gfp, node);
        if (!hctx->ctxs)
-               goto unregister_cpu_notifier;
+               goto free_cpumask;
 
        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
-                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
+                               gfp, node))
                goto free_ctxs;
-
        hctx->nr_ctx = 0;
 
        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
 
-       if (set->ops->init_hctx &&
-           set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
-               goto free_bitmap;
-
        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
+                       gfp);
        if (!hctx->fq)
-               goto exit_hctx;
-
-       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
-               goto free_fq;
+               goto free_bitmap;
 
        if (hctx->flags & BLK_MQ_F_BLOCKING)
                init_srcu_struct(hctx->srcu);
+       blk_mq_hctx_kobj_init(hctx);
 
-       return 0;
+       return hctx;
 
- free_fq:
-       blk_free_flush_queue(hctx->fq);
- exit_hctx:
-       if (set->ops->exit_hctx)
-               set->ops->exit_hctx(hctx, hctx_idx);
  free_bitmap:
        sbitmap_free(&hctx->ctx_map);
  free_ctxs:
        kfree(hctx->ctxs);
- unregister_cpu_notifier:
-       blk_mq_remove_cpuhp(hctx);
-       return -1;
+ free_cpumask:
+       free_cpumask_var(hctx->cpumask);
+ free_hctx:
+       kfree(hctx);
+ fail_alloc_hctx:
+       return NULL;
 }
 
 static void blk_mq_init_cpu_queues(struct request_queue *q,
@@ -2631,13 +2661,17 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
  */
 void blk_mq_release(struct request_queue *q)
 {
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
+       struct blk_mq_hw_ctx *hctx, *next;
+       int i;
 
-       /* hctx kobj stays in hctx */
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (!hctx)
-                       continue;
+       cancel_delayed_work_sync(&q->requeue_work);
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
+
+       /* all hctx are in .unused_hctx_list now */
+       list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
+               list_del_init(&hctx->hctx_list);
                kobject_put(&hctx->kobj);
        }
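Taken together, the blk_mq_exit_hctx hunk above (which now parks a retired hctx on q->unused_hctx_list instead of freeing it), the blk_mq_release hunk above (which finally drops the parked objects), and the blk_mq_alloc_and_init_hctx hunk below (which prefers a parked hctx on the requested NUMA node) form a simple per-queue freelist. A minimal userspace sketch of that reuse pattern, with hypothetical types:

#include <pthread.h>
#include <stdlib.h>

struct hctx {
        int numa_node;
        struct hctx *next;                   /* stand-in for the hctx_list linkage */
};

struct queue {
        pthread_mutex_t unused_lock;         /* stand-in for unused_hctx_lock */
        struct hctx *unused;                 /* stand-in for unused_hctx_list */
};

/* retiring a context parks it instead of freeing it */
void park_hctx(struct queue *q, struct hctx *h)
{
        pthread_mutex_lock(&q->unused_lock);
        h->next = q->unused;
        q->unused = h;
        pthread_mutex_unlock(&q->unused_lock);
}

/* prefer a parked context on the requested node before allocating a new one */
struct hctx *get_hctx(struct queue *q, int node)
{
        struct hctx **pp, *h = NULL;

        pthread_mutex_lock(&q->unused_lock);
        for (pp = &q->unused; *pp; pp = &(*pp)->next) {
                if ((*pp)->numa_node == node) {
                        h = *pp;
                        *pp = h->next;
                        break;
                }
        }
        pthread_mutex_unlock(&q->unused_lock);

        if (!h) {
                h = calloc(1, sizeof(*h));
                if (h)
                        h->numa_node = node;
        }
        return h;
}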
 
@@ -2700,51 +2734,38 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 }
 EXPORT_SYMBOL(blk_mq_init_sq_queue);
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-       int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-                          __alignof__(struct blk_mq_hw_ctx)) !=
-                    sizeof(struct blk_mq_hw_ctx));
-
-       if (tag_set->flags & BLK_MQ_F_BLOCKING)
-               hw_ctx_size += sizeof(struct srcu_struct);
-
-       return hw_ctx_size;
-}
-
 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
                struct blk_mq_tag_set *set, struct request_queue *q,
                int hctx_idx, int node)
 {
-       struct blk_mq_hw_ctx *hctx;
-
-       hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                       node);
-       if (!hctx)
-               return NULL;
+       struct blk_mq_hw_ctx *hctx = NULL, *tmp;
 
-       if (!zalloc_cpumask_var_node(&hctx->cpumask,
-                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                               node)) {
-               kfree(hctx);
-               return NULL;
+       /* reuse dead hctx first */
+       spin_lock(&q->unused_hctx_lock);
+       list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
+               if (tmp->numa_node == node) {
+                       hctx = tmp;
+                       break;
+               }
        }
+       if (hctx)
+               list_del_init(&hctx->hctx_list);
+       spin_unlock(&q->unused_hctx_lock);
 
-       atomic_set(&hctx->nr_active, 0);
-       hctx->numa_node = node;
-       hctx->queue_num = hctx_idx;
+       if (!hctx)
+               hctx = blk_mq_alloc_hctx(q, set, node);
+       if (!hctx)
+               goto fail;
 
-       if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
-               free_cpumask_var(hctx->cpumask);
-               kfree(hctx);
-               return NULL;
-       }
-       blk_mq_hctx_kobj_init(hctx);
+       if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
+               goto free_hctx;
 
        return hctx;
+
+ free_hctx:
+       kobject_put(&hctx->kobj);
+ fail:
+       return NULL;
 }
 
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
@@ -2770,10 +2791,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
                hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
                if (hctx) {
-                       if (hctxs[i]) {
+                       if (hctxs[i])
                                blk_mq_exit_hctx(q, set, hctxs[i], i);
-                               kobject_put(&hctxs[i]->kobj);
-                       }
                        hctxs[i] = hctx;
                } else {
                        if (hctxs[i])
@@ -2804,9 +2823,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                        if (hctx->tags)
                                blk_mq_free_map_and_requests(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
-                       kobject_put(&hctx->kobj);
                        hctxs[j] = NULL;
-
                }
        }
        mutex_unlock(&q->sysfs_lock);
@@ -2838,7 +2855,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                goto err_exit;
 
        if (blk_mq_alloc_ctxs(q))
-               goto err_exit;
+               goto err_poll;
 
        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);
@@ -2849,6 +2866,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        if (!q->queue_hw_ctx)
                goto err_sys_init;
 
+       INIT_LIST_HEAD(&q->unused_hctx_list);
+       spin_lock_init(&q->unused_hctx_lock);
+
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
                goto err_hctxs;
@@ -2899,13 +2919,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        kfree(q->queue_hw_ctx);
 err_sys_init:
        blk_mq_sysfs_deinit(q);
+err_poll:
+       blk_stat_free_callback(q->poll_cb);
+       q->poll_cb = NULL;
 err_exit:
        q->mq_ops = NULL;
        return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
-void blk_mq_free_queue(struct request_queue *q)
+/* tags can _not_ be used after returning from blk_mq_exit_queue */
+void blk_mq_exit_queue(struct request_queue *q)
 {
        struct blk_mq_tag_set   *set = q->tag_set;