Merge branch 'for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc60ed7e940ead5ae7d7332ee9f64b9ffe922aca..e5ef40c603ca36d64fbbf3dc965fb4ac63db9ae4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Block multiqueue core code
  *
@@ -143,13 +144,14 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 
 void blk_freeze_queue_start(struct request_queue *q)
 {
-       int freeze_depth;
-
-       freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
-       if (freeze_depth == 1) {
+       mutex_lock(&q->mq_freeze_lock);
+       if (++q->mq_freeze_depth == 1) {
                percpu_ref_kill(&q->q_usage_counter);
+               mutex_unlock(&q->mq_freeze_lock);
                if (queue_is_mq(q))
                        blk_mq_run_hw_queues(q, false);
+       } else {
+               mutex_unlock(&q->mq_freeze_lock);
        }
 }
 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
@@ -198,14 +200,14 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 
 void blk_mq_unfreeze_queue(struct request_queue *q)
 {
-       int freeze_depth;
-
-       freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
-       WARN_ON_ONCE(freeze_depth < 0);
-       if (!freeze_depth) {
+       mutex_lock(&q->mq_freeze_lock);
+       q->mq_freeze_depth--;
+       WARN_ON_ONCE(q->mq_freeze_depth < 0);
+       if (!q->mq_freeze_depth) {
                percpu_ref_resurrect(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }
+       mutex_unlock(&q->mq_freeze_lock);
 }
 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
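For orientation, a minimal userspace sketch (not part of the patch) of the counting pattern the two hunks above switch to: the freeze depth becomes a plain integer serialized by a mutex, so the first freezer and the last unfreezer cannot race on killing and resurrecting the usage counter. kill_resource() and resurrect_resource() are hypothetical stand-ins for percpu_ref_kill() and percpu_ref_resurrect().

#include <pthread.h>
#include <assert.h>

static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER;
static int freeze_depth;                     /* plain int, guarded by freeze_lock */

static void kill_resource(void)      { /* stop admitting new users */ }
static void resurrect_resource(void) { /* admit new users again */ }

void freeze_start(void)
{
        pthread_mutex_lock(&freeze_lock);
        if (++freeze_depth == 1)             /* only the first freezer kills */
                kill_resource();
        pthread_mutex_unlock(&freeze_lock);
}

void unfreeze(void)
{
        pthread_mutex_lock(&freeze_lock);
        assert(freeze_depth > 0);            /* mirrors the WARN_ON_ONCE above */
        if (--freeze_depth == 0)             /* only the last unfreezer resurrects */
                resurrect_resource();
        pthread_mutex_unlock(&freeze_lock);
}

In the patch itself, mq_freeze_lock is dropped before blk_mq_run_hw_queues() on the first-freeze path, so queue running does not happen under the lock.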
 
@@ -353,13 +355,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
        struct elevator_queue *e = q->elevator;
        struct request *rq;
        unsigned int tag;
-       bool put_ctx_on_error = false;
+       bool clear_ctx_on_error = false;
 
        blk_queue_enter_live(q);
        data->q = q;
        if (likely(!data->ctx)) {
                data->ctx = blk_mq_get_ctx(q);
-               put_ctx_on_error = true;
+               clear_ctx_on_error = true;
        }
        if (likely(!data->hctx))
                data->hctx = blk_mq_map_queue(q, data->cmd_flags,
@@ -385,10 +387,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_TAG_FAIL) {
-               if (put_ctx_on_error) {
-                       blk_mq_put_ctx(data->ctx);
+               if (clear_ctx_on_error)
                        data->ctx = NULL;
-               }
                blk_queue_exit(q);
                return NULL;
        }
@@ -425,8 +425,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (!rq)
                return ERR_PTR(-EWOULDBLOCK);
 
-       blk_mq_put_ctx(alloc_data.ctx);
-
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
@@ -1762,9 +1760,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
        }
 }
 
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
+               unsigned int nr_segs)
 {
-       blk_init_request_from_bio(rq, bio);
+       if (bio->bi_opf & REQ_RAHEAD)
+               rq->cmd_flags |= REQ_FAILFAST_MASK;
+
+       rq->__sector = bio->bi_iter.bi_sector;
+       rq->write_hint = bio->bi_write_hint;
+       blk_rq_bio_prep(rq, bio, nr_segs);
 
        blk_account_io_start(rq, true);
 }
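The hunk above open-codes what blk_init_request_from_bio() used to do and consumes the segment count that __blk_queue_split() already computed. A stand-alone sketch of the same shape, using hypothetical stub types and flag values rather than the real struct bio and struct request:

#include <stdbool.h>
#include <stdint.h>

struct bio_s { uint64_t sector; unsigned int opf; unsigned short write_hint; };
struct req_s {
        uint64_t sector;
        unsigned int cmd_flags;
        unsigned short write_hint;
        unsigned int nr_phys_segments;
        struct bio_s *bio;
};

#define REQ_RAHEAD        (1u << 0)          /* hypothetical bit values */
#define REQ_FAILFAST_MASK (0x7u << 1)

static void account_io_start(struct req_s *rq, bool new_io) { (void)rq; (void)new_io; }

void bio_to_request(struct req_s *rq, struct bio_s *bio, unsigned int nr_segs)
{
        /* read-ahead I/O is best-effort: let it fail fast rather than retry */
        if (bio->opf & REQ_RAHEAD)
                rq->cmd_flags |= REQ_FAILFAST_MASK;

        rq->sector = bio->sector;
        rq->write_hint = bio->write_hint;
        rq->nr_phys_segments = nr_segs;      /* segment count computed once at split time */
        rq->bio = bio;

        account_io_start(rq, true);
}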
@@ -1934,20 +1938,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        struct request *rq;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
+       unsigned int nr_segs;
        blk_qc_t cookie;
 
        blk_queue_bounce(q, &bio);
-
-       blk_queue_split(q, &bio);
+       __blk_queue_split(q, &bio, &nr_segs);
 
        if (!bio_integrity_prep(bio))
                return BLK_QC_T_NONE;
 
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, &same_queue_rq))
+           blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
                return BLK_QC_T_NONE;
 
-       if (blk_mq_sched_bio_merge(q, bio))
+       if (blk_mq_sched_bio_merge(q, bio, nr_segs))
                return BLK_QC_T_NONE;
 
        rq_qos_throttle(q, bio);
@@ -1967,11 +1971,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
        cookie = request_to_qc_t(data.hctx, rq);
 
+       blk_mq_bio_to_request(rq, bio, nr_segs);
+
        plug = current->plug;
        if (unlikely(is_flush_fua)) {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
-
                /* bypass scheduler for flush rq */
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(data.hctx, true);
@@ -1983,9 +1986,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                unsigned int request_count = plug->rq_count;
                struct request *last = NULL;
 
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
-
                if (!request_count)
                        trace_block_plug(q);
                else
@@ -1999,8 +1999,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
                blk_add_rq_to_plug(plug, rq);
        } else if (plug && !blk_queue_nomerges(q)) {
-               blk_mq_bio_to_request(rq, bio);
-
                /*
                 * We do limited plugging. If the bio can be merged, do that.
                 * Otherwise the existing request in the plug list will be
@@ -2017,8 +2015,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                blk_add_rq_to_plug(plug, rq);
                trace_block_plug(q);
 
-               blk_mq_put_ctx(data.ctx);
-
                if (same_queue_rq) {
                        data.hctx = same_queue_rq->mq_hctx;
                        trace_block_unplug(q, 1, true);
@@ -2027,12 +2023,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
                        !data.hctx->dispatch_busy)) {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
                blk_mq_try_issue_directly(data.hctx, rq, &cookie);
        } else {
-               blk_mq_put_ctx(data.ctx);
-               blk_mq_bio_to_request(rq, bio);
                blk_mq_sched_insert_request(rq, false, true, true);
        }
 
@@ -2062,7 +2054,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                list_del_init(&page->lru);
                /*
                 * Remove kmemleak object previously allocated in
-                * blk_mq_init_rq_map().
+                * blk_mq_alloc_rqs().
                 */
                kmemleak_free(page_address(page));
                __free_pages(page, page->private);
@@ -2267,12 +2259,11 @@ static void blk_mq_exit_hctx(struct request_queue *q,
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 
-       if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->srcu);
-
        blk_mq_remove_cpuhp(hctx);
-       blk_free_flush_queue(hctx->fq);
-       sbitmap_free(&hctx->ctx_map);
+
+       spin_lock(&q->unused_hctx_lock);
+       list_add(&hctx->hctx_list, &q->unused_hctx_list);
+       spin_unlock(&q->unused_hctx_lock);
 }
 
 static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -2289,15 +2280,65 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
        }
 }
 
+static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
+{
+       int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
+
+       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
+                          __alignof__(struct blk_mq_hw_ctx)) !=
+                    sizeof(struct blk_mq_hw_ctx));
+
+       if (tag_set->flags & BLK_MQ_F_BLOCKING)
+               hw_ctx_size += sizeof(struct srcu_struct);
+
+       return hw_ctx_size;
+}
+
 static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
 {
-       int node;
+       hctx->queue_num = hctx_idx;
 
-       node = hctx->numa_node;
+       cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
+
+       hctx->tags = set->tags[hctx_idx];
+
+       if (set->ops->init_hctx &&
+           set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
+               goto unregister_cpu_notifier;
+
+       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
+                               hctx->numa_node))
+               goto exit_hctx;
+       return 0;
+
+ exit_hctx:
+       if (set->ops->exit_hctx)
+               set->ops->exit_hctx(hctx, hctx_idx);
+ unregister_cpu_notifier:
+       blk_mq_remove_cpuhp(hctx);
+       return -1;
+}
+
+static struct blk_mq_hw_ctx *
+blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
+               int node)
+{
+       struct blk_mq_hw_ctx *hctx;
+       gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
+
+       hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+       if (!hctx)
+               goto fail_alloc_hctx;
+
+       if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
+               goto free_hctx;
+
+       atomic_set(&hctx->nr_active, 0);
        if (node == NUMA_NO_NODE)
-               node = hctx->numa_node = set->numa_node;
+               node = set->numa_node;
+       hctx->numa_node = node;
 
        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
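blk_mq_hw_ctx_size() above relies on srcu being the aligned last member of struct blk_mq_hw_ctx, so space for it can simply be appended when BLK_MQ_F_BLOCKING is set and omitted otherwise. A userspace sketch of that sizing trick, with hypothetical type and flag names:

#include <stdalign.h>
#include <stddef.h>
#include <stdlib.h>

struct rcu_like { int state; };              /* stand-in for struct srcu_struct */

struct hw_ctx {
        int flags;
        struct rcu_like srcu[];              /* must remain the last member */
};

#define F_BLOCKING 0x1
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

/* same invariant the BUILD_BUG_ON above enforces */
_Static_assert(ALIGN_UP(offsetof(struct hw_ctx, srcu),
                        alignof(struct hw_ctx)) == sizeof(struct hw_ctx),
               "srcu must sit at the aligned end of struct hw_ctx");

static size_t hw_ctx_size(int flags)
{
        size_t sz = sizeof(struct hw_ctx);

        if (flags & F_BLOCKING)
                sz += sizeof(struct rcu_like);   /* tail space only when needed */
        return sz;
}

struct hw_ctx *alloc_hw_ctx(int flags)
{
        struct hw_ctx *h = calloc(1, hw_ctx_size(flags));

        if (h)
                h->flags = flags;
        return h;
}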
@@ -2305,58 +2346,47 @@ static int blk_mq_init_hctx(struct request_queue *q,
        hctx->queue = q;
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
-       cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
-
-       hctx->tags = set->tags[hctx_idx];
+       INIT_LIST_HEAD(&hctx->hctx_list);
 
        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node);
+                       gfp, node);
        if (!hctx->ctxs)
-               goto unregister_cpu_notifier;
+               goto free_cpumask;
 
        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
-                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node))
+                               gfp, node))
                goto free_ctxs;
-
        hctx->nr_ctx = 0;
 
        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
 
-       if (set->ops->init_hctx &&
-           set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
-               goto free_bitmap;
-
        hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
+                       gfp);
        if (!hctx->fq)
-               goto exit_hctx;
-
-       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
-               goto free_fq;
+               goto free_bitmap;
 
        if (hctx->flags & BLK_MQ_F_BLOCKING)
                init_srcu_struct(hctx->srcu);
+       blk_mq_hctx_kobj_init(hctx);
 
-       return 0;
+       return hctx;
 
- free_fq:
-       blk_free_flush_queue(hctx->fq);
- exit_hctx:
-       if (set->ops->exit_hctx)
-               set->ops->exit_hctx(hctx, hctx_idx);
  free_bitmap:
        sbitmap_free(&hctx->ctx_map);
  free_ctxs:
        kfree(hctx->ctxs);
- unregister_cpu_notifier:
-       blk_mq_remove_cpuhp(hctx);
-       return -1;
+ free_cpumask:
+       free_cpumask_var(hctx->cpumask);
+ free_hctx:
+       kfree(hctx);
+ fail_alloc_hctx:
+       return NULL;
 }
 
 static void blk_mq_init_cpu_queues(struct request_queue *q,
@@ -2631,13 +2661,17 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
  */
 void blk_mq_release(struct request_queue *q)
 {
-       struct blk_mq_hw_ctx *hctx;
-       unsigned int i;
+       struct blk_mq_hw_ctx *hctx, *next;
+       int i;
 
-       /* hctx kobj stays in hctx */
-       queue_for_each_hw_ctx(q, hctx, i) {
-               if (!hctx)
-                       continue;
+       cancel_delayed_work_sync(&q->requeue_work);
+
+       queue_for_each_hw_ctx(q, hctx, i)
+               WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
+
+       /* all hctx are in .unused_hctx_list now */
+       list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) {
+               list_del_init(&hctx->hctx_list);
                kobject_put(&hctx->kobj);
        }
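Taken together, the blk_mq_exit_hctx hunk above (which now parks a retired hctx on q->unused_hctx_list instead of freeing it), the blk_mq_release hunk above (which finally drops the parked objects), and the blk_mq_alloc_and_init_hctx hunk below (which prefers a parked hctx on the requested NUMA node) form a simple per-queue freelist. A minimal userspace sketch of that reuse pattern, with hypothetical types:

#include <pthread.h>
#include <stdlib.h>

struct hctx {
        int numa_node;
        struct hctx *next;                   /* stand-in for the hctx_list linkage */
};

struct queue {
        pthread_mutex_t unused_lock;         /* stand-in for unused_hctx_lock */
        struct hctx *unused;                 /* stand-in for unused_hctx_list */
};

/* retiring a context parks it instead of freeing it */
void park_hctx(struct queue *q, struct hctx *h)
{
        pthread_mutex_lock(&q->unused_lock);
        h->next = q->unused;
        q->unused = h;
        pthread_mutex_unlock(&q->unused_lock);
}

/* prefer a parked context on the requested node before allocating a new one */
struct hctx *get_hctx(struct queue *q, int node)
{
        struct hctx **pp, *h = NULL;

        pthread_mutex_lock(&q->unused_lock);
        for (pp = &q->unused; *pp; pp = &(*pp)->next) {
                if ((*pp)->numa_node == node) {
                        h = *pp;
                        *pp = h->next;
                        break;
                }
        }
        pthread_mutex_unlock(&q->unused_lock);

        if (!h) {
                h = calloc(1, sizeof(*h));
                if (h)
                        h->numa_node = node;
        }
        return h;
}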
 
@@ -2700,51 +2734,38 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 }
 EXPORT_SYMBOL(blk_mq_init_sq_queue);
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-       int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-                          __alignof__(struct blk_mq_hw_ctx)) !=
-                    sizeof(struct blk_mq_hw_ctx));
-
-       if (tag_set->flags & BLK_MQ_F_BLOCKING)
-               hw_ctx_size += sizeof(struct srcu_struct);
-
-       return hw_ctx_size;
-}
-
 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
                struct blk_mq_tag_set *set, struct request_queue *q,
                int hctx_idx, int node)
 {
-       struct blk_mq_hw_ctx *hctx;
-
-       hctx = kzalloc_node(blk_mq_hw_ctx_size(set),
-                       GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                       node);
-       if (!hctx)
-               return NULL;
+       struct blk_mq_hw_ctx *hctx = NULL, *tmp;
 
-       if (!zalloc_cpumask_var_node(&hctx->cpumask,
-                               GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-                               node)) {
-               kfree(hctx);
-               return NULL;
+       /* reuse dead hctx first */
+       spin_lock(&q->unused_hctx_lock);
+       list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
+               if (tmp->numa_node == node) {
+                       hctx = tmp;
+                       break;
+               }
        }
+       if (hctx)
+               list_del_init(&hctx->hctx_list);
+       spin_unlock(&q->unused_hctx_lock);
 
-       atomic_set(&hctx->nr_active, 0);
-       hctx->numa_node = node;
-       hctx->queue_num = hctx_idx;
+       if (!hctx)
+               hctx = blk_mq_alloc_hctx(q, set, node);
+       if (!hctx)
+               goto fail;
 
-       if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
-               free_cpumask_var(hctx->cpumask);
-               kfree(hctx);
-               return NULL;
-       }
-       blk_mq_hctx_kobj_init(hctx);
+       if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
+               goto free_hctx;
 
        return hctx;
+
+ free_hctx:
+       kobject_put(&hctx->kobj);
+ fail:
+       return NULL;
 }
 
 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
@@ -2770,10 +2791,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
                hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
                if (hctx) {
-                       if (hctxs[i]) {
+                       if (hctxs[i])
                                blk_mq_exit_hctx(q, set, hctxs[i], i);
-                               kobject_put(&hctxs[i]->kobj);
-                       }
                        hctxs[i] = hctx;
                } else {
                        if (hctxs[i])
@@ -2804,9 +2823,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                        if (hctx->tags)
                                blk_mq_free_map_and_requests(set, j);
                        blk_mq_exit_hctx(q, set, hctx, j);
-                       kobject_put(&hctx->kobj);
                        hctxs[j] = NULL;
-
                }
        }
        mutex_unlock(&q->sysfs_lock);
@@ -2838,7 +2855,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                goto err_exit;
 
        if (blk_mq_alloc_ctxs(q))
-               goto err_exit;
+               goto err_poll;
 
        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);
@@ -2849,6 +2866,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        if (!q->queue_hw_ctx)
                goto err_sys_init;
 
+       INIT_LIST_HEAD(&q->unused_hctx_list);
+       spin_lock_init(&q->unused_hctx_lock);
+
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
                goto err_hctxs;
@@ -2899,13 +2919,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        kfree(q->queue_hw_ctx);
 err_sys_init:
        blk_mq_sysfs_deinit(q);
+err_poll:
+       blk_stat_free_callback(q->poll_cb);
+       q->poll_cb = NULL;
 err_exit:
        q->mq_ops = NULL;
        return ERR_PTR(-ENOMEM);
 }
 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
 
-void blk_mq_free_queue(struct request_queue *q)
+/* tags can _not_ be used after returning from blk_mq_exit_queue */
+void blk_mq_exit_queue(struct request_queue *q)
 {
        struct blk_mq_tag_set   *set = q->tag_set;