/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"
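/*
 * Free the per-hctx scheduler data of every hardware queue, invoking the
 * elevator-provided exit hook first if one was supplied.
 */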
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
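/*
 * Associate the submitting task's io_context with @rq by looking up (or
 * creating) the io_cq for this queue and attaching it to the request.
 */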
void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}

	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}
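/*
 * Dispatch requests for one hardware queue: drain leftovers from the hctx
 * dispatch list first, then feed the driver either from the software
 * queues or from the elevator's dispatch_request hook.
 */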
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	bool did_work = false;
	LIST_HEAD(rq_list);

	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		did_work = blk_mq_dispatch_rq_list(q, &rq_list);
	} else if (!has_sched_dispatch) {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
	}

	/*
	 * We want to dispatch from the scheduler if we had no work left
	 * on the dispatch list, OR if we did have work but weren't able
	 * to make progress.
	 */
	if (!did_work && has_sched_dispatch) {
		do {
			struct request *rq;

			rq = e->type->ops.mq.dispatch_request(hctx);
			if (!rq)
				break;
			list_add(&rq->queuelist, &rq_list);
		} while (blk_mq_dispatch_rq_list(q, &rq_list));
	}
}
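/*
 * Ask the elevator (via elv_merge()) for a request that @bio can be merged
 * into and perform the back or front merge if the scheduler allows it. If
 * the merge also collapses two requests, the now-redundant request is
 * returned through @merged_request for the caller to free.
 */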
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;
		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		if (merged)
			ctx->rq_merged++;
		return merged;
	}

	return false;
}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.mq.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
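/*
 * Requests that already own a driver tag (i.e. flushes) are sent straight
 * to the hctx dispatch list and bypass the scheduler; anything else is
 * marked RQF_SORTED and left to the normal insert path.
 */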
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	if (rq->tag == -1) {
		rq->rq_flags |= RQF_SORTED;
		return false;
	}

	/*
	 * If we already have a real request tag, send directly to
	 * the dispatch list.
	 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
	return true;
}
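/*
 * If the hardware queue was marked for a scheduler restart, clear the
 * flag and re-run the queue if it still has pending work. Returns true
 * if a queue run was triggered.
 */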
static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
		if (blk_mq_hctx_has_pending(hctx)) {
			blk_mq_run_hw_queue(hctx, true);
			return true;
		}
	}

	return false;
}
/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
	for ((pos) = (skip);						\
	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
			(pos)->member.next, typeof(*pos), member) :	\
			list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
	     (pos) != (skip); )
/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
	struct blk_mq_tags *const tags = hctx->tags;
	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
	struct request_queue *const queue = hctx->queue, *q;
	struct blk_mq_hw_ctx *hctx2;
	unsigned int i, j;

	if (set->flags & BLK_MQ_F_TAG_SHARED) {
		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			queue_for_each_hw_ctx(q, hctx2, i)
				if (hctx2->tags == tags &&
				    blk_mq_sched_restart_hctx(hctx2))
					goto done;
		}
		j = hctx->queue_num + 1;
		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
			if (j == queue->nr_hw_queues)
				j = 0;
			hctx2 = queue->queue_hw_ctx[j];
			if (hctx2->tags == tags &&
			    blk_mq_sched_restart_hctx(hctx2))
				break;
		}
done:
		rcu_read_unlock();
	} else {
		blk_mq_sched_restart_hctx(hctx);
	}
}
/*
 * Add flush/fua to the queue. If we fail getting a driver tag, then
 * punt to the requeue list. Requeue will re-invoke us from a context
 * that's safe to block from.
 */
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
				      struct request *rq, bool can_block)
{
	if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
		blk_insert_flush(rq);
		blk_mq_run_hw_queue(hctx, true);
	} else
		blk_mq_add_to_requeue_list(rq, false, true);
}
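/*
 * Insert a single request: flush/fua requests take the dedicated flush
 * path above, already-tagged requests bypass the scheduler onto the
 * dispatch list, and everything else goes to the elevator or the software
 * queue before the hardware queue is (optionally) run.
 */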
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
		blk_mq_sched_insert_flush(hctx, rq, can_block);
		return;
	}

	if (e && blk_mq_sched_bypass_insert(hctx, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}
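/*
 * Insert a list of requests at once (e.g. when a plug is flushed), handing
 * the whole list to the elevator if one is attached and to the software
 * queues otherwise.
 */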
void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e) {
		struct request *rq, *next;

		/*
		 * We bypass requests that already have a driver tag assigned,
		 * which should only be flushes. Flushes are only ever inserted
		 * as single requests, so we shouldn't ever hit the
		 * WARN_ON_ONCE() below (but let's handle it just in case).
		 */
		list_for_each_entry_safe(rq, next, list, queuelist) {
			if (WARN_ON_ONCE(rq->tag != -1)) {
				list_del_init(&rq->queuelist);
				blk_mq_sched_bypass_insert(hctx, rq);
			}
		}
	}

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else
		blk_mq_insert_requests(hctx, ctx, list);

	blk_mq_run_hw_queue(hctx, run_queue_async);
}
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}
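/* Allocate the scheduler tag map and requests for one hardware queue. */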
static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}
static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}
int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;
	int ret;

	if (!e)
		return 0;

	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (ret)
		return ret;

	if (e->type->ops.mq.init_hctx) {
		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (ret) {
			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
			return ret;
		}
	}

	blk_mq_debugfs_register_sched_hctx(q, hctx);

	return 0;
}
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;

	if (!e)
		return;

	blk_mq_debugfs_unregister_sched_hctx(hctx);

	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
		hctx->sched_data = NULL;
	}

	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}
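/*
 * Attach an elevator to a blk-mq queue: pick a per-hctx scheduler tag
 * depth, allocate the scheduler tags, run the elevator's init_sched and
 * init_hctx hooks, and register the debugfs attributes.
 */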
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to 256, since we don't split into sync/async like the
	 * old code did. Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * BLKDEV_MAX_RQ;

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.mq.init_hctx) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
			e->type->ops.mq.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}
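/*
 * Pick and initialize the default elevator for a freshly allocated queue,
 * serialized against concurrent sysfs changes via q->sysfs_lock.
 */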
int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}