/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

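/*
 * Look up the current task's io_context and attach the matching io_cq
 * (creating one if need be) to @rq, so that icq-aware schedulers such
 * as BFQ can tie the request back to the submitting context. The
 * reference taken on the io_context here is presumably dropped when
 * the request is completed and freed.
 */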
void blk_mq_sched_assign_ioc(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc;
	struct io_cq *icq;

	/*
	 * May not have an IO context if it's a passthrough request
	 */
	ioc = current->io_context;
	if (!ioc)
		return;

	spin_lock_irq(&q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(&q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	blk_mq_run_hw_queue(hctx, true);
}

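/*
 * Note on the restart protocol: the hw queue is marked above while
 * unserviced requests still sit on the dispatch list, and the restart
 * is typically driven from the request completion/free path. The
 * test-before-clear of BLK_MQ_S_SCHED_RESTART means each mark triggers
 * at most one extra queue run.
 */
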
/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}

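/*
 * Return the software queue that follows @ctx in @hctx's ctx array,
 * wrapping around at nr_ctx; used to round-robin dispatch fairly
 * across software queues.
 */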
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	do {
		struct request *rq;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	WRITE_ONCE(hctx->dispatch_from, ctx);
}

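/*
 * Main dispatch entry for a hardware queue, invoked from the queue run
 * machinery. Preference order: leftover requests on hctx->dispatch
 * first, then the elevator, then the software queues. The caller is
 * expected to hold the RCU or SRCU read lock that fences off queue
 * quiescing.
 */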
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue request one by one from sw queue if queue is busy */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}

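/*
 * Ask the elevator whether @bio can be merged into an existing request
 * and, if so, do the merge. A successful back/front merge may leave two
 * requests adjacent; if those get merged as well, the request that was
 * swallowed is returned in *merged_request and the caller must free it
 * (schedulers typically do that via blk_mq_free_request()).
 */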
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		return merged;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	lockdep_assert_held(&ctx->lock);

	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
		ctx->rq_merged++;
		return true;
	}

	return false;
}

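/*
 * Called from the bio submission path before a new request is set up.
 * An elevator that implements .bio_merge gets first shot at the bio;
 * otherwise we fall back to the default per-software-queue merge
 * above. Returns true if @bio was merged and no new request is needed.
 */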
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.bio_merge(hctx, bio);
	}

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
			!list_empty_careful(&ctx->rq_list)) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

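/*
 * Decide whether @rq must bypass the elevator entirely. Requests that
 * are part of an in-progress flush sequence carry their own ordering
 * and go straight to hctx->dispatch; anything else is only flagged as
 * scheduler-sorted when an elevator is attached, and gets inserted by
 * the caller.
 */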
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/* dispatch flush rq directly */
	if (rq->rq_flags & RQF_FLUSH_SEQ) {
		spin_lock(&hctx->lock);
		list_add(&rq->queuelist, &hctx->dispatch);
		spin_unlock(&hctx->lock);
		return true;
	}

	if (has_sched)
		rq->rq_flags |= RQF_SORTED;

	return false;
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

	/* flush rq in flush machinery need to be dispatched directly */
	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
		blk_insert_flush(rq);
		goto run;
	}

	WARN_ON(e && (rq->tag != -1));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
		goto run;

	if (e && e->type->ops.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

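/*
 * Batch variant of the above, typically used when a plug list is
 * flushed: hand the whole list to the elevator if one is attached,
 * otherwise place the requests on the software queue (or, as an
 * optimization, issue them directly while the hardware queue is not
 * busy).
 */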
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e;

	e = hctx->queue->elevator;
	if (e && e->type->ops.insert_requests)
		e->type->ops.insert_requests(hctx, list, false);
	else {
		/*
		 * try to issue requests directly if the hw queue isn't
		 * busy in case of 'none' scheduler, and this way may save
		 * us one extra enqueue & dequeue to sw queue.
		 */
		if (!hctx->dispatch_busy && !e && !run_queue_async) {
			blk_mq_try_issue_list_directly(hctx, list);
			if (list_empty(list))
				return;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

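/*
 * Allocate the per-hctx scheduler tag map, sized to q->nr_requests.
 * These sched_tags sit in front of the driver tags: a request gets a
 * scheduler tag at allocation time and only competes for a driver tag
 * once it is actually dispatched.
 */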
static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

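/*
 * Attach elevator @e to @q: size the scheduler queue depth, allocate
 * sched_tags for every hardware queue, then let the elevator set up
 * its queue-wide and per-hctx data. On failure, everything allocated
 * so far is torn down again.
 */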
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to double of smaller one between hw queue_depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}