block/blk-mq.c

   1 /*
   2  * Block multiqueue core code
   3  *
   4  * Copyright (C) 2013-2014 Jens Axboe
   5  * Copyright (C) 2013-2014 Christoph Hellwig
   6  */
   7 #include <linux/kernel.h>
   8 #include <linux/module.h>
   9 #include <linux/backing-dev.h>
  10 #include <linux/bio.h>
  11 #include <linux/blkdev.h>
  12 #include <linux/kmemleak.h>
  13 #include <linux/mm.h>
  14 #include <linux/init.h>
  15 #include <linux/slab.h>
  16 #include <linux/workqueue.h>
  17 #include <linux/smp.h>
  18 #include <linux/llist.h>
  19 #include <linux/list_sort.h>
  20 #include <linux/cpu.h>
  21 #include <linux/cache.h>
  22 #include <linux/sched/sysctl.h>
  23 #include <linux/sched/topology.h>
  24 #include <linux/sched/signal.h>
  25 #include <linux/delay.h>
  26 #include <linux/crash_dump.h>
  27 #include <linux/prefetch.h>
  28
  29 #include <trace/events/block.h>
  30
  31 #include <linux/blk-mq.h>
  32 #include "blk.h"
  33 #include "blk-mq.h"
  34 #include "blk-mq-debugfs.h"
  35 #include "blk-mq-tag.h"
  36 #include "blk-stat.h"
  37 #include "blk-wbt.h"
  38 #include "blk-mq-sched.h"
  39
  40 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
  41 static void blk_mq_poll_stats_start(struct request_queue *q);
  42 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
  43
  44 static int blk_mq_poll_stats_bkt(const struct request *rq)
  45 {
  46         int ddir, bytes, bucket;
  47
  48         ddir = rq_data_dir(rq);
  49         bytes = blk_rq_bytes(rq);
  50
  51         bucket = ddir + 2*(ilog2(bytes) - 9);
  52
  53         if (bucket < 0)
  54                 return -1;
  55         else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
  56                 return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
  57
  58         return bucket;
  59 }
  60
  61 /*
  62  * Check if any of the ctx's have pending work in this hardware queue
  63  */
  64 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
  65 {
  66         return !list_empty_careful(&hctx->dispatch) ||
  67                 sbitmap_any_bit_set(&hctx->ctx_map) ||
  68                         blk_mq_sched_has_work(hctx);
  69 }
  70
  71 /*
  72  * Mark this ctx as having pending work in this hardware queue
  73  */
  74 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
  75                                      struct blk_mq_ctx *ctx)
  76 {
  77         if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
  78                 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
  79 }
  80
  81 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
  82                                       struct blk_mq_ctx *ctx)
  83 {
  84         sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
  85 }
  86
  87 struct mq_inflight {
  88         struct hd_struct *part;
  89         unsigned int *inflight;
  90 };
  91
  92 static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
  93                                   struct request *rq, void *priv,
  94                                   bool reserved)
  95 {
  96         struct mq_inflight *mi = priv;
  97
  98         if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
  99             !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
 100                 /*
 101                  * index[0] counts the specific partition that was asked
 102                  * for. index[1] counts the ones that are active on the
 103                  * whole device, so increment that if mi->part is indeed
 104                  * a partition, and not a whole device.
 105                  */
 106                 if (rq->part == mi->part)
 107                         mi->inflight[0]++;
 108                 if (mi->part->partno)
 109                         mi->inflight[1]++;
 110         }
 111 }
 112
 113 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 114                       unsigned int inflight[2])
 115 {
 116         struct mq_inflight mi = { .part = part, .inflight = inflight, };
 117
 118         inflight[0] = inflight[1] = 0;
 119         blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 120 }
 121
 122 void blk_freeze_queue_start(struct request_queue *q)
 123 {
 124         int freeze_depth;
 125
 126         freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 127         if (freeze_depth == 1) {
 128                 percpu_ref_kill(&q->q_usage_counter);
 129                 if (q->mq_ops)
 130                         blk_mq_run_hw_queues(q, false);
 131         }
 132 }
 133 EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
 134
 135 void blk_mq_freeze_queue_wait(struct request_queue *q)
 136 {
 137         wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 138 }
 139 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
 140
 141 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 142                                      unsigned long timeout)
 143 {
 144         return wait_event_timeout(q->mq_freeze_wq,
 145                                         percpu_ref_is_zero(&q->q_usage_counter),
 146                                         timeout);
 147 }
 148 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 149
 150 /*
 151  * Guarantee no request is in use, so we can change any data structure of
 152  * the queue afterward.
 153  */
 154 void blk_freeze_queue(struct request_queue *q)
 155 {
 156         /*
 157          * In the !blk_mq case we are only calling this to kill the
 158          * q_usage_counter, otherwise this increases the freeze depth
 159          * and waits for it to return to zero.  For this reason there is
 160          * no blk_unfreeze_queue(), and blk_freeze_queue() is not
 161          * exported to drivers as the only user for unfreeze is blk_mq.
 162          */
 163         blk_freeze_queue_start(q);
 164         blk_mq_freeze_queue_wait(q);
 165 }
 166
 167 void blk_mq_freeze_queue(struct request_queue *q)
 168 {
 169         /*
 170          * ...just an alias to keep freeze and unfreeze actions balanced
 171          * in the blk_mq_* namespace
 172          */
 173         blk_freeze_queue(q);
 174 }
 175 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
 176
 177 void blk_mq_unfreeze_queue(struct request_queue *q)
 178 {
 179         int freeze_depth;
 180
 181         freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
 182         WARN_ON_ONCE(freeze_depth < 0);
 183         if (!freeze_depth) {
 184                 percpu_ref_reinit(&q->q_usage_counter);
 185                 wake_up_all(&q->mq_freeze_wq);
 186         }
 187 }
 188 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
 189
 190 /*
 191  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 192  * mpt3sas driver such that this function can be removed.
 193  */
 194 void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 195 {
 196         unsigned long flags;
 197
 198         spin_lock_irqsave(q->queue_lock, flags);
 199         queue_flag_set(QUEUE_FLAG_QUIESCED, q);
 200         spin_unlock_irqrestore(q->queue_lock, flags);
 201 }
 202 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 203
 204 /**
 205  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 206  * @q: request queue.
 207  *
 208  * Note: this function does not prevent that the struct request end_io()
 209  * callback function is invoked. Once this function is returned, we make
 210  * sure no dispatch can happen until the queue is unquiesced via
 211  * blk_mq_unquiesce_queue().
 212  */
 213 void blk_mq_quiesce_queue(struct request_queue *q)
 214 {
 215         struct blk_mq_hw_ctx *hctx;
 216         unsigned int i;
 217         bool rcu = false;
 218
 219         blk_mq_quiesce_queue_nowait(q);
 220
 221         queue_for_each_hw_ctx(q, hctx, i) {
 222                 if (hctx->flags & BLK_MQ_F_BLOCKING)
 223                         synchronize_srcu(hctx->queue_rq_srcu);
 224                 else
 225                         rcu = true;
 226         }
 227         if (rcu)
 228                 synchronize_rcu();
 229 }
 230 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
 231
 232 /*
 233  * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 234  * @q: request queue.
 235  *
 236  * This function recovers queue into the state before quiescing
 237  * which is done by blk_mq_quiesce_queue.
 238  */
 239 void blk_mq_unquiesce_queue(struct request_queue *q)
 240 {
 241         unsigned long flags;
 242
 243         spin_lock_irqsave(q->queue_lock, flags);
 244         queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
 245         spin_unlock_irqrestore(q->queue_lock, flags);
 246
 247         /* dispatch requests which are inserted during quiescing */
 248         blk_mq_run_hw_queues(q, true);
 249 }
 250 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 251
 252 void blk_mq_wake_waiters(struct request_queue *q)
 253 {
 254         struct blk_mq_hw_ctx *hctx;
 255         unsigned int i;
 256
 257         queue_for_each_hw_ctx(q, hctx, i)
 258                 if (blk_mq_hw_queue_mapped(hctx))
 259                         blk_mq_tag_wakeup_all(hctx->tags, true);
 260 }
 261
 262 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 263 {
 264         return blk_mq_has_free_tags(hctx->tags);
 265 }
 266 EXPORT_SYMBOL(blk_mq_can_queue);
 267
 268 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 269                 unsigned int tag, unsigned int op)
 270 {
 271         struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 272         struct request *rq = tags->static_rqs[tag];
 273
 274         rq->rq_flags = 0;
 275
 276         if (data->flags & BLK_MQ_REQ_INTERNAL) {
 277                 rq->tag = -1;
 278                 rq->internal_tag = tag;
 279         } else {
 280                 if (blk_mq_tag_busy(data->hctx)) {
 281                         rq->rq_flags = RQF_MQ_INFLIGHT;
 282                         atomic_inc(&data->hctx->nr_active);
 283                 }
 284                 rq->tag = tag;
 285                 rq->internal_tag = -1;
 286                 data->hctx->tags->rqs[rq->tag] = rq;
 287         }
 288
 289         INIT_LIST_HEAD(&rq->queuelist);
 290         /* csd/requeue_work/fifo_time is initialized before use */
 291         rq->q = data->q;
 292         rq->mq_ctx = data->ctx;
 293         rq->cmd_flags = op;
 294         if (data->flags & BLK_MQ_REQ_PREEMPT)
 295                 rq->rq_flags |= RQF_PREEMPT;
 296         if (blk_queue_io_stat(data->q))
 297                 rq->rq_flags |= RQF_IO_STAT;
 298         /* do not touch atomic flags, it needs atomic ops against the timer */
 299         rq->cpu = -1;
 300         INIT_HLIST_NODE(&rq->hash);
 301         RB_CLEAR_NODE(&rq->rb_node);
 302         rq->rq_disk = NULL;
 303         rq->part = NULL;
 304         rq->start_time = jiffies;
 305 #ifdef CONFIG_BLK_CGROUP
 306         rq->rl = NULL;
 307         set_start_time_ns(rq);
 308         rq->io_start_time_ns = 0;
 309 #endif
 310         rq->nr_phys_segments = 0;
 311 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 312         rq->nr_integrity_segments = 0;
 313 #endif
 314         rq->special = NULL;
 315         /* tag was already set */
 316         rq->extra_len = 0;
 317
 318         INIT_LIST_HEAD(&rq->timeout_list);
 319         rq->timeout = 0;
 320
 321         rq->end_io = NULL;
 322         rq->end_io_data = NULL;
 323         rq->next_rq = NULL;
 324
 325         data->ctx->rq_dispatched[op_is_sync(op)]++;
 326         return rq;
 327 }
 328
 329 static struct request *blk_mq_get_request(struct request_queue *q,
 330                 struct bio *bio, unsigned int op,
 331                 struct blk_mq_alloc_data *data)
 332 {
 333         struct elevator_queue *e = q->elevator;
 334         struct request *rq;
 335         unsigned int tag;
 336         bool put_ctx_on_error = false;
 337
 338         blk_queue_enter_live(q);
 339         data->q = q;
 340         if (likely(!data->ctx)) {
 341                 data->ctx = blk_mq_get_ctx(q);
 342                 put_ctx_on_error = true;
 343         }
 344         if (likely(!data->hctx))
 345                 data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 346         if (op & REQ_NOWAIT)
 347                 data->flags |= BLK_MQ_REQ_NOWAIT;
 348
 349         if (e) {
 350                 data->flags |= BLK_MQ_REQ_INTERNAL;
 351
 352                 /*
 353                  * Flush requests are special and go directly to the
 354                  * dispatch list.
 355                  */
 356                 if (!op_is_flush(op) && e->type->ops.mq.limit_depth)
 357                         e->type->ops.mq.limit_depth(op, data);
 358         }
 359
 360         tag = blk_mq_get_tag(data);
 361         if (tag == BLK_MQ_TAG_FAIL) {
 362                 if (put_ctx_on_error) {
 363                         blk_mq_put_ctx(data->ctx);
 364                         data->ctx = NULL;
 365                 }
 366                 blk_queue_exit(q);
 367                 return NULL;
 368         }
 369
 370         rq = blk_mq_rq_ctx_init(data, tag, op);
 371         if (!op_is_flush(op)) {
 372                 rq->elv.icq = NULL;
 373                 if (e && e->type->ops.mq.prepare_request) {
 374                         if (e->type->icq_cache && rq_ioc(bio))
 375                                 blk_mq_sched_assign_ioc(rq, bio);
 376
 377                         e->type->ops.mq.prepare_request(rq, bio);
 378                         rq->rq_flags |= RQF_ELVPRIV;
 379                 }
 380         }
 381         data->hctx->queued++;
 382         return rq;
 383 }
 384
 385 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 386                 blk_mq_req_flags_t flags)
 387 {
 388         struct blk_mq_alloc_data alloc_data = { .flags = flags };
 389         struct request *rq;
 390         int ret;
 391
 392         ret = blk_queue_enter(q, flags);
 393         if (ret)
 394                 return ERR_PTR(ret);
 395
 396         rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 397         blk_queue_exit(q);
 398
 399         if (!rq)
 400                 return ERR_PTR(-EWOULDBLOCK);
 401
 402         blk_mq_put_ctx(alloc_data.ctx);
 403
 404         rq->__data_len = 0;
 405         rq->__sector = (sector_t) -1;
 406         rq->bio = rq->biotail = NULL;
 407         return rq;
 408 }
 409 EXPORT_SYMBOL(blk_mq_alloc_request);
 410
 411 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 412         unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 413 {
 414         struct blk_mq_alloc_data alloc_data = { .flags = flags };
 415         struct request *rq;
 416         unsigned int cpu;
 417         int ret;
 418
 419         /*
 420          * If the tag allocator sleeps we could get an allocation for a
 421          * different hardware context.  No need to complicate the low level
 422          * allocator for this for the rare use case of a command tied to
 423          * a specific queue.
 424          */
 425         if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
 426                 return ERR_PTR(-EINVAL);
 427
 428         if (hctx_idx >= q->nr_hw_queues)
 429                 return ERR_PTR(-EIO);
 430
 431         ret = blk_queue_enter(q, flags);
 432         if (ret)
 433                 return ERR_PTR(ret);
 434
 435         /*
 436          * Check if the hardware context is actually mapped to anything.
 437          * If not tell the caller that it should skip this queue.
 438          */
 439         alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
 440         if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
 441                 blk_queue_exit(q);
 442                 return ERR_PTR(-EXDEV);
 443         }
 444         cpu = cpumask_first(alloc_data.hctx->cpumask);
 445         alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 446
 447         rq = blk_mq_get_request(q, NULL, op, &alloc_data);
 448         blk_queue_exit(q);
 449
 450         if (!rq)
 451                 return ERR_PTR(-EWOULDBLOCK);
 452
 453         return rq;
 454 }
 455 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 456
 457 void blk_mq_free_request(struct request *rq)
 458 {
 459         struct request_queue *q = rq->q;
 460         struct elevator_queue *e = q->elevator;
 461         struct blk_mq_ctx *ctx = rq->mq_ctx;
 462         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 463         const int sched_tag = rq->internal_tag;
 464
 465         if (rq->rq_flags & RQF_ELVPRIV) {
 466                 if (e && e->type->ops.mq.finish_request)
 467                         e->type->ops.mq.finish_request(rq);
 468                 if (rq->elv.icq) {
 469                         put_io_context(rq->elv.icq->ioc);
 470                         rq->elv.icq = NULL;
 471                 }
 472         }
 473
 474         ctx->rq_completed[rq_is_sync(rq)]++;
 475         if (rq->rq_flags & RQF_MQ_INFLIGHT)
 476                 atomic_dec(&hctx->nr_active);
 477
 478         if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
 479                 laptop_io_completion(q->backing_dev_info);
 480
 481         wbt_done(q->rq_wb, &rq->issue_stat);
 482
 483         if (blk_rq_rl(rq))
 484                 blk_put_rl(blk_rq_rl(rq));
 485
 486         blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 487         clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 488         clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 489         if (rq->tag != -1)
 490                 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 491         if (sched_tag != -1)
 492                 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 493         blk_mq_sched_restart(hctx);
 494         blk_queue_exit(q);
 495 }
 496 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 497
 498 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 499 {
 500         blk_account_io_done(rq);
 501
 502         if (rq->end_io) {
 503                 wbt_done(rq->q->rq_wb, &rq->issue_stat);
 504                 rq->end_io(rq, error);
 505         } else {
 506                 if (unlikely(blk_bidi_rq(rq)))
 507                         blk_mq_free_request(rq->next_rq);
 508                 blk_mq_free_request(rq);
 509         }
 510 }
 511 EXPORT_SYMBOL(__blk_mq_end_request);
 512
 513 void blk_mq_end_request(struct request *rq, blk_status_t error)
 514 {
 515         if (blk_update_request(rq, error, blk_rq_bytes(rq)))
 516                 BUG();
 517         __blk_mq_end_request(rq, error);
 518 }
 519 EXPORT_SYMBOL(blk_mq_end_request);
 520
 521 static void __blk_mq_complete_request_remote(void *data)
 522 {
 523         struct request *rq = data;
 524
 525         rq->q->softirq_done_fn(rq);
 526 }
 527
 528 static void __blk_mq_complete_request(struct request *rq)
 529 {
 530         struct blk_mq_ctx *ctx = rq->mq_ctx;
 531         bool shared = false;
 532         int cpu;
 533
 534         WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
 535
 536         if (rq->internal_tag != -1)
 537                 blk_mq_sched_completed_request(rq);
 538         if (rq->rq_flags & RQF_STATS) {
 539                 blk_mq_poll_stats_start(rq->q);
 540                 blk_stat_add(rq);
 541         }
 542
 543         if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
 544                 rq->q->softirq_done_fn(rq);
 545                 return;
 546         }
 547
 548         cpu = get_cpu();
 549         if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
 550                 shared = cpus_share_cache(cpu, ctx->cpu);
 551
 552         if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
 553                 rq->csd.func = __blk_mq_complete_request_remote;
 554                 rq->csd.info = rq;
 555                 rq->csd.flags = 0;
 556                 smp_call_function_single_async(ctx->cpu, &rq->csd);
 557         } else {
 558                 rq->q->softirq_done_fn(rq);
 559         }
 560         put_cpu();
 561 }
 562
 563 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
 564 {
 565         if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 566                 rcu_read_unlock();
 567         else
 568                 srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
 569 }
 570
 571 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 572 {
 573         if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 574                 rcu_read_lock();
 575         else
 576                 *srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
 577 }
 578
 579 static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
 580 {
 581         unsigned long flags;
 582
 583         /*
 584          * blk_mq_rq_aborted_gstate() is used from the completion path and
 585          * can thus be called from irq context.  u64_stats_fetch in the
 586          * middle of update on the same CPU leads to lockup.  Disable irq
 587          * while updating.
 588          */
 589         local_irq_save(flags);
 590         u64_stats_update_begin(&rq->aborted_gstate_sync);
 591         rq->aborted_gstate = gstate;
 592         u64_stats_update_end(&rq->aborted_gstate_sync);
 593         local_irq_restore(flags);
 594 }
 595
 596 static u64 blk_mq_rq_aborted_gstate(struct request *rq)
 597 {
 598         unsigned int start;
 599         u64 aborted_gstate;
 600
 601         do {
 602                 start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
 603                 aborted_gstate = rq->aborted_gstate;
 604         } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
 605
 606         return aborted_gstate;
 607 }
 608
 609 /**
 610  * blk_mq_complete_request - end I/O on a request
 611  * @rq:         the request being processed
 612  *
 613  * Description:
 614  *      Ends all I/O on a request. It does not handle partial completions.
 615  *      The actual completion happens out-of-order, through a IPI handler.
 616  **/
 617 void blk_mq_complete_request(struct request *rq)
 618 {
 619         struct request_queue *q = rq->q;
 620         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 621         int srcu_idx;
 622
 623         if (unlikely(blk_should_fake_timeout(q)))
 624                 return;
 625
 626         /*
 627          * If @rq->aborted_gstate equals the current instance, timeout is
 628          * claiming @rq and we lost.  This is synchronized through
 629          * hctx_lock().  See blk_mq_timeout_work() for details.
 630          *
 631          * Completion path never blocks and we can directly use RCU here
 632          * instead of hctx_lock() which can be either RCU or SRCU.
 633          * However, that would complicate paths which want to synchronize
 634          * against us.  Let stay in sync with the issue path so that
 635          * hctx_lock() covers both issue and completion paths.
 636          */
 637         hctx_lock(hctx, &srcu_idx);
 638         if (blk_mq_rq_aborted_gstate(rq) != rq->gstate &&
 639             !blk_mark_rq_complete(rq))
 640                 __blk_mq_complete_request(rq);
 641         hctx_unlock(hctx, srcu_idx);
 642 }
 643 EXPORT_SYMBOL(blk_mq_complete_request);
 644
 645 int blk_mq_request_started(struct request *rq)
 646 {
 647         return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 648 }
 649 EXPORT_SYMBOL_GPL(blk_mq_request_started);
 650
 651 void blk_mq_start_request(struct request *rq)
 652 {
 653         struct request_queue *q = rq->q;
 654
 655         blk_mq_sched_started_request(rq);
 656
 657         trace_block_rq_issue(q, rq);
 658
 659         if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
 660                 blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq));
 661                 rq->rq_flags |= RQF_STATS;
 662                 wbt_issue(q->rq_wb, &rq->issue_stat);
 663         }
 664
 665         WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 666         WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
 667
 668         /*
 669          * Mark @rq in-flight which also advances the generation number,
 670          * and register for timeout.  Protect with a seqcount to allow the
 671          * timeout path to read both @rq->gstate and @rq->deadline
 672          * coherently.
 673          *
 674          * This is the only place where a request is marked in-flight.  If
 675          * the timeout path reads an in-flight @rq->gstate, the
 676          * @rq->deadline it reads together under @rq->gstate_seq is
 677          * guaranteed to be the matching one.
 678          */
 679         preempt_disable();
 680         write_seqcount_begin(&rq->gstate_seq);
 681
 682         blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
 683         blk_add_timer(rq);
 684
 685         write_seqcount_end(&rq->gstate_seq);
 686         preempt_enable();
 687
 688         set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 689         if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
 690                 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 691
 692         if (q->dma_drain_size && blk_rq_bytes(rq)) {
 693                 /*
 694                  * Make sure space for the drain appears.  We know we can do
 695                  * this because max_hw_segments has been adjusted to be one
 696                  * fewer than the device can handle.
 697                  */
 698                 rq->nr_phys_segments++;
 699         }
 700 }
 701 EXPORT_SYMBOL(blk_mq_start_request);
 702
 703 /*
 704  * When we reach here because queue is busy, REQ_ATOM_COMPLETE
 705  * flag isn't set yet, so there may be race with timeout handler,
 706  * but given rq->deadline is just set in .queue_rq() under
 707  * this situation, the race won't be possible in reality because
 708  * rq->timeout should be set as big enough to cover the window
 709  * between blk_mq_start_request() called from .queue_rq() and
 710  * clearing REQ_ATOM_STARTED here.
 711  */
 712 static void __blk_mq_requeue_request(struct request *rq)
 713 {
 714         struct request_queue *q = rq->q;
 715
 716         blk_mq_put_driver_tag(rq);
 717
 718         trace_block_rq_requeue(q, rq);
 719         wbt_requeue(q->rq_wb, &rq->issue_stat);
 720         blk_mq_sched_requeue_request(rq);
 721
 722         if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 723                 blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
 724                 if (q->dma_drain_size && blk_rq_bytes(rq))
 725                         rq->nr_phys_segments--;
 726         }
 727 }
 728
 729 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 730 {
 731         __blk_mq_requeue_request(rq);
 732
 733         BUG_ON(blk_queued_rq(rq));
 734         blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 735 }
 736 EXPORT_SYMBOL(blk_mq_requeue_request);
 737
 738 static void blk_mq_requeue_work(struct work_struct *work)
 739 {
 740         struct request_queue *q =
 741                 container_of(work, struct request_queue, requeue_work.work);
 742         LIST_HEAD(rq_list);
 743         struct request *rq, *next;
 744
 745         spin_lock_irq(&q->requeue_lock);
 746         list_splice_init(&q->requeue_list, &rq_list);
 747         spin_unlock_irq(&q->requeue_lock);
 748
 749         list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
 750                 if (!(rq->rq_flags & RQF_SOFTBARRIER))
 751                         continue;
 752
 753                 rq->rq_flags &= ~RQF_SOFTBARRIER;
 754                 list_del_init(&rq->queuelist);
 755                 blk_mq_sched_insert_request(rq, true, false, false, true);
 756         }
 757
 758         while (!list_empty(&rq_list)) {
 759                 rq = list_entry(rq_list.next, struct request, queuelist);
 760                 list_del_init(&rq->queuelist);
 761                 blk_mq_sched_insert_request(rq, false, false, false, true);
 762         }
 763
 764         blk_mq_run_hw_queues(q, false);
 765 }
 766
 767 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 768                                 bool kick_requeue_list)
 769 {
 770         struct request_queue *q = rq->q;
 771         unsigned long flags;
 772
 773         /*
 774          * We abuse this flag that is otherwise used by the I/O scheduler to
 775          * request head insertion from the workqueue.
 776          */
 777         BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
 778
 779         spin_lock_irqsave(&q->requeue_lock, flags);
 780         if (at_head) {
 781                 rq->rq_flags |= RQF_SOFTBARRIER;
 782                 list_add(&rq->queuelist, &q->requeue_list);
 783         } else {
 784                 list_add_tail(&rq->queuelist, &q->requeue_list);
 785         }
 786         spin_unlock_irqrestore(&q->requeue_lock, flags);
 787
 788         if (kick_requeue_list)
 789                 blk_mq_kick_requeue_list(q);
 790 }
 791 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
 792
 793 void blk_mq_kick_requeue_list(struct request_queue *q)
 794 {
 795         kblockd_schedule_delayed_work(&q->requeue_work, 0);
 796 }
 797 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
 798
 799 void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 800                                     unsigned long msecs)
 801 {
 802         kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work,
 803                                     msecs_to_jiffies(msecs));
 804 }
 805 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 806
 807 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 808 {
 809         if (tag < tags->nr_tags) {
 810                 prefetch(tags->rqs[tag]);
 811                 return tags->rqs[tag];
 812         }
 813
 814         return NULL;
 815 }
 816 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 817
 818 struct blk_mq_timeout_data {
 819         unsigned long next;
 820         unsigned int next_set;
 821         unsigned int nr_expired;
 822 };
 823
 824 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 825 {
 826         const struct blk_mq_ops *ops = req->q->mq_ops;
 827         enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 828
 829         /*
 830          * We know that complete is set at this point. If STARTED isn't set
 831          * anymore, then the request isn't active and the "timeout" should
 832          * just be ignored. This can happen due to the bitflag ordering.
 833          * Timeout first checks if STARTED is set, and if it is, assumes
 834          * the request is active. But if we race with completion, then
 835          * both flags will get cleared. So check here again, and ignore
 836          * a timeout event with a request that isn't active.
 837          */
 838         if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
 839                 return;
 840
 841         if (ops->timeout)
 842                 ret = ops->timeout(req, reserved);
 843
 844         switch (ret) {
 845         case BLK_EH_HANDLED:
 846                 __blk_mq_complete_request(req);
 847                 break;
 848         case BLK_EH_RESET_TIMER:
 849                 /*
 850                  * As nothing prevents from completion happening while
 851                  * ->aborted_gstate is set, this may lead to ignored
 852                  * completions and further spurious timeouts.
 853                  */
 854                 blk_mq_rq_update_aborted_gstate(req, 0);
 855                 blk_add_timer(req);
 856                 blk_clear_rq_complete(req);
 857                 break;
 858         case BLK_EH_NOT_HANDLED:
 859                 break;
 860         default:
 861                 printk(KERN_ERR "block: bad eh return: %d\n", ret);
 862                 break;
 863         }
 864 }
 865
 866 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 867                 struct request *rq, void *priv, bool reserved)
 868 {
 869         struct blk_mq_timeout_data *data = priv;
 870         unsigned long gstate, deadline;
 871         int start;
 872
 873         might_sleep();
 874
 875         if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 876                 return;
 877
 878         /* read coherent snapshots of @rq->state_gen and @rq->deadline */
 879         while (true) {
 880                 start = read_seqcount_begin(&rq->gstate_seq);
 881                 gstate = READ_ONCE(rq->gstate);
 882                 deadline = rq->deadline;
 883                 if (!read_seqcount_retry(&rq->gstate_seq, start))
 884                         break;
 885                 cond_resched();
 886         }
 887
 888         /* if in-flight && overdue, mark for abortion */
 889         if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
 890             time_after_eq(jiffies, deadline)) {
 891                 blk_mq_rq_update_aborted_gstate(rq, gstate);
 892                 data->nr_expired++;
 893                 hctx->nr_expired++;
 894         } else if (!data->next_set || time_after(data->next, deadline)) {
 895                 data->next = deadline;
 896                 data->next_set = 1;
 897         }
 898 }
 899
 900 static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
 901                 struct request *rq, void *priv, bool reserved)
 902 {
 903         /*
 904          * We marked @rq->aborted_gstate and waited for RCU.  If there were
 905          * completions that we lost to, they would have finished and
 906          * updated @rq->gstate by now; otherwise, the completion path is
 907          * now guaranteed to see @rq->aborted_gstate and yield.  If
 908          * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
 909          */
 910         if (READ_ONCE(rq->gstate) == rq->aborted_gstate &&
 911             !blk_mark_rq_complete(rq))
 912                 blk_mq_rq_timed_out(rq, reserved);
 913 }
 914
 915 static void blk_mq_timeout_work(struct work_struct *work)
 916 {
 917         struct request_queue *q =
 918                 container_of(work, struct request_queue, timeout_work);
 919         struct blk_mq_timeout_data data = {
 920                 .next           = 0,
 921                 .next_set       = 0,
 922                 .nr_expired     = 0,
 923         };
 924         struct blk_mq_hw_ctx *hctx;
 925         int i;
 926
 927         /* A deadlock might occur if a request is stuck requiring a
 928          * timeout at the same time a queue freeze is waiting
 929          * completion, since the timeout code would not be able to
 930          * acquire the queue reference here.
 931          *
 932          * That's why we don't use blk_queue_enter here; instead, we use
 933          * percpu_ref_tryget directly, because we need to be able to
 934          * obtain a reference even in the short window between the queue
 935          * starting to freeze, by dropping the first reference in
 936          * blk_freeze_queue_start, and the moment the last request is
 937          * consumed, marked by the instant q_usage_counter reaches
 938          * zero.
 939          */
 940         if (!percpu_ref_tryget(&q->q_usage_counter))
 941                 return;
 942
 943         /* scan for the expired ones and set their ->aborted_gstate */
 944         blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
 945
 946         if (data.nr_expired) {
 947                 bool has_rcu = false;
 948
 949                 /*
 950                  * Wait till everyone sees ->aborted_gstate.  The
 951                  * sequential waits for SRCUs aren't ideal.  If this ever
 952                  * becomes a problem, we can add per-hw_ctx rcu_head and
 953                  * wait in parallel.
 954                  */
 955                 queue_for_each_hw_ctx(q, hctx, i) {
 956                         if (!hctx->nr_expired)
 957                                 continue;
 958
 959                         if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 960                                 has_rcu = true;
 961                         else
 962                                 synchronize_srcu(hctx->queue_rq_srcu);
 963
 964                         hctx->nr_expired = 0;
 965                 }
 966                 if (has_rcu)
 967                         synchronize_rcu();
 968
 969                 /* terminate the ones we won */
 970                 blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
 971         }
 972
 973         if (data.next_set) {
 974                 data.next = blk_rq_timeout(round_jiffies_up(data.next));
 975                 mod_timer(&q->timeout, data.next);
 976         } else {
 977                 queue_for_each_hw_ctx(q, hctx, i) {
 978                         /* the hctx may be unmapped, so check it here */
 979                         if (blk_mq_hw_queue_mapped(hctx))
 980                                 blk_mq_tag_idle(hctx);
 981                 }
 982         }
 983         blk_queue_exit(q);
 984 }
 985
 986 struct flush_busy_ctx_data {
 987         struct blk_mq_hw_ctx *hctx;
 988         struct list_head *list;
 989 };
 990
 991 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 992 {
 993         struct flush_busy_ctx_data *flush_data = data;
 994         struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 995         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
 996
 997         sbitmap_clear_bit(sb, bitnr);
 998         spin_lock(&ctx->lock);
 999         list_splice_tail_init(&ctx->rq_list, flush_data->list);
1000         spin_unlock(&ctx->lock);
1001         return true;
1002 }
1003
1004 /*
1005  * Process software queues that have been marked busy, splicing them
1006  * to the for-dispatch
1007  */
1008 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
1009 {
1010         struct flush_busy_ctx_data data = {
1011                 .hctx = hctx,
1012                 .list = list,
1013         };
1014
1015         sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
1016 }
1017 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
1018
1019 struct dispatch_rq_data {
1020         struct blk_mq_hw_ctx *hctx;
1021         struct request *rq;
1022 };
1023
1024 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
1025                 void *data)
1026 {
1027         struct dispatch_rq_data *dispatch_data = data;
1028         struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
1029         struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
1030
1031         spin_lock(&ctx->lock);
1032         if (unlikely(!list_empty(&ctx->rq_list))) {
1033                 dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
1034                 list_del_init(&dispatch_data->rq->queuelist);
1035                 if (list_empty(&ctx->rq_list))
1036                         sbitmap_clear_bit(sb, bitnr);
1037         }
1038         spin_unlock(&ctx->lock);
1039
1040         return !dispatch_data->rq;
1041 }
1042
1043 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
1044                                         struct blk_mq_ctx *start)
1045 {
1046         unsigned off = start ? start->index_hw : 0;
1047         struct dispatch_rq_data data = {
1048                 .hctx = hctx,
1049                 .rq   = NULL,
1050         };
1051
1052         __sbitmap_for_each_set(&hctx->ctx_map, off,
1053                                dispatch_rq_from_ctx, &data);
1054
1055         return data.rq;
1056 }
1057
1058 static inline unsigned int queued_to_index(unsigned int queued)
1059 {
1060         if (!queued)
1061                 return 0;
1062
1063         return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
1064 }
1065
1066 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
1067                            bool wait)
1068 {
1069         struct blk_mq_alloc_data data = {
1070                 .q = rq->q,
1071                 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
1072                 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
1073         };
1074
1075         might_sleep_if(wait);
1076
1077         if (rq->tag != -1)
1078                 goto done;
1079
1080         if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
1081                 data.flags |= BLK_MQ_REQ_RESERVED;
1082
1083         rq->tag = blk_mq_get_tag(&data);
1084         if (rq->tag >= 0) {
1085                 if (blk_mq_tag_busy(data.hctx)) {
1086                         rq->rq_flags |= RQF_MQ_INFLIGHT;
1087                         atomic_inc(&data.hctx->nr_active);
1088                 }
1089                 data.hctx->tags->rqs[rq->tag] = rq;
1090         }
1091
1092 done:
1093         if (hctx)
1094                 *hctx = data.hctx;
1095         return rq->tag != -1;
1096 }
1097
1098 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
1099                                 int flags, void *key)
1100 {
1101         struct blk_mq_hw_ctx *hctx;
1102
1103         hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
1104
1105         list_del_init(&wait->entry);
1106         blk_mq_run_hw_queue(hctx, true);
1107         return 1;
1108 }
1109
1110 /*
1111  * Mark us waiting for a tag. For shared tags, this involves hooking us into
1112  * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
1113  * restart. For both caes, take care to check the condition again after
1114  * marking us as waiting.
1115  */
1116 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
1117                                  struct request *rq)
1118 {
1119         struct blk_mq_hw_ctx *this_hctx = *hctx;
1120         bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
1121         struct sbq_wait_state *ws;
1122         wait_queue_entry_t *wait;
1123         bool ret;
1124
1125         if (!shared_tags) {
1126                 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
1127                         set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
1128         } else {
1129                 wait = &this_hctx->dispatch_wait;
1130                 if (!list_empty_careful(&wait->entry))
1131                         return false;
1132
1133                 spin_lock(&this_hctx->lock);
1134                 if (!list_empty(&wait->entry)) {
1135                         spin_unlock(&this_hctx->lock);
1136                         return false;
1137                 }
1138
1139                 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
1140                 add_wait_queue(&ws->wait, wait);
1141         }
1142
1143         /*
1144          * It's possible that a tag was freed in the window between the
1145          * allocation failure and adding the hardware queue to the wait
1146          * queue.
1147          */
1148         ret = blk_mq_get_driver_tag(rq, hctx, false);
1149
1150         if (!shared_tags) {
1151                 /*
1152                  * Don't clear RESTART here, someone else could have set it.
1153                  * At most this will cost an extra queue run.
1154                  */
1155                 return ret;
1156         } else {
1157                 if (!ret) {
1158                         spin_unlock(&this_hctx->lock);
1159                         return false;
1160                 }
1161
1162                 /*
1163                  * We got a tag, remove ourselves from the wait queue to ensure
1164                  * someone else gets the wakeup.
1165                  */
1166                 spin_lock_irq(&ws->wait.lock);
1167                 list_del_init(&wait->entry);
1168                 spin_unlock_irq(&ws->wait.lock);
1169                 spin_unlock(&this_hctx->lock);
1170                 return true;
1171         }
1172 }
1173
1174 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
1175                              bool got_budget)
1176 {
1177         struct blk_mq_hw_ctx *hctx;
1178         struct request *rq, *nxt;
1179         bool no_tag = false;
1180         int errors, queued;
1181
1182         if (list_empty(list))
1183                 return false;
1184
1185         WARN_ON(!list_is_singular(list) && got_budget);
1186
1187         /*
1188          * Now process all the entries, sending them to the driver.
1189          */
1190         errors = queued = 0;
1191         do {
1192                 struct blk_mq_queue_data bd;
1193                 blk_status_t ret;
1194
1195                 rq = list_first_entry(list, struct request, queuelist);
1196                 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1197                         /*
1198                          * The initial allocation attempt failed, so we need to
1199                          * rerun the hardware queue when a tag is freed. The
1200                          * waitqueue takes care of that. If the queue is run
1201                          * before we add this entry back on the dispatch list,
1202                          * we'll re-run it below.
1203                          */
1204                         if (!blk_mq_mark_tag_wait(&hctx, rq)) {
1205                                 if (got_budget)
1206                                         blk_mq_put_dispatch_budget(hctx);
1207                                 /*
1208                                  * For non-shared tags, the RESTART check
1209                                  * will suffice.
1210                                  */
1211                                 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1212                                         no_tag = true;
1213                                 break;
1214                         }
1215                 }
1216
1217                 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
1218                         blk_mq_put_driver_tag(rq);
1219                         break;
1220                 }
1221
1222                 list_del_init(&rq->queuelist);
1223
1224                 bd.rq = rq;
1225
1226                 /*
1227                  * Flag last if we have no more requests, or if we have more
1228                  * but can't assign a driver tag to it.
1229                  */
1230                 if (list_empty(list))
1231                         bd.last = true;
1232                 else {
1233                         nxt = list_first_entry(list, struct request, queuelist);
1234                         bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1235                 }
1236
1237                 ret = q->mq_ops->queue_rq(hctx, &bd);
1238                 if (ret == BLK_STS_RESOURCE) {
1239                         /*
1240                          * If an I/O scheduler has been configured and we got a
1241                          * driver tag for the next request already, free it
1242                          * again.
1243                          */
1244                         if (!list_empty(list)) {
1245                                 nxt = list_first_entry(list, struct request, queuelist);
1246                                 blk_mq_put_driver_tag(nxt);
1247                         }
1248                         list_add(&rq->queuelist, list);
1249                         __blk_mq_requeue_request(rq);
1250                         break;
1251                 }
1252
1253                 if (unlikely(ret != BLK_STS_OK)) {
1254                         errors++;
1255                         blk_mq_end_request(rq, BLK_STS_IOERR);
1256                         continue;
1257                 }
1258
1259                 queued++;
1260         } while (!list_empty(list));
1261
1262         hctx->dispatched[queued_to_index(queued)]++;
1263
1264         /*
1265          * Any items that need requeuing? Stuff them into hctx->dispatch,
1266          * that is where we will continue on next queue run.
1267          */
1268         if (!list_empty(list)) {
1269                 spin_lock(&hctx->lock);
1270                 list_splice_init(list, &hctx->dispatch);
1271                 spin_unlock(&hctx->lock);
1272
1273                 /*
1274                  * If SCHED_RESTART was set by the caller of this function and
1275                  * it is no longer set that means that it was cleared by another
1276                  * thread and hence that a queue rerun is needed.
1277                  *
1278                  * If 'no_tag' is set, that means that we failed getting
1279                  * a driver tag with an I/O scheduler attached. If our dispatch
1280                  * waitqueue is no longer active, ensure that we run the queue
1281                  * AFTER adding our entries back to the list.
1282                  *
1283                  * If no I/O scheduler has been configured it is possible that
1284                  * the hardware queue got stopped and restarted before requests
1285                  * were pushed back onto the dispatch list. Rerun the queue to
1286                  * avoid starvation. Notes:
1287                  * - blk_mq_run_hw_queue() checks whether or not a queue has
1288                  *   been stopped before rerunning a queue.
1289                  * - Some but not all block drivers stop a queue before
1290                  *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
1291                  *   and dm-rq.
1292                  */
1293                 if (!blk_mq_sched_needs_restart(hctx) ||
1294                     (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
1295                         blk_mq_run_hw_queue(hctx, true);
1296         }
1297
1298         return (queued + errors) != 0;
1299 }
1300
1301 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1302 {
1303         int srcu_idx;
1304
1305         /*
1306          * We should be running this queue from one of the CPUs that
1307          * are mapped to it.
1308          */
1309         WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1310                 cpu_online(hctx->next_cpu));
1311
1312         /*
1313          * We can't run the queue inline with ints disabled. Ensure that
1314          * we catch bad users of this early.
1315          */
1316         WARN_ON_ONCE(in_interrupt());
1317
1318         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1319
1320         hctx_lock(hctx, &srcu_idx);
1321         blk_mq_sched_dispatch_requests(hctx);
1322         hctx_unlock(hctx, srcu_idx);
1323 }
1324
1325 /*
1326  * It'd be great if the workqueue API had a way to pass
1327  * in a mask and had some smarts for more clever placement.
1328  * For now we just round-robin here, switching for every
1329  * BLK_MQ_CPU_WORK_BATCH queued items.
1330  */
1331 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1332 {
1333         if (hctx->queue->nr_hw_queues == 1)
1334                 return WORK_CPU_UNBOUND;
1335
1336         if (--hctx->next_cpu_batch <= 0) {
1337                 int next_cpu;
1338
1339                 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
1340                 if (next_cpu >= nr_cpu_ids)
1341                         next_cpu = cpumask_first(hctx->cpumask);
1342
1343                 hctx->next_cpu = next_cpu;
1344                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1345         }
1346
1347         return hctx->next_cpu;
1348 }
1349
1350 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1351                                         unsigned long msecs)
1352 {
1353         if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1354                 return;
1355
1356         if (unlikely(blk_mq_hctx_stopped(hctx)))
1357                 return;
1358
1359         if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1360                 int cpu = get_cpu();
1361                 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1362                         __blk_mq_run_hw_queue(hctx);
1363                         put_cpu();
1364                         return;
1365                 }
1366
1367                 put_cpu();
1368         }
1369
1370         kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1371                                          &hctx->run_work,
1372                                          msecs_to_jiffies(msecs));
1373 }
1374
1375 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1376 {
1377         __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1378 }
1379 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1380
1381 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1382 {
1383         int srcu_idx;
1384         bool need_run;
1385
1386         /*
1387          * When queue is quiesced, we may be switching io scheduler, or
1388          * updating nr_hw_queues, or other things, and we can't run queue
1389          * any more, even __blk_mq_hctx_has_pending() can't be called safely.
1390          *
1391          * And queue will be rerun in blk_mq_unquiesce_queue() if it is
1392          * quiesced.
1393          */
1394         hctx_lock(hctx, &srcu_idx);
1395         need_run = !blk_queue_quiesced(hctx->queue) &&
1396                 blk_mq_hctx_has_pending(hctx);
1397         hctx_unlock(hctx, srcu_idx);
1398
1399         if (need_run) {
1400                 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1401                 return true;
1402         }
1403
1404         return false;
1405 }
1406 EXPORT_SYMBOL(blk_mq_run_hw_queue);
1407
1408 void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1409 {
1410         struct blk_mq_hw_ctx *hctx;
1411         int i;
1412
1413         queue_for_each_hw_ctx(q, hctx, i) {
1414                 if (blk_mq_hctx_stopped(hctx))
1415                         continue;
1416
1417                 blk_mq_run_hw_queue(hctx, async);
1418         }
1419 }
1420 EXPORT_SYMBOL(blk_mq_run_hw_queues);
1421
1422 /**
1423  * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
1424  * @q: request queue.
1425  *
1426  * The caller is responsible for serializing this function against
1427  * blk_mq_{start,stop}_hw_queue().
1428  */
1429 bool blk_mq_queue_stopped(struct request_queue *q)
1430 {
1431         struct blk_mq_hw_ctx *hctx;
1432         int i;
1433
1434         queue_for_each_hw_ctx(q, hctx, i)
1435                 if (blk_mq_hctx_stopped(hctx))
1436                         return true;
1437
1438         return false;
1439 }
1440 EXPORT_SYMBOL(blk_mq_queue_stopped);
1441
1442 /*
1443  * This function is often used for pausing .queue_rq() by driver when
1444  * there isn't enough resource or some conditions aren't satisfied, and
1445  * BLK_STS_RESOURCE is usually returned.
1446  *
1447  * We do not guarantee that dispatch can be drained or blocked
1448  * after blk_mq_stop_hw_queue() returns. Please use
1449  * blk_mq_quiesce_queue() for that requirement.
1450  */
1451 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1452 {
1453         cancel_delayed_work(&hctx->run_work);
1454
1455         set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1456 }
1457 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1458
1459 /*
1460  * This function is often used for pausing .queue_rq() by driver when
1461  * there isn't enough resource or some conditions aren't satisfied, and
1462  * BLK_STS_RESOURCE is usually returned.
1463  *
1464  * We do not guarantee that dispatch can be drained or blocked
1465  * after blk_mq_stop_hw_queues() returns. Please use
1466  * blk_mq_quiesce_queue() for that requirement.
1467  */
1468 void blk_mq_stop_hw_queues(struct request_queue *q)
1469 {
1470         struct blk_mq_hw_ctx *hctx;
1471         int i;
1472
1473         queue_for_each_hw_ctx(q, hctx, i)
1474                 blk_mq_stop_hw_queue(hctx);
1475 }
1476 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1477
1478 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1479 {
1480         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1481
1482         blk_mq_run_hw_queue(hctx, false);
1483 }
1484 EXPORT_SYMBOL(blk_mq_start_hw_queue);
1485
1486 void blk_mq_start_hw_queues(struct request_queue *q)
1487 {
1488         struct blk_mq_hw_ctx *hctx;
1489         int i;
1490
1491         queue_for_each_hw_ctx(q, hctx, i)
1492                 blk_mq_start_hw_queue(hctx);
1493 }
1494 EXPORT_SYMBOL(blk_mq_start_hw_queues);
1495
1496 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1497 {
1498         if (!blk_mq_hctx_stopped(hctx))
1499                 return;
1500
1501         clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1502         blk_mq_run_hw_queue(hctx, async);
1503 }
1504 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
1505
1506 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1507 {
1508         struct blk_mq_hw_ctx *hctx;
1509         int i;
1510
1511         queue_for_each_hw_ctx(q, hctx, i)
1512                 blk_mq_start_stopped_hw_queue(hctx, async);
1513 }
1514 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1515
1516 static void blk_mq_run_work_fn(struct work_struct *work)
1517 {
1518         struct blk_mq_hw_ctx *hctx;
1519
1520         hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1521
1522         /*
1523          * If we are stopped, don't run the queue. The exception is if
1524          * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
1525          * the STOPPED bit and run it.
1526          */
1527         if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
1528                 if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
1529                         return;
1530
1531                 clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1532                 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1533         }
1534
1535         __blk_mq_run_hw_queue(hctx);
1536 }
1537
1538
1539 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1540 {
1541         if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
1542                 return;
1543
1544         /*
1545          * Stop the hw queue, then modify currently delayed work.
1546          * This should prevent us from running the queue prematurely.
1547          * Mark the queue as auto-clearing STOPPED when it runs.
1548          */
1549         blk_mq_stop_hw_queue(hctx);
1550         set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
1551         kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1552                                         &hctx->run_work,
1553                                         msecs_to_jiffies(msecs));
1554 }
1555 EXPORT_SYMBOL(blk_mq_delay_queue);
1556
1557 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1558                                             struct request *rq,
1559                                             bool at_head)
1560 {
1561         struct blk_mq_ctx *ctx = rq->mq_ctx;
1562
1563         lockdep_assert_held(&ctx->lock);
1564
1565         trace_block_rq_insert(hctx->queue, rq);
1566
1567         if (at_head)
1568                 list_add(&rq->queuelist, &ctx->rq_list);
1569         else
1570                 list_add_tail(&rq->queuelist, &ctx->rq_list);
1571 }
1572
1573 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1574                              bool at_head)
1575 {
1576         struct blk_mq_ctx *ctx = rq->mq_ctx;
1577
1578         lockdep_assert_held(&ctx->lock);
1579
1580         __blk_mq_insert_req_list(hctx, rq, at_head);
1581         blk_mq_hctx_mark_pending(hctx, ctx);
1582 }
1583
1584 /*
1585  * Should only be used carefully, when the caller knows we want to
1586  * bypass a potential IO scheduler on the target device.
1587  */
1588 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1589 {
1590         struct blk_mq_ctx *ctx = rq->mq_ctx;
1591         struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1592
1593         spin_lock(&hctx->lock);
1594         list_add_tail(&rq->queuelist, &hctx->dispatch);
1595         spin_unlock(&hctx->lock);
1596
1597         if (run_queue)
1598                 blk_mq_run_hw_queue(hctx, false);
1599 }
1600
1601 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
1602                             struct list_head *list)
1603
1604 {
1605         /*
1606          * preemption doesn't flush plug list, so it's possible ctx->cpu is
1607          * offline now
1608          */
1609         spin_lock(&ctx->lock);
1610         while (!list_empty(list)) {
1611                 struct request *rq;
1612
1613                 rq = list_first_entry(list, struct request, queuelist);
1614                 BUG_ON(rq->mq_ctx != ctx);
1615                 list_del_init(&rq->queuelist);
1616                 __blk_mq_insert_req_list(hctx, rq, false);
1617         }
1618         blk_mq_hctx_mark_pending(hctx, ctx);
1619         spin_unlock(&ctx->lock);
1620 }
1621
1622 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1623 {
1624         struct request *rqa = container_of(a, struct request, queuelist);
1625         struct request *rqb = container_of(b, struct request, queuelist);
1626
1627         return !(rqa->mq_ctx < rqb->mq_ctx ||
1628                  (rqa->mq_ctx == rqb->mq_ctx &&
1629                   blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1630 }
1631
1632 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1633 {
1634         struct blk_mq_ctx *this_ctx;
1635         struct request_queue *this_q;
1636         struct request *rq;
1637         LIST_HEAD(list);
1638         LIST_HEAD(ctx_list);
1639         unsigned int depth;
1640
1641         list_splice_init(&plug->mq_list, &list);
1642
1643         list_sort(NULL, &list, plug_ctx_cmp);
1644
1645         this_q = NULL;
1646         this_ctx = NULL;
1647         depth = 0;
1648
1649         while (!list_empty(&list)) {
1650                 rq = list_entry_rq(list.next);
1651                 list_del_init(&rq->queuelist);
1652                 BUG_ON(!rq->q);
1653                 if (rq->mq_ctx != this_ctx) {
1654                         if (this_ctx) {
1655                                 trace_block_unplug(this_q, depth, from_schedule);
1656                                 blk_mq_sched_insert_requests(this_q, this_ctx,
1657                                                                 &ctx_list,
1658                                                                 from_schedule);
1659                         }
1660
1661                         this_ctx = rq->mq_ctx;
1662                         this_q = rq->q;
1663                         depth = 0;
1664                 }
1665
1666                 depth++;
1667                 list_add_tail(&rq->queuelist, &ctx_list);
1668         }
1669
1670         /*
1671          * If 'this_ctx' is set, we know we have entries to complete
1672          * on 'ctx_list'. Do those.
1673          */
1674         if (this_ctx) {
1675                 trace_block_unplug(this_q, depth, from_schedule);
1676                 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1677                                                 from_schedule);
1678         }
1679 }
1680
1681 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1682 {
1683         blk_init_request_from_bio(rq, bio);
1684
1685         blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
1686
1687         blk_account_io_start(rq, true);
1688 }
1689
1690 static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
1691                                    struct blk_mq_ctx *ctx,
1692                                    struct request *rq)
1693 {
1694         spin_lock(&ctx->lock);
1695         __blk_mq_insert_request(hctx, rq, false);
1696         spin_unlock(&ctx->lock);
1697 }
1698
1699 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
1700 {
1701         if (rq->tag != -1)
1702                 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
1703
1704         return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
1705 }
1706
1707 static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1708                                         struct request *rq,
1709                                         blk_qc_t *cookie)
1710 {
1711         struct request_queue *q = rq->q;
1712         struct blk_mq_queue_data bd = {
1713                 .rq = rq,
1714                 .last = true,
1715         };
1716         blk_qc_t new_cookie;
1717         blk_status_t ret;
1718         bool run_queue = true;
1719
1720         /* RCU or SRCU read lock is needed before checking quiesced flag */
1721         if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
1722                 run_queue = false;
1723                 goto insert;
1724         }
1725
1726         if (q->elevator)
1727                 goto insert;
1728
1729         if (!blk_mq_get_driver_tag(rq, NULL, false))
1730                 goto insert;
1731
1732         if (!blk_mq_get_dispatch_budget(hctx)) {
1733                 blk_mq_put_driver_tag(rq);
1734                 goto insert;
1735         }
1736
1737         new_cookie = request_to_qc_t(hctx, rq);
1738
1739         /*
1740          * For OK queue, we are done. For error, kill it. Any other
1741          * error (busy), just add it to our list as we previously
1742          * would have done
1743          */
1744         ret = q->mq_ops->queue_rq(hctx, &bd);
1745         switch (ret) {
1746         case BLK_STS_OK:
1747                 *cookie = new_cookie;
1748                 return;
1749         case BLK_STS_RESOURCE:
1750                 __blk_mq_requeue_request(rq);
1751                 goto insert;
1752         default:
1753                 *cookie = BLK_QC_T_NONE;
1754                 blk_mq_end_request(rq, ret);
1755                 return;
1756         }
1757
1758 insert:
1759         blk_mq_sched_insert_request(rq, false, run_queue, false,
1760                                         hctx->flags & BLK_MQ_F_BLOCKING);
1761 }
1762
1763 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1764                 struct request *rq, blk_qc_t *cookie)
1765 {
1766         int srcu_idx;
1767
1768         might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
1769
1770         hctx_lock(hctx, &srcu_idx);
1771         __blk_mq_try_issue_directly(hctx, rq, cookie);
1772         hctx_unlock(hctx, srcu_idx);
1773 }
1774
1775 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
1776 {
1777         const int is_sync = op_is_sync(bio->bi_opf);
1778         const int is_flush_fua = op_is_flush(bio->bi_opf);
1779         struct blk_mq_alloc_data data = { .flags = 0 };
1780         struct request *rq;
1781         unsigned int request_count = 0;
1782         struct blk_plug *plug;
1783         struct request *same_queue_rq = NULL;
1784         blk_qc_t cookie;
1785         unsigned int wb_acct;
1786
1787         blk_queue_bounce(q, &bio);
1788
1789         blk_queue_split(q, &bio);
1790
1791         if (!bio_integrity_prep(bio))
1792                 return BLK_QC_T_NONE;
1793
1794         if (!is_flush_fua && !blk_queue_nomerges(q) &&
1795             blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1796                 return BLK_QC_T_NONE;
1797
1798         if (blk_mq_sched_bio_merge(q, bio))
1799                 return BLK_QC_T_NONE;
1800
1801         wb_acct = wbt_wait(q->rq_wb, bio, NULL);
1802
1803         trace_block_getrq(q, bio, bio->bi_opf);
1804
1805         rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
1806         if (unlikely(!rq)) {
1807                 __wbt_done(q->rq_wb, wb_acct);
1808                 if (bio->bi_opf & REQ_NOWAIT)
1809                         bio_wouldblock_error(bio);
1810                 return BLK_QC_T_NONE;
1811         }
1812
1813         wbt_track(&rq->issue_stat, wb_acct);
1814
1815         cookie = request_to_qc_t(data.hctx, rq);
1816
1817         plug = current->plug;
1818         if (unlikely(is_flush_fua)) {
1819                 blk_mq_put_ctx(data.ctx);
1820                 blk_mq_bio_to_request(rq, bio);
1821
1822                 /* bypass scheduler for flush rq */
1823                 blk_insert_flush(rq);
1824                 blk_mq_run_hw_queue(data.hctx, true);
1825         } else if (plug && q->nr_hw_queues == 1) {
1826                 struct request *last = NULL;
1827
1828                 blk_mq_put_ctx(data.ctx);
1829                 blk_mq_bio_to_request(rq, bio);
1830
1831                 /*
1832                  * @request_count may become stale because of schedule
1833                  * out, so check the list again.
1834                  */
1835                 if (list_empty(&plug->mq_list))
1836                         request_count = 0;
1837                 else if (blk_queue_nomerges(q))
1838                         request_count = blk_plug_queued_count(q);
1839
1840                 if (!request_count)
1841                         trace_block_plug(q);
1842                 else
1843                         last = list_entry_rq(plug->mq_list.prev);
1844
1845                 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1846                     blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1847                         blk_flush_plug_list(plug, false);
1848                         trace_block_plug(q);
1849                 }
1850
1851                 list_add_tail(&rq->queuelist, &plug->mq_list);
1852         } else if (plug && !blk_queue_nomerges(q)) {
1853                 blk_mq_bio_to_request(rq, bio);
1854
1855                 /*
1856                  * We do limited plugging. If the bio can be merged, do that.
1857                  * Otherwise the existing request in the plug list will be
1858                  * issued. So the plug list will have one request at most
1859                  * The plug list might get flushed before this. If that happens,
1860                  * the plug list is empty, and same_queue_rq is invalid.
1861                  */
1862                 if (list_empty(&plug->mq_list))
1863                         same_queue_rq = NULL;
1864                 if (same_queue_rq)
1865                         list_del_init(&same_queue_rq->queuelist);
1866                 list_add_tail(&rq->queuelist, &plug->mq_list);
1867
1868                 blk_mq_put_ctx(data.ctx);
1869
1870                 if (same_queue_rq) {
1871                         data.hctx = blk_mq_map_queue(q,
1872                                         same_queue_rq->mq_ctx->cpu);
1873                         blk_mq_try_issue_directly(data.hctx, same_queue_rq,
1874                                         &cookie);
1875                 }
1876         } else if (q->nr_hw_queues > 1 && is_sync) {
1877                 blk_mq_put_ctx(data.ctx);
1878                 blk_mq_bio_to_request(rq, bio);
1879                 blk_mq_try_issue_directly(data.hctx, rq, &cookie);
1880         } else if (q->elevator) {
1881                 blk_mq_put_ctx(data.ctx);
1882                 blk_mq_bio_to_request(rq, bio);
1883                 blk_mq_sched_insert_request(rq, false, true, true, true);
1884         } else {
1885                 blk_mq_put_ctx(data.ctx);
1886                 blk_mq_bio_to_request(rq, bio);
1887                 blk_mq_queue_io(data.hctx, data.ctx, rq);
1888                 blk_mq_run_hw_queue(data.hctx, true);
1889         }
1890
1891         return cookie;
1892 }
1893
1894 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1895                      unsigned int hctx_idx)
1896 {
1897         struct page *page;
1898
1899         if (tags->rqs && set->ops->exit_request) {
1900                 int i;
1901
1902                 for (i = 0; i < tags->nr_tags; i++) {
1903                         struct request *rq = tags->static_rqs[i];
1904
1905                         if (!rq)
1906                                 continue;
1907                         set->ops->exit_request(set, rq, hctx_idx);
1908                         tags->static_rqs[i] = NULL;
1909                 }
1910         }
1911
1912         while (!list_empty(&tags->page_list)) {
1913                 page = list_first_entry(&tags->page_list, struct page, lru);
1914                 list_del_init(&page->lru);
1915                 /*
1916                  * Remove kmemleak object previously allocated in
1917                  * blk_mq_init_rq_map().
1918                  */
1919                 kmemleak_free(page_address(page));
1920                 __free_pages(page, page->private);
1921         }
1922 }
1923
1924 void blk_mq_free_rq_map(struct blk_mq_tags *tags)
1925 {
1926         kfree(tags->rqs);
1927         tags->rqs = NULL;
1928         kfree(tags->static_rqs);
1929         tags->static_rqs = NULL;
1930
1931         blk_mq_free_tags(tags);
1932 }
1933
1934 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1935                                         unsigned int hctx_idx,
1936                                         unsigned int nr_tags,
1937                                         unsigned int reserved_tags)
1938 {
1939         struct blk_mq_tags *tags;
1940         int node;
1941
1942         node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
1943         if (node == NUMA_NO_NODE)
1944                 node = set->numa_node;
1945
1946         tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
1947                                 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1948         if (!tags)
1949                 return NULL;
1950
1951         tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1952                                  GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1953                                  node);
1954         if (!tags->rqs) {
1955                 blk_mq_free_tags(tags);
1956                 return NULL;
1957         }
1958
1959         tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1960                                  GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1961                                  node);
1962         if (!tags->static_rqs) {
1963                 kfree(tags->rqs);
1964                 blk_mq_free_tags(tags);
1965                 return NULL;
1966         }
1967
1968         return tags;
1969 }
1970
1971 static size_t order_to_size(unsigned int order)
1972 {
1973         return (size_t)PAGE_SIZE << order;
1974 }
1975
1976 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
1977                                unsigned int hctx_idx, int node)
1978 {
1979         int ret;
1980
1981         if (set->ops->init_request) {
1982                 ret = set->ops->init_request(set, rq, hctx_idx, node);
1983                 if (ret)
1984                         return ret;
1985         }
1986
1987         seqcount_init(&rq->gstate_seq);
1988         u64_stats_init(&rq->aborted_gstate_sync);
1989         return 0;
1990 }
1991
1992 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1993                      unsigned int hctx_idx, unsigned int depth)
1994 {
1995         unsigned int i, j, entries_per_page, max_order = 4;
1996         size_t rq_size, left;
1997         int node;
1998
1999         node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
2000         if (node == NUMA_NO_NODE)
2001                 node = set->numa_node;
2002
2003         INIT_LIST_HEAD(&tags->page_list);
2004
2005         /*
2006          * rq_size is the size of the request plus driver payload, rounded
2007          * to the cacheline size
2008          */
2009         rq_size = round_up(sizeof(struct request) + set->cmd_size,
2010                                 cache_line_size());
2011         left = rq_size * depth;
2012
2013         for (i = 0; i < depth; ) {
2014                 int this_order = max_order;
2015                 struct page *page;
2016                 int to_do;
2017                 void *p;
2018
2019                 while (this_order && left < order_to_size(this_order - 1))
2020                         this_order--;
2021
2022                 do {
2023                         page = alloc_pages_node(node,
2024                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
2025                                 this_order);
2026                         if (page)
2027                                 break;
2028                         if (!this_order--)
2029                                 break;
2030                         if (order_to_size(this_order) < rq_size)
2031                                 break;
2032                 } while (1);
2033
2034                 if (!page)
2035                         goto fail;
2036
2037                 page->private = this_order;
2038                 list_add_tail(&page->lru, &tags->page_list);
2039
2040                 p = page_address(page);
2041                 /*
2042                  * Allow kmemleak to scan these pages as they contain pointers
2043                  * to additional allocations like via ops->init_request().
2044                  */
2045                 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
2046                 entries_per_page = order_to_size(this_order) / rq_size;
2047                 to_do = min(entries_per_page, depth - i);
2048                 left -= to_do * rq_size;
2049                 for (j = 0; j < to_do; j++) {
2050                         struct request *rq = p;
2051
2052                         tags->static_rqs[i] = rq;
2053                         if (blk_mq_init_request(set, rq, hctx_idx, node)) {
2054                                 tags->static_rqs[i] = NULL;
2055                                 goto fail;
2056                         }
2057
2058                         p += rq_size;
2059                         i++;
2060                 }
2061         }
2062         return 0;
2063
2064 fail:
2065         blk_mq_free_rqs(set, tags, hctx_idx);
2066         return -ENOMEM;
2067 }
2068
2069 /*
2070  * 'cpu' is going away. splice any existing rq_list entries from this
2071  * software queue to the hw queue dispatch list, and ensure that it
2072  * gets run.
2073  */
2074 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
2075 {
2076         struct blk_mq_hw_ctx *hctx;
2077         struct blk_mq_ctx *ctx;
2078         LIST_HEAD(tmp);
2079
2080         hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
2081         ctx = __blk_mq_get_ctx(hctx->queue, cpu);
2082
2083         spin_lock(&ctx->lock);
2084         if (!list_empty(&ctx->rq_list)) {
2085                 list_splice_init(&ctx->rq_list, &tmp);
2086                 blk_mq_hctx_clear_pending(hctx, ctx);
2087         }
2088         spin_unlock(&ctx->lock);
2089
2090         if (list_empty(&tmp))
2091                 return 0;
2092
2093         spin_lock(&hctx->lock);
2094         list_splice_tail_init(&tmp, &hctx->dispatch);
2095         spin_unlock(&hctx->lock);
2096
2097         blk_mq_run_hw_queue(hctx, true);
2098         return 0;
2099 }
2100
2101 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
2102 {
2103         cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
2104                                             &hctx->cpuhp_dead);
2105 }
2106
2107 /* hctx->ctxs will be freed in queue's release handler */
2108 static void blk_mq_exit_hctx(struct request_queue *q,
2109                 struct blk_mq_tag_set *set,
2110                 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
2111 {
2112         blk_mq_debugfs_unregister_hctx(hctx);
2113
2114         if (blk_mq_hw_queue_mapped(hctx))
2115                 blk_mq_tag_idle(hctx);
2116
2117         if (set->ops->exit_request)
2118                 set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
2119
2120         blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
2121
2122         if (set->ops->exit_hctx)
2123                 set->ops->exit_hctx(hctx, hctx_idx);
2124
2125         if (hctx->flags & BLK_MQ_F_BLOCKING)
2126                 cleanup_srcu_struct(hctx->queue_rq_srcu);
2127
2128         blk_mq_remove_cpuhp(hctx);
2129         blk_free_flush_queue(hctx->fq);
2130         sbitmap_free(&hctx->ctx_map);
2131 }
2132
2133 static void blk_mq_exit_hw_queues(struct request_queue *q,
2134                 struct blk_mq_tag_set *set, int nr_queue)
2135 {
2136         struct blk_mq_hw_ctx *hctx;
2137         unsigned int i;
2138
2139         queue_for_each_hw_ctx(q, hctx, i) {
2140                 if (i == nr_queue)
2141                         break;
2142                 blk_mq_exit_hctx(q, set, hctx, i);
2143         }
2144 }
2145
2146 static int blk_mq_init_hctx(struct request_queue *q,
2147                 struct blk_mq_tag_set *set,
2148                 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
2149 {
2150         int node;
2151
2152         node = hctx->numa_node;
2153         if (node == NUMA_NO_NODE)
2154                 node = hctx->numa_node = set->numa_node;
2155
2156         INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
2157         spin_lock_init(&hctx->lock);
2158         INIT_LIST_HEAD(&hctx->dispatch);
2159         hctx->queue = q;
2160         hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
2161
2162         cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
2163
2164         hctx->tags = set->tags[hctx_idx];
2165
2166         /*
2167          * Allocate space for all possible cpus to avoid allocation at
2168          * runtime
2169          */
2170         hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
2171                                         GFP_KERNEL, node);
2172         if (!hctx->ctxs)
2173                 goto unregister_cpu_notifier;
2174
2175         if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
2176                               node))
2177                 goto free_ctxs;
2178
2179         hctx->nr_ctx = 0;
2180
2181         init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
2182         INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
2183
2184         if (set->ops->init_hctx &&
2185             set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
2186                 goto free_bitmap;
2187
2188         if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
2189                 goto exit_hctx;
2190
2191         hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
2192         if (!hctx->fq)
2193                 goto sched_exit_hctx;
2194
2195         if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
2196                 goto free_fq;
2197
2198         if (hctx->flags & BLK_MQ_F_BLOCKING)
2199                 init_srcu_struct(hctx->queue_rq_srcu);
2200
2201         blk_mq_debugfs_register_hctx(q, hctx);
2202
2203         return 0;
2204
2205  free_fq:
2206         kfree(hctx->fq);
2207  sched_exit_hctx:
2208         blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
2209  exit_hctx:
2210         if (set->ops->exit_hctx)
2211                 set->ops->exit_hctx(hctx, hctx_idx);
2212  free_bitmap:
2213         sbitmap_free(&hctx->ctx_map);
2214  free_ctxs:
2215         kfree(hctx->ctxs);
2216  unregister_cpu_notifier:
2217         blk_mq_remove_cpuhp(hctx);
2218         return -1;
2219 }
2220
2221 static void blk_mq_init_cpu_queues(struct request_queue *q,
2222                                    unsigned int nr_hw_queues)
2223 {
2224         unsigned int i;
2225
2226         for_each_possible_cpu(i) {
2227                 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2228                 struct blk_mq_hw_ctx *hctx;
2229
2230                 __ctx->cpu = i;
2231                 spin_lock_init(&__ctx->lock);
2232                 INIT_LIST_HEAD(&__ctx->rq_list);
2233                 __ctx->queue = q;
2234
2235                 /* If the cpu isn't present, the cpu is mapped to first hctx */
2236                 if (!cpu_present(i))
2237                         continue;
2238
2239                 hctx = blk_mq_map_queue(q, i);
2240
2241                 /*
2242                  * Set local node, IFF we have more than one hw queue. If
2243                  * not, we remain on the home node of the device
2244                  */
2245                 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2246                         hctx->numa_node = local_memory_node(cpu_to_node(i));
2247         }
2248 }
2249
2250 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2251 {
2252         int ret = 0;
2253
2254         set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2255                                         set->queue_depth, set->reserved_tags);
2256         if (!set->tags[hctx_idx])
2257                 return false;
2258
2259         ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2260                                 set->queue_depth);
2261         if (!ret)
2262                 return true;
2263
2264         blk_mq_free_rq_map(set->tags[hctx_idx]);
2265         set->tags[hctx_idx] = NULL;
2266         return false;
2267 }
2268
2269 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2270                                          unsigned int hctx_idx)
2271 {
2272         if (set->tags[hctx_idx]) {
2273                 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2274                 blk_mq_free_rq_map(set->tags[hctx_idx]);
2275                 set->tags[hctx_idx] = NULL;
2276         }
2277 }
2278
2279 static void blk_mq_map_swqueue(struct request_queue *q)
2280 {
2281         unsigned int i, hctx_idx;
2282         struct blk_mq_hw_ctx *hctx;
2283         struct blk_mq_ctx *ctx;
2284         struct blk_mq_tag_set *set = q->tag_set;
2285
2286         /*
2287          * Avoid others reading imcomplete hctx->cpumask through sysfs
2288          */
2289         mutex_lock(&q->sysfs_lock);
2290
2291         queue_for_each_hw_ctx(q, hctx, i) {
2292                 cpumask_clear(hctx->cpumask);
2293                 hctx->nr_ctx = 0;
2294         }
2295
2296         /*
2297          * Map software to hardware queues.
2298          *
2299          * If the cpu isn't present, the cpu is mapped to first hctx.
2300          */
2301         for_each_present_cpu(i) {
2302                 hctx_idx = q->mq_map[i];
2303                 /* unmapped hw queue can be remapped after CPU topo changed */
2304                 if (!set->tags[hctx_idx] &&
2305                     !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2306                         /*
2307                          * If tags initialization fail for some hctx,
2308                          * that hctx won't be brought online.  In this
2309                          * case, remap the current ctx to hctx[0] which
2310                          * is guaranteed to always have tags allocated
2311                          */
2312                         q->mq_map[i] = 0;
2313                 }
2314
2315                 ctx = per_cpu_ptr(q->queue_ctx, i);
2316                 hctx = blk_mq_map_queue(q, i);
2317
2318                 cpumask_set_cpu(i, hctx->cpumask);
2319                 ctx->index_hw = hctx->nr_ctx;
2320                 hctx->ctxs[hctx->nr_ctx++] = ctx;
2321         }
2322
2323         mutex_unlock(&q->sysfs_lock);
2324
2325         queue_for_each_hw_ctx(q, hctx, i) {
2326                 /*
2327                  * If no software queues are mapped to this hardware queue,
2328                  * disable it and free the request entries.
2329                  */
2330                 if (!hctx->nr_ctx) {
2331                         /* Never unmap queue 0.  We need it as a
2332                          * fallback in case of a new remap fails
2333                          * allocation
2334                          */
2335                         if (i && set->tags[i])
2336                                 blk_mq_free_map_and_requests(set, i);
2337
2338                         hctx->tags = NULL;
2339                         continue;
2340                 }
2341
2342                 hctx->tags = set->tags[i];
2343                 WARN_ON(!hctx->tags);
2344
2345                 /*
2346                  * Set the map size to the number of mapped software queues.
2347                  * This is more accurate and more efficient than looping
2348                  * over all possibly mapped software queues.
2349                  */
2350                 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2351
2352                 /*
2353                  * Initialize batch roundrobin counts
2354                  */
2355                 hctx->next_cpu = cpumask_first(hctx->cpumask);
2356                 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2357         }
2358 }
2359
2360 /*
2361  * Caller needs to ensure that we're either frozen/quiesced, or that
2362  * the queue isn't live yet.
2363  */
2364 static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2365 {
2366         struct blk_mq_hw_ctx *hctx;
2367         int i;
2368
2369         queue_for_each_hw_ctx(q, hctx, i) {
2370                 if (shared) {
2371                         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2372                                 atomic_inc(&q->shared_hctx_restart);
2373                         hctx->flags |= BLK_MQ_F_TAG_SHARED;
2374                 } else {
2375                         if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2376                                 atomic_dec(&q->shared_hctx_restart);
2377                         hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2378                 }
2379         }
2380 }
2381
2382 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2383                                         bool shared)
2384 {
2385         struct request_queue *q;
2386
2387         lockdep_assert_held(&set->tag_list_lock);
2388
2389         list_for_each_entry(q, &set->tag_list, tag_set_list) {
2390                 blk_mq_freeze_queue(q);
2391                 queue_set_hctx_shared(q, shared);
2392                 blk_mq_unfreeze_queue(q);
2393         }
2394 }
2395
2396 static void blk_mq_del_queue_tag_set(struct request_queue *q)
2397 {
2398         struct blk_mq_tag_set *set = q->tag_set;
2399
2400         mutex_lock(&set->tag_list_lock);
2401         list_del_rcu(&q->tag_set_list);
2402         INIT_LIST_HEAD(&q->tag_set_list);
2403         if (list_is_singular(&set->tag_list)) {
2404                 /* just transitioned to unshared */
2405                 set->flags &= ~BLK_MQ_F_TAG_SHARED;
2406                 /* update existing queue */
2407                 blk_mq_update_tag_set_depth(set, false);
2408         }
2409         mutex_unlock(&set->tag_list_lock);
2410
2411         synchronize_rcu();
2412 }
2413
2414 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2415                                      struct request_queue *q)
2416 {
2417         q->tag_set = set;
2418
2419         mutex_lock(&set->tag_list_lock);
2420
2421         /*
2422          * Check to see if we're transitioning to shared (from 1 to 2 queues).
2423          */
2424         if (!list_empty(&set->tag_list) &&
2425             !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2426                 set->flags |= BLK_MQ_F_TAG_SHARED;
2427                 /* update existing queue */
2428                 blk_mq_update_tag_set_depth(set, true);
2429         }
2430         if (set->flags & BLK_MQ_F_TAG_SHARED)
2431                 queue_set_hctx_shared(q, true);
2432         list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2433
2434         mutex_unlock(&set->tag_list_lock);
2435 }
2436
2437 /*
2438  * It is the actual release handler for mq, but we do it from
2439  * request queue's release handler for avoiding use-after-free
2440  * and headache because q->mq_kobj shouldn't have been introduced,
2441  * but we can't group ctx/kctx kobj without it.
2442  */
2443 void blk_mq_release(struct request_queue *q)
2444 {
2445         struct blk_mq_hw_ctx *hctx;
2446         unsigned int i;
2447
2448         /* hctx kobj stays in hctx */
2449         queue_for_each_hw_ctx(q, hctx, i) {
2450                 if (!hctx)
2451                         continue;
2452                 kobject_put(&hctx->kobj);
2453         }
2454
2455         q->mq_map = NULL;
2456
2457         kfree(q->queue_hw_ctx);
2458
2459         /*
2460          * release .mq_kobj and sw queue's kobject now because
2461          * both share lifetime with request queue.
2462          */
2463         blk_mq_sysfs_deinit(q);
2464
2465         free_percpu(q->queue_ctx);
2466 }
2467
2468 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2469 {
2470         struct request_queue *uninit_q, *q;
2471
2472         uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2473         if (!uninit_q)
2474                 return ERR_PTR(-ENOMEM);
2475
2476         q = blk_mq_init_allocated_queue(set, uninit_q);
2477         if (IS_ERR(q))
2478                 blk_cleanup_queue(uninit_q);
2479
2480         return q;
2481 }
2482 EXPORT_SYMBOL(blk_mq_init_queue);
2483
2484 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
2485 {
2486         int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
2487
2488         BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
2489                            __alignof__(struct blk_mq_hw_ctx)) !=
2490                      sizeof(struct blk_mq_hw_ctx));
2491
2492         if (tag_set->flags & BLK_MQ_F_BLOCKING)
2493                 hw_ctx_size += sizeof(struct srcu_struct);
2494
2495         return hw_ctx_size;
2496 }
2497
2498 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2499                                                 struct request_queue *q)
2500 {
2501         int i, j;
2502         struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2503
2504         blk_mq_sysfs_unregister(q);
2505
2506         /* protect against switching io scheduler  */
2507         mutex_lock(&q->sysfs_lock);
2508         for (i = 0; i < set->nr_hw_queues; i++) {
2509                 int node;
2510
2511                 if (hctxs[i])
2512                         continue;
2513
2514                 node = blk_mq_hw_queue_to_node(q->mq_map, i);
2515                 hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set),
2516                                         GFP_KERNEL, node);
2517                 if (!hctxs[i])
2518                         break;
2519
2520                 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2521                                                 node)) {
2522                         kfree(hctxs[i]);
2523                         hctxs[i] = NULL;
2524                         break;
2525                 }
2526
2527                 atomic_set(&hctxs[i]->nr_active, 0);
2528                 hctxs[i]->numa_node = node;
2529                 hctxs[i]->queue_num = i;
2530
2531                 if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2532                         free_cpumask_var(hctxs[i]->cpumask);
2533                         kfree(hctxs[i]);
2534                         hctxs[i] = NULL;
2535                         break;
2536                 }
2537                 blk_mq_hctx_kobj_init(hctxs[i]);
2538         }
2539         for (j = i; j < q->nr_hw_queues; j++) {
2540                 struct blk_mq_hw_ctx *hctx = hctxs[j];
2541
2542                 if (hctx) {
2543                         if (hctx->tags)
2544                                 blk_mq_free_map_and_requests(set, j);
2545                         blk_mq_exit_hctx(q, set, hctx, j);
2546                         kobject_put(&hctx->kobj);
2547                         hctxs[j] = NULL;
2548
2549                 }
2550         }
2551         q->nr_hw_queues = i;
2552         mutex_unlock(&q->sysfs_lock);
2553         blk_mq_sysfs_register(q);
2554 }
2555
2556 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2557                                                   struct request_queue *q)
2558 {
2559         /* mark the queue as mq asap */
2560         q->mq_ops = set->ops;
2561
2562         q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2563                                              blk_mq_poll_stats_bkt,
2564                                              BLK_MQ_POLL_STATS_BKTS, q);
2565         if (!q->poll_cb)
2566                 goto err_exit;
2567
2568         q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2569         if (!q->queue_ctx)
2570                 goto err_exit;
2571
2572         /* init q->mq_kobj and sw queues' kobjects */
2573         blk_mq_sysfs_init(q);
2574
2575         q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2576                                                 GFP_KERNEL, set->numa_node);
2577         if (!q->queue_hw_ctx)
2578                 goto err_percpu;
2579
2580         q->mq_map = set->mq_map;
2581
2582         blk_mq_realloc_hw_ctxs(set, q);
2583         if (!q->nr_hw_queues)
2584                 goto err_hctxs;
2585
2586         INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2587         blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2588
2589         q->nr_queues = nr_cpu_ids;
2590
2591         q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2592
2593         if (!(set->flags & BLK_MQ_F_SG_MERGE))
2594                 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2595
2596         q->sg_reserved_size = INT_MAX;
2597
2598         INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2599         INIT_LIST_HEAD(&q->requeue_list);
2600         spin_lock_init(&q->requeue_lock);
2601
2602         blk_queue_make_request(q, blk_mq_make_request);
2603         if (q->mq_ops->poll)
2604                 q->poll_fn = blk_mq_poll;
2605
2606         /*
2607          * Do this after blk_queue_make_request() overrides it...
2608          */
2609         q->nr_requests = set->queue_depth;
2610
2611         /*
2612          * Default to classic polling
2613          */
2614         q->poll_nsec = -1;
2615
2616         if (set->ops->complete)
2617                 blk_queue_softirq_done(q, set->ops->complete);
2618
2619         blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2620         blk_mq_add_queue_tag_set(set, q);
2621         blk_mq_map_swqueue(q);
2622
2623         if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2624                 int ret;
2625
2626                 ret = blk_mq_sched_init(q);
2627                 if (ret)
2628                         return ERR_PTR(ret);
2629         }
2630
2631         return q;
2632
2633 err_hctxs:
2634         kfree(q->queue_hw_ctx);
2635 err_percpu:
2636         free_percpu(q->queue_ctx);
2637 err_exit:
2638         q->mq_ops = NULL;
2639         return ERR_PTR(-ENOMEM);
2640 }
2641 EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2642
2643 void blk_mq_free_queue(struct request_queue *q)
2644 {
2645         struct blk_mq_tag_set   *set = q->tag_set;
2646
2647         blk_mq_del_queue_tag_set(q);
2648         blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2649 }
2650
2651 /* Basically redo blk_mq_init_queue with queue frozen */
2652 static void blk_mq_queue_reinit(struct request_queue *q)
2653 {
2654         WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2655
2656         blk_mq_debugfs_unregister_hctxs(q);
2657         blk_mq_sysfs_unregister(q);
2658
2659         /*
2660          * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
2661          * we should change hctx numa_node according to the new topology (this
2662          * involves freeing and re-allocating memory, worth doing?)
2663          */
2664         blk_mq_map_swqueue(q);
2665
2666         blk_mq_sysfs_register(q);
2667         blk_mq_debugfs_register_hctxs(q);
2668 }
2669
2670 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2671 {
2672         int i;
2673
2674         for (i = 0; i < set->nr_hw_queues; i++)
2675                 if (!__blk_mq_alloc_rq_map(set, i))
2676                         goto out_unwind;
2677
2678         return 0;
2679
2680 out_unwind:
2681         while (--i >= 0)
2682                 blk_mq_free_rq_map(set->tags[i]);
2683
2684         return -ENOMEM;
2685 }
2686
2687 /*
2688  * Allocate the request maps associated with this tag_set. Note that this
2689  * may reduce the depth asked for, if memory is tight. set->queue_depth
2690  * will be updated to reflect the allocated depth.
2691  */
2692 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
2693 {
2694         unsigned int depth;
2695         int err;
2696
2697         depth = set->queue_depth;
2698         do {
2699                 err = __blk_mq_alloc_rq_maps(set);
2700                 if (!err)
2701                         break;
2702
2703                 set->queue_depth >>= 1;
2704                 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2705                         err = -ENOMEM;
2706                         break;
2707                 }
2708         } while (set->queue_depth);
2709
2710         if (!set->queue_depth || err) {
2711                 pr_err("blk-mq: failed to allocate request map\n");
2712                 return -ENOMEM;
2713         }
2714
2715         if (depth != set->queue_depth)
2716                 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2717                                                 depth, set->queue_depth);
2718
2719         return 0;
2720 }
2721
2722 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
2723 {
2724         if (set->ops->map_queues) {
2725                 int cpu;
2726                 /*
2727                  * transport .map_queues is usually done in the following
2728                  * way:
2729                  *
2730                  * for (queue = 0; queue < set->nr_hw_queues; queue++) {
2731                  *      mask = get_cpu_mask(queue)
2732                  *      for_each_cpu(cpu, mask)
2733                  *              set->mq_map[cpu] = queue;
2734                  * }
2735                  *
2736                  * When we need to remap, the table has to be cleared for
2737                  * killing stale mapping since one CPU may not be mapped
2738                  * to any hw queue.
2739                  */
2740                 for_each_possible_cpu(cpu)
2741                         set->mq_map[cpu] = 0;
2742
2743                 return set->ops->map_queues(set);
2744         } else
2745                 return blk_mq_map_queues(set);
2746 }
2747
2748 /*
2749  * Alloc a tag set to be associated with one or more request queues.
2750  * May fail with EINVAL for various error conditions. May adjust the
2751  * requested depth down, if if it too large. In that case, the set
2752  * value will be stored in set->queue_depth.
2753  */
2754 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2755 {
2756         int ret;
2757
2758         BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
2759
2760         if (!set->nr_hw_queues)
2761                 return -EINVAL;
2762         if (!set->queue_depth)
2763                 return -EINVAL;
2764         if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2765                 return -EINVAL;
2766
2767         if (!set->ops->queue_rq)
2768                 return -EINVAL;
2769
2770         if (!set->ops->get_budget ^ !set->ops->put_budget)
2771                 return -EINVAL;
2772
2773         if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2774                 pr_info("blk-mq: reduced tag depth to %u\n",
2775                         BLK_MQ_MAX_DEPTH);
2776                 set->queue_depth = BLK_MQ_MAX_DEPTH;
2777         }
2778
2779         /*
2780          * If a crashdump is active, then we are potentially in a very
2781          * memory constrained environment. Limit us to 1 queue and
2782          * 64 tags to prevent using too much memory.
2783          */
2784         if (is_kdump_kernel()) {
2785                 set->nr_hw_queues = 1;
2786                 set->queue_depth = min(64U, set->queue_depth);
2787         }
2788         /*
2789          * There is no use for more h/w queues than cpus.
2790          */
2791         if (set->nr_hw_queues > nr_cpu_ids)
2792                 set->nr_hw_queues = nr_cpu_ids;
2793
2794         set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
2795                                  GFP_KERNEL, set->numa_node);
2796         if (!set->tags)
2797                 return -ENOMEM;
2798
2799         ret = -ENOMEM;
2800         set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
2801                         GFP_KERNEL, set->numa_node);
2802         if (!set->mq_map)
2803                 goto out_free_tags;
2804
2805         ret = blk_mq_update_queue_map(set);
2806         if (ret)
2807                 goto out_free_mq_map;
2808
2809         ret = blk_mq_alloc_rq_maps(set);
2810         if (ret)
2811                 goto out_free_mq_map;
2812
2813         mutex_init(&set->tag_list_lock);
2814         INIT_LIST_HEAD(&set->tag_list);
2815
2816         return 0;
2817
2818 out_free_mq_map:
2819         kfree(set->mq_map);
2820         set->mq_map = NULL;
2821 out_free_tags:
2822         kfree(set->tags);
2823         set->tags = NULL;
2824         return ret;
2825 }
2826 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
2827
2828 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2829 {
2830         int i;
2831
2832         for (i = 0; i < nr_cpu_ids; i++)
2833                 blk_mq_free_map_and_requests(set, i);
2834
2835         kfree(set->mq_map);
2836         set->mq_map = NULL;
2837
2838         kfree(set->tags);
2839         set->tags = NULL;
2840 }
2841 EXPORT_SYMBOL(blk_mq_free_tag_set);
2842
2843 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2844 {
2845         struct blk_mq_tag_set *set = q->tag_set;
2846         struct blk_mq_hw_ctx *hctx;
2847         int i, ret;
2848
2849         if (!set)
2850                 return -EINVAL;
2851
2852         blk_mq_freeze_queue(q);
2853         blk_mq_quiesce_queue(q);
2854
2855         ret = 0;
2856         queue_for_each_hw_ctx(q, hctx, i) {
2857                 if (!hctx->tags)
2858                         continue;
2859                 /*
2860                  * If we're using an MQ scheduler, just update the scheduler
2861                  * queue depth. This is similar to what the old code would do.
2862                  */
2863                 if (!hctx->sched_tags) {
2864                         ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
2865                                                         false);
2866                 } else {
2867                         ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
2868                                                         nr, true);
2869                 }
2870                 if (ret)
2871                         break;
2872         }
2873
2874         if (!ret)
2875                 q->nr_requests = nr;
2876
2877         blk_mq_unquiesce_queue(q);
2878         blk_mq_unfreeze_queue(q);
2879
2880         return ret;
2881 }
2882
2883 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
2884                                                         int nr_hw_queues)
2885 {
2886         struct request_queue *q;
2887
2888         lockdep_assert_held(&set->tag_list_lock);
2889
2890         if (nr_hw_queues > nr_cpu_ids)
2891                 nr_hw_queues = nr_cpu_ids;
2892         if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
2893                 return;
2894
2895         list_for_each_entry(q, &set->tag_list, tag_set_list)
2896                 blk_mq_freeze_queue(q);
2897
2898         set->nr_hw_queues = nr_hw_queues;
2899         blk_mq_update_queue_map(set);
2900         list_for_each_entry(q, &set->tag_list, tag_set_list) {
2901                 blk_mq_realloc_hw_ctxs(set, q);
2902                 blk_mq_queue_reinit(q);
2903         }
2904
2905         list_for_each_entry(q, &set->tag_list, tag_set_list)
2906                 blk_mq_unfreeze_queue(q);
2907 }
2908
2909 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
2910 {
2911         mutex_lock(&set->tag_list_lock);
2912         __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
2913         mutex_unlock(&set->tag_list_lock);
2914 }
2915 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
2916
2917 /* Enable polling stats and return whether they were already enabled. */
2918 static bool blk_poll_stats_enable(struct request_queue *q)
2919 {
2920         if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2921             test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
2922                 return true;
2923         blk_stat_add_callback(q, q->poll_cb);
2924         return false;
2925 }
2926
2927 static void blk_mq_poll_stats_start(struct request_queue *q)
2928 {
2929         /*
2930          * We don't arm the callback if polling stats are not enabled or the
2931          * callback is already active.
2932          */
2933         if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
2934             blk_stat_is_active(q->poll_cb))
2935                 return;
2936
2937         blk_stat_activate_msecs(q->poll_cb, 100);
2938 }
2939
2940 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
2941 {
2942         struct request_queue *q = cb->data;
2943         int bucket;
2944
2945         for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
2946                 if (cb->stat[bucket].nr_samples)
2947                         q->poll_stat[bucket] = cb->stat[bucket];
2948         }
2949 }
2950
2951 static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
2952                                        struct blk_mq_hw_ctx *hctx,
2953                                        struct request *rq)
2954 {
2955         unsigned long ret = 0;
2956         int bucket;
2957
2958         /*
2959          * If stats collection isn't on, don't sleep but turn it on for
2960          * future users
2961          */
2962         if (!blk_poll_stats_enable(q))
2963                 return 0;
2964
2965         /*
2966          * As an optimistic guess, use half of the mean service time
2967          * for this type of request. We can (and should) make this smarter.
2968          * For instance, if the completion latencies are tight, we can
2969          * get closer than just half the mean. This is especially
2970          * important on devices where the completion latencies are longer
2971          * than ~10 usec. We do use the stats for the relevant IO size
2972          * if available which does lead to better estimates.
2973          */
2974         bucket = blk_mq_poll_stats_bkt(rq);
2975         if (bucket < 0)
2976                 return ret;
2977
2978         if (q->poll_stat[bucket].nr_samples)
2979                 ret = (q->poll_stat[bucket].mean + 1) / 2;
2980
2981         return ret;
2982 }
2983
2984 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
2985                                      struct blk_mq_hw_ctx *hctx,
2986                                      struct request *rq)
2987 {
2988         struct hrtimer_sleeper hs;
2989         enum hrtimer_mode mode;
2990         unsigned int nsecs;
2991         ktime_t kt;
2992
2993         if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
2994                 return false;
2995
2996         /*
2997          * poll_nsec can be:
2998          *
2999          * -1:  don't ever hybrid sleep
3000          *  0:  use half of prev avg
3001          * >0:  use this specific value
3002          */
3003         if (q->poll_nsec == -1)
3004                 return false;
3005         else if (q->poll_nsec > 0)
3006                 nsecs = q->poll_nsec;
3007         else
3008                 nsecs = blk_mq_poll_nsecs(q, hctx, rq);
3009
3010         if (!nsecs)
3011                 return false;
3012
3013         set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
3014
3015         /*
3016          * This will be replaced with the stats tracking code, using
3017          * 'avg_completion_time / 2' as the pre-sleep target.
3018          */
3019         kt = nsecs;
3020
3021         mode = HRTIMER_MODE_REL;
3022         hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
3023         hrtimer_set_expires(&hs.timer, kt);
3024
3025         hrtimer_init_sleeper(&hs, current);
3026         do {
3027                 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
3028                         break;
3029                 set_current_state(TASK_UNINTERRUPTIBLE);
3030                 hrtimer_start_expires(&hs.timer, mode);
3031                 if (hs.task)
3032                         io_schedule();
3033                 hrtimer_cancel(&hs.timer);
3034                 mode = HRTIMER_MODE_ABS;
3035         } while (hs.task && !signal_pending(current));
3036
3037         __set_current_state(TASK_RUNNING);
3038         destroy_hrtimer_on_stack(&hs.timer);
3039         return true;
3040 }
3041
3042 static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
3043 {
3044         struct request_queue *q = hctx->queue;
3045         long state;
3046
3047         /*
3048          * If we sleep, have the caller restart the poll loop to reset
3049          * the state. Like for the other success return cases, the
3050          * caller is responsible for checking if the IO completed. If
3051          * the IO isn't complete, we'll get called again and will go
3052          * straight to the busy poll loop.
3053          */
3054         if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
3055                 return true;
3056
3057         hctx->poll_considered++;
3058
3059         state = current->state;
3060         while (!need_resched()) {
3061                 int ret;
3062
3063                 hctx->poll_invoked++;
3064
3065                 ret = q->mq_ops->poll(hctx, rq->tag);
3066                 if (ret > 0) {
3067                         hctx->poll_success++;
3068                         set_current_state(TASK_RUNNING);
3069                         return true;
3070                 }
3071
3072                 if (signal_pending_state(state, current))
3073                         set_current_state(TASK_RUNNING);
3074
3075                 if (current->state == TASK_RUNNING)
3076                         return true;
3077                 if (ret < 0)
3078                         break;
3079                 cpu_relax();
3080         }
3081
3082         return false;
3083 }
3084
3085 static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
3086 {
3087         struct blk_mq_hw_ctx *hctx;
3088         struct request *rq;
3089
3090         if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
3091                 return false;
3092
3093         hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
3094         if (!blk_qc_t_is_internal(cookie))
3095                 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
3096         else {
3097                 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
3098                 /*
3099                  * With scheduling, if the request has completed, we'll
3100                  * get a NULL return here, as we clear the sched tag when
3101                  * that happens. The request still remains valid, like always,
3102                  * so we should be safe with just the NULL check.
3103                  */
3104                 if (!rq)
3105                         return false;
3106         }
3107
3108         return __blk_mq_poll(hctx, rq);
3109 }
3110
3111 static int __init blk_mq_init(void)
3112 {
3113         cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
3114                                 blk_mq_hctx_notify_dead);
3115         return 0;
3116 }
3117 subsys_initcall(blk_mq_init);