Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a95876f2c9c9ccf7cc481f159c8cc..3ba37b9e15e9ae7d7921de1ac9ead5cee707c21a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,6 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                     struct blk_mq_ctx *ctx)
 {
-       if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-               sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+       const int bit = ctx->index_hw[hctx->type];
+
+       if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+               sbitmap_set_bit(&hctx->ctx_map, bit);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
                                      struct blk_mq_ctx *ctx)
 {
-       sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+       const int bit = ctx->index_hw[hctx->type];
+
+       sbitmap_clear_bit(&hctx->ctx_map, bit);
 }
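
As an aside on the hunk above: the pending bit for a software queue is now looked up via ctx->index_hw[hctx->type], and the existing test-before-set stays, so re-marking an already busy queue does not rewrite the shared ctx_map word. A minimal userspace sketch of that test-before-set pattern, with a plain unsigned long standing in for the sbitmap and all names invented:

#include <stdio.h>

struct fake_hctx {
        unsigned long ctx_map;              /* one bit per software queue */
};

static void mark_pending(struct fake_hctx *hctx, int bit)
{
        unsigned long mask = 1UL << bit;

        /* Only write when the bit is not already set, so re-marking a
         * busy queue does not dirty the shared word again. */
        if (!(hctx->ctx_map & mask))
                hctx->ctx_map |= mask;
}

int main(void)
{
        struct fake_hctx hctx = { 0 };

        mark_pending(&hctx, 3);
        mark_pending(&hctx, 3);             /* second call is a no-op */
        printf("ctx_map = %#lx\n", hctx.ctx_map);
        return 0;
}
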
 
 struct mq_inflight {
@@ -90,33 +93,33 @@ struct mq_inflight {
        unsigned int *inflight;
 };
 
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
                                  struct request *rq, void *priv,
                                  bool reserved)
 {
        struct mq_inflight *mi = priv;
 
        /*
-        * index[0] counts the specific partition that was asked for. index[1]
-        * counts the ones that are active on the whole device, so increment
-        * that if mi->part is indeed a partition, and not a whole device.
+        * index[0] counts the specific partition that was asked for.
         */
        if (rq->part == mi->part)
                mi->inflight[0]++;
-       if (mi->part->partno)
-               mi->inflight[1]++;
+
+       return true;
 }
 
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-                     unsigned int inflight[2])
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
 {
+       unsigned inflight[2];
        struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
        inflight[0] = inflight[1] = 0;
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+       return inflight[0];
 }
 
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
                                     struct request *rq, void *priv,
                                     bool reserved)
 {
@@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
 
        if (rq->part == mi->part)
                mi->inflight[rq_data_dir(rq)]++;
+
+       return true;
 }
 
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q)
        freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
        if (freeze_depth == 1) {
                percpu_ref_kill(&q->q_usage_counter);
-               if (q->mq_ops)
+               if (queue_is_mq(q))
                        blk_mq_run_hw_queues(q, false);
        }
 }
@@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q)
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
-       if (!q->mq_ops)
-               blk_drain_queue(q);
        blk_mq_freeze_queue_wait(q);
 }
 
@@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
+/*
+ * Only need start/end time stamping if we have stats enabled, or using
+ * an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+       return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+}
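
blk_mq_need_time_stamp() above exists to skip the ktime_get_ns() call when neither I/O accounting nor an I/O scheduler will read the timestamps. A rough userspace sketch of the same gate, using clock_gettime() as a stand-in for ktime_get_ns() and invented flag names:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define FAKE_IO_STAT   (1u << 0)            /* invented flag names */
#define FAKE_HAS_SCHED (1u << 1)

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static uint64_t maybe_stamp(unsigned int flags)
{
        /* Skip the clock read entirely when nobody will consume it. */
        if (flags & (FAKE_IO_STAT | FAKE_HAS_SCHED))
                return now_ns();
        return 0;
}

int main(void)
{
        printf("no consumers: %llu\n", (unsigned long long)maybe_stamp(0));
        printf("stats on:     %llu\n",
               (unsigned long long)maybe_stamp(FAKE_IO_STAT));
        return 0;
}
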
+
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                unsigned int tag, unsigned int op)
 {
@@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
+       rq->mq_hctx = data->hctx;
        rq->rq_flags = rq_flags;
-       rq->cpu = -1;
        rq->cmd_flags = op;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
@@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
-       rq->start_time_ns = ktime_get_ns();
+       if (blk_mq_need_time_stamp(rq))
+               rq->start_time_ns = ktime_get_ns();
+       else
+               rq->start_time_ns = 0;
        rq->io_start_time_ns = 0;
        rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        rq->special = NULL;
        /* tag was already set */
        rq->extra_len = 0;
-       rq->__deadline = 0;
+       WRITE_ONCE(rq->deadline, 0);
 
-       INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;
 
        rq->end_io = NULL;
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
 
-#ifdef CONFIG_BLK_CGROUP
-       rq->rl = NULL;
-#endif
-
        data->ctx->rq_dispatched[op_is_sync(op)]++;
        refcount_set(&rq->ref, 1);
        return rq;
 }
 
 static struct request *blk_mq_get_request(struct request_queue *q,
-               struct bio *bio, unsigned int op,
-               struct blk_mq_alloc_data *data)
+                                         struct bio *bio,
+                                         struct blk_mq_alloc_data *data)
 {
        struct elevator_queue *e = q->elevator;
        struct request *rq;
@@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                put_ctx_on_error = true;
        }
        if (likely(!data->hctx))
-               data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
-       if (op & REQ_NOWAIT)
+               data->hctx = blk_mq_map_queue(q, data->cmd_flags,
+                                               data->ctx->cpu);
+       if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;
 
        if (e) {
@@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                 * dispatch list. Don't include reserved tags in the
                 * limiting, as it isn't useful.
                 */
-               if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+               if (!op_is_flush(data->cmd_flags) &&
+                   e->type->ops.limit_depth &&
                    !(data->flags & BLK_MQ_REQ_RESERVED))
-                       e->type->ops.mq.limit_depth(op, data);
+                       e->type->ops.limit_depth(data->cmd_flags, data);
        } else {
                blk_mq_tag_busy(data->hctx);
        }
@@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q,
                return NULL;
        }
 
-       rq = blk_mq_rq_ctx_init(data, tag, op);
-       if (!op_is_flush(op)) {
+       rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
+       if (!op_is_flush(data->cmd_flags)) {
                rq->elv.icq = NULL;
-               if (e && e->type->ops.mq.prepare_request) {
-                       if (e->type->icq_cache && rq_ioc(bio))
-                               blk_mq_sched_assign_ioc(rq, bio);
+               if (e && e->type->ops.prepare_request) {
+                       if (e->type->icq_cache)
+                               blk_mq_sched_assign_ioc(rq);
 
-                       e->type->ops.mq.prepare_request(rq, bio);
+                       e->type->ops.prepare_request(rq, bio);
                        rq->rq_flags |= RQF_ELVPRIV;
                }
        }
@@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
                blk_mq_req_flags_t flags)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags };
+       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
        struct request *rq;
        int ret;
 
@@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
        if (ret)
                return ERR_PTR(ret);
 
-       rq = blk_mq_get_request(q, NULL, op, &alloc_data);
+       rq = blk_mq_get_request(q, NULL, &alloc_data);
        blk_queue_exit(q);
 
        if (!rq)
@@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
-       struct blk_mq_alloc_data alloc_data = { .flags = flags };
+       struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
        struct request *rq;
        unsigned int cpu;
        int ret;
@@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
        alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-       rq = blk_mq_get_request(q, NULL, op, &alloc_data);
+       rq = blk_mq_get_request(q, NULL, &alloc_data);
        blk_queue_exit(q);
 
        if (!rq)
@@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;
 
        blk_pm_mark_last_busy(rq);
+       rq->mq_hctx = NULL;
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
@@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq)
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        if (rq->rq_flags & RQF_ELVPRIV) {
-               if (e && e->type->ops.mq.finish_request)
-                       e->type->ops.mq.finish_request(rq);
+               if (e && e->type->ops.finish_request)
+                       e->type->ops.finish_request(rq);
                if (rq->elv.icq) {
                        put_io_context(rq->elv.icq->ioc);
                        rq->elv.icq = NULL;
@@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq)
 
        rq_qos_done(q, rq);
 
-       if (blk_rq_rl(rq))
-               blk_put_rl(blk_rq_rl(rq));
-
        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
@@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-       u64 now = ktime_get_ns();
+       u64 now = 0;
+
+       if (blk_mq_need_time_stamp(rq))
+               now = ktime_get_ns();
 
        if (rq->rq_flags & RQF_STATS) {
                blk_mq_poll_stats_start(rq->q);
@@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request);
 static void __blk_mq_complete_request_remote(void *data)
 {
        struct request *rq = data;
+       struct request_queue *q = rq->q;
 
-       rq->q->softirq_done_fn(rq);
+       q->mq_ops->complete(rq);
 }
 
 static void __blk_mq_complete_request(struct request *rq)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct request_queue *q = rq->q;
        bool shared = false;
        int cpu;
 
-       if (!blk_mq_mark_complete(rq))
-               return;
-
+       WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
        /*
         * Most of single queue controllers, there is only one irq vector
         * for handling IO completion, and the only irq's affinity is set
@@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq)
         * So complete IO reqeust in softirq context in case of single queue
         * for not degrading IO performance by irqsoff latency.
         */
-       if (rq->q->nr_hw_queues == 1) {
+       if (q->nr_hw_queues == 1) {
                __blk_complete_request(rq);
                return;
        }
 
-       if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
-               rq->q->softirq_done_fn(rq);
+       /*
+        * For a polled request, always complete locally; it's pointless
+        * to redirect the completion.
+        */
+       if ((rq->cmd_flags & REQ_HIPRI) ||
+           !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
+               q->mq_ops->complete(rq);
                return;
        }
 
        cpu = get_cpu();
-       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
+       if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
                shared = cpus_share_cache(cpu, ctx->cpu);
 
        if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
@@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq)
                rq->csd.flags = 0;
                smp_call_function_single_async(ctx->cpu, &rq->csd);
        } else {
-               rq->q->softirq_done_fn(rq);
+               q->mq_ops->complete(rq);
        }
        put_cpu();
 }
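
To summarize the completion-path logic above: a polled (REQ_HIPRI) request, or a queue without QUEUE_FLAG_SAME_COMP, completes on the current CPU; otherwise the completion is redirected to the submitting CPU by IPI unless that CPU shares a cache with the current one. A simplified userspace sketch of that decision (QUEUE_FLAG_SAME_FORCE is ignored here, and the IPI is only modeled as a return value):

#include <stdbool.h>
#include <stdio.h>

enum target { COMPLETE_LOCALLY, COMPLETE_ON_SUBMIT_CPU };

struct fake_rq {
        bool polled;            /* REQ_HIPRI in the patch           */
        int submit_cpu;         /* CPU the request was submitted on */
};

static enum target completion_target(const struct fake_rq *rq, bool same_comp,
                                     bool shares_cache, int cur_cpu)
{
        if (rq->polled || !same_comp)
                return COMPLETE_LOCALLY;        /* pointless to redirect */
        if (cur_cpu == rq->submit_cpu || shares_cache)
                return COMPLETE_LOCALLY;
        return COMPLETE_ON_SUBMIT_CPU;          /* kernel: IPI via rq->csd */
}

int main(void)
{
        struct fake_rq rq = { .polled = false, .submit_cpu = 2 };

        printf("from cpu 0: %d\n", completion_target(&rq, true, false, 0));
        printf("from cpu 2: %d\n", completion_target(&rq, true, false, 2));
        return 0;
}
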
@@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
  *     Ends all I/O on a request. It does not handle partial completions.
  *     The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq)
+bool blk_mq_complete_request(struct request *rq)
 {
        if (unlikely(blk_should_fake_timeout(rq->q)))
-               return;
+               return false;
        __blk_mq_complete_request(rq);
+       return true;
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);
 
-       BUG_ON(blk_queued_rq(rq));
+       BUG_ON(!list_empty(&rq->queuelist));
        blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                              void *priv, bool reserved)
+{
+       /*
+        * If we find a request that is inflight and the queue matches,
+        * we know the queue is busy. Return false to stop the iteration.
+        */
+       if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+               bool *busy = priv;
+
+               *busy = true;
+               return false;
+       }
+
+       return true;
+}
+
+bool blk_mq_queue_inflight(struct request_queue *q)
+{
+       bool busy = false;
+
+       blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
+       return busy;
+}
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
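
The busy-tag iterator callbacks in this series return bool: true keeps the walk going, false stops it early, which is how blk_mq_rq_inflight() bails out as soon as a single in-flight request is found. A small self-contained sketch of that callback convention, with invented types:

#include <stdbool.h>
#include <stdio.h>

typedef bool (*visit_fn)(int item, void *priv);

static void for_each_item(const int *items, int n, visit_fn fn, void *priv)
{
        for (int i = 0; i < n; i++)
                if (!fn(items[i], priv))
                        break;                  /* callback asked to stop */
}

static bool find_first_negative(int item, void *priv)
{
        if (item < 0) {
                *(int *)priv = item;
                return false;                   /* found one, stop early */
        }
        return true;                            /* keep iterating */
}

int main(void)
{
        int items[] = { 3, 7, -2, 9 };
        int found = 0;

        for_each_item(items, 4, find_first_negative, &found);
        printf("first negative: %d\n", found);
        return 0;
}
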
+
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
        req->rq_flags |= RQF_TIMED_OUT;
@@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
        if (rq->rq_flags & RQF_TIMED_OUT)
                return false;
 
-       deadline = blk_rq_deadline(rq);
+       deadline = READ_ONCE(rq->deadline);
        if (time_after_eq(jiffies, deadline))
                return true;
 
@@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
        return false;
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                struct request *rq, void *priv, bool reserved)
 {
        unsigned long *next = priv;
@@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
         * so we're not unnecessarilly synchronizing across CPUs.
         */
        if (!blk_mq_req_expired(rq, next))
-               return;
+               return true;
 
        /*
         * We have reason to believe the request may be expired. Take a
@@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
         * timeout handler to posting a natural completion.
         */
        if (!refcount_inc_not_zero(&rq->ref))
-               return;
+               return true;
 
        /*
         * The request is now locked and cannot be reallocated underneath the
@@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
                blk_mq_rq_timed_out(rq, reserved);
        if (refcount_dec_and_test(&rq->ref))
                __blk_mq_free_request(rq);
+
+       return true;
 }
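
The timeout path above only proceeds if it can take a reference while the count is still non-zero, so it can never race with the final free of the request. A userspace sketch of that "increment unless zero" idiom using C11 atomics (the kernel uses refcount_t; names here are invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Bump the count only if it is still non-zero, i.e. the object has not
 * already entered its final free path. */
static bool ref_inc_not_zero(atomic_int *ref)
{
        int old = atomic_load(ref);

        while (old != 0) {
                if (atomic_compare_exchange_weak(ref, &old, old + 1))
                        return true;
        }
        return false;
}

static bool ref_dec_and_test(atomic_int *ref)
{
        return atomic_fetch_sub(ref, 1) == 1;
}

int main(void)
{
        atomic_int ref = 1;

        if (ref_inc_not_zero(&ref))
                printf("got a reference, count is now %d\n", atomic_load(&ref));
        if (ref_dec_and_test(&ref))
                printf("dropped the last reference\n");
        else
                printf("still %d reference(s) held\n", atomic_load(&ref));
        return 0;
}
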
 
 static void blk_mq_timeout_work(struct work_struct *work)
@@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
        struct flush_busy_ctx_data *flush_data = data;
        struct blk_mq_hw_ctx *hctx = flush_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+       enum hctx_type type = hctx->type;
 
        spin_lock(&ctx->lock);
-       list_splice_tail_init(&ctx->rq_list, flush_data->list);
+       list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&ctx->lock);
        return true;
@@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
        struct dispatch_rq_data *dispatch_data = data;
        struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+       enum hctx_type type = hctx->type;
 
        spin_lock(&ctx->lock);
-       if (!list_empty(&ctx->rq_list)) {
-               dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+       if (!list_empty(&ctx->rq_lists[type])) {
+               dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
                list_del_init(&dispatch_data->rq->queuelist);
-               if (list_empty(&ctx->rq_list))
+               if (list_empty(&ctx->rq_lists[type]))
                        sbitmap_clear_bit(sb, bitnr);
        }
        spin_unlock(&ctx->lock);
@@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start)
 {
-       unsigned off = start ? start->index_hw : 0;
+       unsigned off = start ? start->index_hw[hctx->type] : 0;
        struct dispatch_rq_data data = {
                .hctx = hctx,
                .rq   = NULL,
@@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq)
 {
        struct blk_mq_alloc_data data = {
                .q = rq->q,
-               .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+               .hctx = rq->mq_hctx,
                .flags = BLK_MQ_REQ_NOWAIT,
+               .cmd_flags = rq->cmd_flags,
        };
        bool shared;
 
@@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 
                rq = list_first_entry(list, struct request, queuelist);
 
-               hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+               hctx = rq->mq_hctx;
                if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
                        break;
 
@@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
        if (!list_empty(list)) {
                bool needs_restart;
 
+               /*
+                * If we didn't flush the entire list, we could have told
+                * the driver there was more coming, but that turned out to
+                * be a lie.
+                */
+               if (q->mq_ops->commit_rqs)
+                       q->mq_ops->commit_rqs(hctx);
+
                spin_lock(&hctx->lock);
                list_splice_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);
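
The ->commit_rqs() call above pairs with the bd->last hint: while a list is being dispatched the driver may be told more requests are coming and defer its doorbell, and if dispatch stops early, ->commit_rqs() lets it submit whatever was already queued. A userspace model of that contract, with every name invented:

#include <stdbool.h>
#include <stdio.h>

static int queued;      /* requests handed to the "driver", not yet submitted */

static void ring_doorbell(void)
{
        if (queued) {
                printf("submitting %d request(s) to hardware\n", queued);
                queued = 0;
        }
}

static void fake_queue_rq(int tag, bool last)
{
        printf("queue_rq: tag %d (last=%d)\n", tag, last);
        queued++;
        if (last)
                ring_doorbell();        /* no more coming, submit now */
}

static void fake_commit_rqs(void)
{
        ring_doorbell();                /* flush a deferred batch */
}

int main(void)
{
        /* Dispatch stops after two requests although more were promised. */
        fake_queue_rq(1, false);
        fake_queue_rq(2, false);
        fake_commit_rqs();
        return 0;
}
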
@@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
                                            bool at_head)
 {
        struct blk_mq_ctx *ctx = rq->mq_ctx;
+       enum hctx_type type = hctx->type;
 
        lockdep_assert_held(&ctx->lock);
 
        trace_block_rq_insert(hctx->queue, rq);
 
        if (at_head)
-               list_add(&rq->queuelist, &ctx->rq_list);
+               list_add(&rq->queuelist, &ctx->rq_lists[type]);
        else
-               list_add_tail(&rq->queuelist, &ctx->rq_list);
+               list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
 }
 
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
  */
 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
 {
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+       struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
        spin_lock(&hctx->lock);
        list_add_tail(&rq->queuelist, &hctx->dispatch);
@@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 {
        struct request *rq;
+       enum hctx_type type = hctx->type;
 
        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
        }
 
        spin_lock(&ctx->lock);
-       list_splice_tail_init(list, &ctx->rq_list);
+       list_splice_tail_init(list, &ctx->rq_lists[type]);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);
 }
 
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
        struct request *rqa = container_of(a, struct request, queuelist);
        struct request *rqb = container_of(b, struct request, queuelist);
 
-       return !(rqa->mq_ctx < rqb->mq_ctx ||
-                (rqa->mq_ctx == rqb->mq_ctx &&
-                 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
+       if (rqa->mq_ctx < rqb->mq_ctx)
+               return -1;
+       else if (rqa->mq_ctx > rqb->mq_ctx)
+               return 1;
+       else if (rqa->mq_hctx < rqb->mq_hctx)
+               return -1;
+       else if (rqa->mq_hctx > rqb->mq_hctx)
+               return 1;
+
+       return blk_rq_pos(rqa) > blk_rq_pos(rqb);
 }
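
plug_rq_cmp() now sorts plugged requests by software queue, then hardware queue, then sector, so the flush loop below can peel off one contiguous (ctx, hctx) batch at a time. A userspace sketch of the same three-level ordering, using qsort() in place of list_sort() and an invented struct:

#include <stdio.h>
#include <stdlib.h>

struct fake_rq {
        int ctx;        /* stands in for rq->mq_ctx   */
        int hctx;       /* stands in for rq->mq_hctx  */
        long pos;       /* stands in for blk_rq_pos() */
};

static int fake_rq_cmp(const void *a, const void *b)
{
        const struct fake_rq *ra = a, *rb = b;

        if (ra->ctx != rb->ctx)
                return ra->ctx < rb->ctx ? -1 : 1;
        if (ra->hctx != rb->hctx)
                return ra->hctx < rb->hctx ? -1 : 1;
        return (ra->pos > rb->pos) - (ra->pos < rb->pos);
}

int main(void)
{
        struct fake_rq rqs[] = {
                { 1, 0, 800 }, { 0, 1, 100 }, { 0, 0, 300 }, { 0, 0, 50 },
        };

        qsort(rqs, 4, sizeof(rqs[0]), fake_rq_cmp);
        for (int i = 0; i < 4; i++)
                printf("ctx=%d hctx=%d pos=%ld\n",
                       rqs[i].ctx, rqs[i].hctx, rqs[i].pos);
        return 0;
}
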
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
+       struct blk_mq_hw_ctx *this_hctx;
        struct blk_mq_ctx *this_ctx;
        struct request_queue *this_q;
        struct request *rq;
        LIST_HEAD(list);
-       LIST_HEAD(ctx_list);
+       LIST_HEAD(rq_list);
        unsigned int depth;
 
        list_splice_init(&plug->mq_list, &list);
+       plug->rq_count = 0;
 
-       list_sort(NULL, &list, plug_ctx_cmp);
+       if (plug->rq_count > 2 && plug->multiple_queues)
+               list_sort(NULL, &list, plug_rq_cmp);
 
        this_q = NULL;
+       this_hctx = NULL;
        this_ctx = NULL;
        depth = 0;
 
@@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
                rq = list_entry_rq(list.next);
                list_del_init(&rq->queuelist);
                BUG_ON(!rq->q);
-               if (rq->mq_ctx != this_ctx) {
-                       if (this_ctx) {
+               if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
+                       if (this_hctx) {
                                trace_block_unplug(this_q, depth, !from_schedule);
-                               blk_mq_sched_insert_requests(this_q, this_ctx,
-                                                               &ctx_list,
+                               blk_mq_sched_insert_requests(this_hctx, this_ctx,
+                                                               &rq_list,
                                                                from_schedule);
                        }
 
-                       this_ctx = rq->mq_ctx;
                        this_q = rq->q;
+                       this_ctx = rq->mq_ctx;
+                       this_hctx = rq->mq_hctx;
                        depth = 0;
                }
 
                depth++;
-               list_add_tail(&rq->queuelist, &ctx_list);
+               list_add_tail(&rq->queuelist, &rq_list);
        }
 
        /*
-        * If 'this_ctx' is set, we know we have entries to complete
-        * on 'ctx_list'. Do those.
+        * If 'this_hctx' is set, we know we have entries to complete
+        * on 'rq_list'. Do those.
         */
-       if (this_ctx) {
+       if (this_hctx) {
                trace_block_unplug(this_q, depth, !from_schedule);
-               blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+               blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
                                                from_schedule);
        }
 }
@@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
        blk_init_request_from_bio(rq, bio);
 
-       blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
-
        blk_account_io_start(rq, true);
 }
 
-static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
-       if (rq->tag != -1)
-               return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false);
-
-       return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
-}
-
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq,
-                                           blk_qc_t *cookie)
+                                           blk_qc_t *cookie, bool last)
 {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
                .rq = rq,
-               .last = true,
+               .last = last,
        };
        blk_qc_t new_cookie;
        blk_status_t ret;
@@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
        return ret;
 }
 
-static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                                                struct request *rq,
                                                blk_qc_t *cookie,
-                                               bool bypass_insert)
+                                               bool bypass, bool last)
 {
        struct request_queue *q = rq->q;
        bool run_queue = true;
+       blk_status_t ret = BLK_STS_RESOURCE;
+       int srcu_idx;
+       bool force = false;
 
+       hctx_lock(hctx, &srcu_idx);
        /*
-        * RCU or SRCU read lock is needed before checking quiesced flag.
+        * hctx_lock is needed before checking quiesced flag.
         *
-        * When queue is stopped or quiesced, ignore 'bypass_insert' from
-        * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
-        * and avoid driver to try to dispatch again.
+        * When the queue is stopped or quiesced, ignore 'bypass', insert
+        * the request, and return BLK_STS_OK to the caller so the driver
+        * does not try to dispatch it again.
         */
-       if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
+       if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) {
                run_queue = false;
-               bypass_insert = false;
-               goto insert;
+               bypass = false;
+               goto out_unlock;
        }
 
-       if (q->elevator && !bypass_insert)
-               goto insert;
+       if (unlikely(q->elevator && !bypass))
+               goto out_unlock;
 
        if (!blk_mq_get_dispatch_budget(hctx))
-               goto insert;
+               goto out_unlock;
 
        if (!blk_mq_get_driver_tag(rq)) {
                blk_mq_put_dispatch_budget(hctx);
-               goto insert;
+               goto out_unlock;
        }
 
-       return __blk_mq_issue_directly(hctx, rq, cookie);
-insert:
-       if (bypass_insert)
-               return BLK_STS_RESOURCE;
-
-       blk_mq_sched_insert_request(rq, false, run_queue, false);
-       return BLK_STS_OK;
-}
-
-static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-               struct request *rq, blk_qc_t *cookie)
-{
-       blk_status_t ret;
-       int srcu_idx;
-
-       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
-       hctx_lock(hctx, &srcu_idx);
-
-       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
-       if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-               blk_mq_sched_insert_request(rq, false, true, false);
-       else if (ret != BLK_STS_OK)
-               blk_mq_end_request(rq, ret);
-
-       hctx_unlock(hctx, srcu_idx);
-}
-
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
-{
-       blk_status_t ret;
-       int srcu_idx;
-       blk_qc_t unused_cookie;
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
-
-       hctx_lock(hctx, &srcu_idx);
-       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+       /*
+        * Always add a request that has been through
+        * .queue_rq() to the hardware dispatch list.
+        */
+       force = true;
+       ret = __blk_mq_issue_directly(hctx, rq, cookie, last);
+out_unlock:
        hctx_unlock(hctx, srcu_idx);
+       switch (ret) {
+       case BLK_STS_OK:
+               break;
+       case BLK_STS_DEV_RESOURCE:
+       case BLK_STS_RESOURCE:
+               if (force) {
+                       blk_mq_request_bypass_insert(rq, run_queue);
+                       /*
+                        * We have to return BLK_STS_OK for the DM
+                        * to avoid livelock. Otherwise, we return
+                        * the real result to indicate whether the
+                        * request is direct-issued successfully.
+                        */
+                       ret = bypass ? BLK_STS_OK : ret;
+               } else if (!bypass) {
+                       blk_mq_sched_insert_request(rq, false,
+                                                   run_queue, false);
+               }
+               break;
+       default:
+               if (!bypass)
+                       blk_mq_end_request(rq, ret);
+               break;
+       }
 
        return ret;
 }
@@ -1805,21 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list)
 {
+       blk_qc_t unused;
+       blk_status_t ret = BLK_STS_OK;
+
        while (!list_empty(list)) {
-               blk_status_t ret;
                struct request *rq = list_first_entry(list, struct request,
                                queuelist);
 
                list_del_init(&rq->queuelist);
-               ret = blk_mq_request_issue_directly(rq);
-               if (ret != BLK_STS_OK) {
-                       if (ret == BLK_STS_RESOURCE ||
-                                       ret == BLK_STS_DEV_RESOURCE) {
-                               list_add(&rq->queuelist, list);
-                               break;
-                       }
-                       blk_mq_end_request(rq, ret);
-               }
+               if (ret == BLK_STS_OK)
+                       ret = blk_mq_try_issue_directly(hctx, rq, &unused,
+                                                       false,
+                                                       list_empty(list));
+               else
+                       blk_mq_sched_insert_request(rq, false, true, false);
+       }
+
+       /*
+        * If we didn't flush the entire list, we could have told
+        * the driver there was more coming, but that turned out to
+        * be a lie.
+        */
+       if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs)
+               hctx->queue->mq_ops->commit_rqs(hctx);
+}
+
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+{
+       list_add_tail(&rq->queuelist, &plug->mq_list);
+       plug->rq_count++;
+       if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
+               struct request *tmp;
+
+               tmp = list_first_entry(&plug->mq_list, struct request,
+                                               queuelist);
+               if (tmp->q != rq->q)
+                       plug->multiple_queues = true;
        }
 }
 
@@ -1827,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
        const int is_sync = op_is_sync(bio->bi_opf);
        const int is_flush_fua = op_is_flush(bio->bi_opf);
-       struct blk_mq_alloc_data data = { .flags = 0 };
+       struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
        struct request *rq;
-       unsigned int request_count = 0;
        struct blk_plug *plug;
        struct request *same_queue_rq = NULL;
        blk_qc_t cookie;
@@ -1842,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                return BLK_QC_T_NONE;
 
        if (!is_flush_fua && !blk_queue_nomerges(q) &&
-           blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+           blk_attempt_plug_merge(q, bio, &same_queue_rq))
                return BLK_QC_T_NONE;
 
        if (blk_mq_sched_bio_merge(q, bio))
                return BLK_QC_T_NONE;
 
-       rq_qos_throttle(q, bio, NULL);
+       rq_qos_throttle(q, bio);
 
-       rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
+       rq = blk_mq_get_request(q, bio, &data);
        if (unlikely(!rq)) {
                rq_qos_cleanup(q, bio);
                if (bio->bi_opf & REQ_NOWAIT)
@@ -1872,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                /* bypass scheduler for flush rq */
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(data.hctx, true);
-       } else if (plug && q->nr_hw_queues == 1) {
+       } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+               /*
+                * Use plugging if we have a ->commit_rqs() hook as well, as
+                * we know the driver uses bd->last in a smart fashion.
+                */
+               unsigned int request_count = plug->rq_count;
                struct request *last = NULL;
 
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
 
-               /*
-                * @request_count may become stale because of schedule
-                * out, so check the list again.
-                */
-               if (list_empty(&plug->mq_list))
-                       request_count = 0;
-               else if (blk_queue_nomerges(q))
-                       request_count = blk_plug_queued_count(q);
-
                if (!request_count)
                        trace_block_plug(q);
                else
@@ -1898,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                        trace_block_plug(q);
                }
 
-               list_add_tail(&rq->queuelist, &plug->mq_list);
+               blk_add_rq_to_plug(plug, rq);
        } else if (plug && !blk_queue_nomerges(q)) {
                blk_mq_bio_to_request(rq, bio);
 
@@ -1911,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
                 */
                if (list_empty(&plug->mq_list))
                        same_queue_rq = NULL;
-               if (same_queue_rq)
+               if (same_queue_rq) {
                        list_del_init(&same_queue_rq->queuelist);
-               list_add_tail(&rq->queuelist, &plug->mq_list);
+                       plug->rq_count--;
+               }
+               blk_add_rq_to_plug(plug, rq);
 
                blk_mq_put_ctx(data.ctx);
 
                if (same_queue_rq) {
-                       data.hctx = blk_mq_map_queue(q,
-                                       same_queue_rq->mq_ctx->cpu);
+                       data.hctx = same_queue_rq->mq_hctx;
                        blk_mq_try_issue_directly(data.hctx, same_queue_rq,
-                                       &cookie);
+                                       &cookie, false, true);
                }
        } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
                        !data.hctx->dispatch_busy)) {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+               blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true);
        } else {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
@@ -1985,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
        struct blk_mq_tags *tags;
        int node;
 
-       node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
 
@@ -2041,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
        size_t rq_size, left;
        int node;
 
-       node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+       node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
 
@@ -2121,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
+       enum hctx_type type;
 
        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
+       type = hctx->type;
 
        spin_lock(&ctx->lock);
-       if (!list_empty(&ctx->rq_list)) {
-               list_splice_init(&ctx->rq_list, &tmp);
+       if (!list_empty(&ctx->rq_lists[type])) {
+               list_splice_init(&ctx->rq_lists[type], &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);
@@ -2258,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q,
 static void blk_mq_init_cpu_queues(struct request_queue *q,
                                   unsigned int nr_hw_queues)
 {
-       unsigned int i;
+       struct blk_mq_tag_set *set = q->tag_set;
+       unsigned int i, j;
 
        for_each_possible_cpu(i) {
                struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
                struct blk_mq_hw_ctx *hctx;
+               int k;
 
                __ctx->cpu = i;
                spin_lock_init(&__ctx->lock);
-               INIT_LIST_HEAD(&__ctx->rq_list);
+               for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
+                       INIT_LIST_HEAD(&__ctx->rq_lists[k]);
+
                __ctx->queue = q;
 
                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
-               hctx = blk_mq_map_queue(q, i);
-               if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
-                       hctx->numa_node = local_memory_node(cpu_to_node(i));
+               for (j = 0; j < set->nr_maps; j++) {
+                       hctx = blk_mq_map_queue_type(q, j, i);
+                       if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
+                               hctx->numa_node = local_memory_node(cpu_to_node(i));
+               }
        }
 }
 
@@ -2301,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
                                         unsigned int hctx_idx)
 {
-       if (set->tags[hctx_idx]) {
+       if (set->tags && set->tags[hctx_idx]) {
                blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
                blk_mq_free_rq_map(set->tags[hctx_idx]);
                set->tags[hctx_idx] = NULL;
@@ -2310,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 
 static void blk_mq_map_swqueue(struct request_queue *q)
 {
-       unsigned int i, hctx_idx;
+       unsigned int i, j, hctx_idx;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;
@@ -2332,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {
-               hctx_idx = q->mq_map[i];
+               hctx_idx = set->map[0].mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
                if (!set->tags[hctx_idx] &&
                    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2342,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q)
                         * case, remap the current ctx to hctx[0] which
                         * is guaranteed to always have tags allocated
                         */
-                       q->mq_map[i] = 0;
+                       set->map[0].mq_map[i] = 0;
                }
 
                ctx = per_cpu_ptr(q->queue_ctx, i);
-               hctx = blk_mq_map_queue(q, i);
+               for (j = 0; j < set->nr_maps; j++) {
+                       if (!set->map[j].nr_queues)
+                               continue;
+
+                       hctx = blk_mq_map_queue_type(q, j, i);
+
+                       /*
+                        * If the CPU is already set in the mask, then we've
+                        * mapped this one already. This can happen if
+                        * devices share queues across queue maps.
+                        */
+                       if (cpumask_test_cpu(i, hctx->cpumask))
+                               continue;
+
+                       cpumask_set_cpu(i, hctx->cpumask);
+                       hctx->type = j;
+                       ctx->index_hw[hctx->type] = hctx->nr_ctx;
+                       hctx->ctxs[hctx->nr_ctx++] = ctx;
 
-               cpumask_set_cpu(i, hctx->cpumask);
-               ctx->index_hw = hctx->nr_ctx;
-               hctx->ctxs[hctx->nr_ctx++] = ctx;
+                       /*
+                        * If the nr_ctx type overflows, we have exceeded the
+                        * amount of sw queues we can support.
+                        */
+                       BUG_ON(!hctx->nr_ctx);
+               }
        }
 
        mutex_unlock(&q->sysfs_lock);
@@ -2440,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
 {
-       q->tag_set = set;
-
        mutex_lock(&set->tag_list_lock);
 
        /*
@@ -2460,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
        mutex_unlock(&set->tag_list_lock);
 }
 
+/* All allocations will be freed in release handler of q->mq_kobj */
+static int blk_mq_alloc_ctxs(struct request_queue *q)
+{
+       struct blk_mq_ctxs *ctxs;
+       int cpu;
+
+       ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
+       if (!ctxs)
+               return -ENOMEM;
+
+       ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+       if (!ctxs->queue_ctx)
+               goto fail;
+
+       for_each_possible_cpu(cpu) {
+               struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
+               ctx->ctxs = ctxs;
+       }
+
+       q->mq_kobj = &ctxs->kobj;
+       q->queue_ctx = ctxs->queue_ctx;
+
+       return 0;
+ fail:
+       kfree(ctxs);
+       return -ENOMEM;
+}
+
 /*
  * It is the actual release handler for mq, but we do it from
  * request queue's release handler for avoiding use-after-free
@@ -2478,8 +2607,6 @@ void blk_mq_release(struct request_queue *q)
                kobject_put(&hctx->kobj);
        }
 
-       q->mq_map = NULL;
-
        kfree(q->queue_hw_ctx);
 
        /*
@@ -2487,15 +2614,13 @@ void blk_mq_release(struct request_queue *q)
         * both share lifetime with request queue.
         */
        blk_mq_sysfs_deinit(q);
-
-       free_percpu(q->queue_ctx);
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
        struct request_queue *uninit_q, *q;
 
-       uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
+       uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
        if (!uninit_q)
                return ERR_PTR(-ENOMEM);
 
@@ -2522,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
        memset(set, 0, sizeof(*set));
        set->ops = ops;
        set->nr_hw_queues = 1;
+       set->nr_maps = 1;
        set->queue_depth = queue_depth;
        set->numa_node = NUMA_NO_NODE;
        set->flags = set_flags;
@@ -2599,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                int node;
                struct blk_mq_hw_ctx *hctx;
 
-               node = blk_mq_hw_queue_to_node(q->mq_map, i);
+               node = blk_mq_hw_queue_to_node(&set->map[0], i);
                /*
                 * If the hw queue has been mapped to another numa node,
                 * we need to realloc the hctx. If allocation fails, fallback
@@ -2652,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
        mutex_unlock(&q->sysfs_lock);
 }
 
+/*
+ * Maximum number of hardware queues we support. For single sets, we'll never
+ * have more than the CPUs (software queues). For multiple sets, the tag_set
+ * user may have set ->nr_hw_queues larger.
+ */
+static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
+{
+       if (set->nr_maps == 1)
+               return nr_cpu_ids;
+
+       return max(set->nr_hw_queues, nr_cpu_ids);
+}
+
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                                                  struct request_queue *q)
 {
@@ -2664,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        if (!q->poll_cb)
                goto err_exit;
 
-       q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
-       if (!q->queue_ctx)
+       if (blk_mq_alloc_ctxs(q))
                goto err_exit;
 
        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);
 
-       q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
+       q->nr_queues = nr_hw_queues(set);
+       q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
                                                GFP_KERNEL, set->numa_node);
        if (!q->queue_hw_ctx)
-               goto err_percpu;
-
-       q->mq_map = set->mq_map;
+               goto err_sys_init;
 
        blk_mq_realloc_hw_ctxs(set, q);
        if (!q->nr_hw_queues)
@@ -2685,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
-       q->nr_queues = nr_cpu_ids;
+       q->tag_set = set;
 
        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+       if (set->nr_maps > HCTX_TYPE_POLL &&
+           set->map[HCTX_TYPE_POLL].nr_queues)
+               blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
        if (!(set->flags & BLK_MQ_F_SG_MERGE))
-               queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+               blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
 
        q->sg_reserved_size = INT_MAX;
 
@@ -2699,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
        spin_lock_init(&q->requeue_lock);
 
        blk_queue_make_request(q, blk_mq_make_request);
-       if (q->mq_ops->poll)
-               q->poll_fn = blk_mq_poll;
 
        /*
         * Do this after blk_queue_make_request() overrides it...
@@ -2712,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
         */
        q->poll_nsec = -1;
 
-       if (set->ops->complete)
-               blk_queue_softirq_done(q, set->ops->complete);
-
        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
        blk_mq_add_queue_tag_set(set, q);
        blk_mq_map_swqueue(q);
@@ -2731,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
        kfree(q->queue_hw_ctx);
-err_percpu:
-       free_percpu(q->queue_ctx);
+err_sys_init:
+       blk_mq_sysfs_deinit(q);
 err_exit:
        q->mq_ops = NULL;
        return ERR_PTR(-ENOMEM);
@@ -2801,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 {
-       if (set->ops->map_queues) {
+       if (set->ops->map_queues && !is_kdump_kernel()) {
+               int i;
+
                /*
                 * transport .map_queues is usually done in the following
                 * way:
@@ -2809,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
                 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
                 *      mask = get_cpu_mask(queue)
                 *      for_each_cpu(cpu, mask)
-                *              set->mq_map[cpu] = queue;
+                *              set->map[x].mq_map[cpu] = queue;
                 * }
                 *
                 * When we need to remap, the table has to be cleared for
                 * killing stale mapping since one CPU may not be mapped
                 * to any hw queue.
                 */
-               blk_mq_clear_mq_map(set);
+               for (i = 0; i < set->nr_maps; i++)
+                       blk_mq_clear_mq_map(&set->map[i]);
 
                return set->ops->map_queues(set);
-       } else
-               return blk_mq_map_queues(set);
+       } else {
+               BUG_ON(set->nr_maps > 1);
+               return blk_mq_map_queues(&set->map[0]);
+       }
 }
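
The pseudocode in the comment above fills one mq_map[] per queue map, translating each CPU to the hardware queue it submits on. A deliberately naive standalone sketch of that table being filled with a round-robin spread (the real blk_mq_map_queues() fallback is topology-aware; sizes and names here are invented):

#include <stdio.h>

#define NR_CPUS      8
#define NR_HW_QUEUES 3

int main(void)
{
        unsigned int mq_map[NR_CPUS];

        for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
                mq_map[cpu] = cpu % NR_HW_QUEUES;   /* queue this CPU submits to */

        for (unsigned int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %u -> hw queue %u\n", cpu, mq_map[cpu]);
        return 0;
}
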
 
 /*
@@ -2831,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-       int ret;
+       int i, ret;
 
        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
@@ -2854,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }
 
+       if (!set->nr_maps)
+               set->nr_maps = 1;
+       else if (set->nr_maps > HCTX_MAX_TYPES)
+               return -EINVAL;
+
        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to 1 queue and
@@ -2861,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
         */
        if (is_kdump_kernel()) {
                set->nr_hw_queues = 1;
+               set->nr_maps = 1;
                set->queue_depth = min(64U, set->queue_depth);
        }
        /*
-        * There is no use for more h/w queues than cpus.
+        * There is no use for more h/w queues than cpus if we just have
+        * a single map
         */
-       if (set->nr_hw_queues > nr_cpu_ids)
+       if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;
 
-       set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
+       set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
                                 GFP_KERNEL, set->numa_node);
        if (!set->tags)
                return -ENOMEM;
 
        ret = -ENOMEM;
-       set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
-                                  GFP_KERNEL, set->numa_node);
-       if (!set->mq_map)
-               goto out_free_tags;
+       for (i = 0; i < set->nr_maps; i++) {
+               set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
+                                                 sizeof(set->map[i].mq_map[0]),
+                                                 GFP_KERNEL, set->numa_node);
+               if (!set->map[i].mq_map)
+                       goto out_free_mq_map;
+               set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
+       }
 
        ret = blk_mq_update_queue_map(set);
        if (ret)
@@ -2894,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
        return 0;
 
 out_free_mq_map:
-       kfree(set->mq_map);
-       set->mq_map = NULL;
-out_free_tags:
+       for (i = 0; i < set->nr_maps; i++) {
+               kfree(set->map[i].mq_map);
+               set->map[i].mq_map = NULL;
+       }
        kfree(set->tags);
        set->tags = NULL;
        return ret;
@@ -2905,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set);
 
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
-       int i;
+       int i, j;
 
-       for (i = 0; i < nr_cpu_ids; i++)
+       for (i = 0; i < nr_hw_queues(set); i++)
                blk_mq_free_map_and_requests(set, i);
 
-       kfree(set->mq_map);
-       set->mq_map = NULL;
+       for (j = 0; j < set->nr_maps; j++) {
+               kfree(set->map[j].mq_map);
+               set->map[j].mq_map = NULL;
+       }
 
        kfree(set->tags);
        set->tags = NULL;
@@ -3037,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
        lockdep_assert_held(&set->tag_list_lock);
 
-       if (nr_hw_queues > nr_cpu_ids)
+       if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
                return;
@@ -3072,7 +3226,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        set->nr_hw_queues = prev_nr_hw_queues;
-                       blk_mq_map_queues(set);
+                       blk_mq_map_queues(&set->map[0]);
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
@@ -3179,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
                return false;
 
        /*
-        * poll_nsec can be:
+        * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
         *
-        * -1:  don't ever hybrid sleep
         *  0:  use half of prev avg
         * >0:  use this specific value
         */
-       if (q->poll_nsec == -1)
-               return false;
-       else if (q->poll_nsec > 0)
+       if (q->poll_nsec > 0)
                nsecs = q->poll_nsec;
        else
                nsecs = blk_mq_poll_nsecs(q, hctx, rq);
@@ -3224,11 +3375,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
        return true;
 }
 
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q,
+                              struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
 {
-       struct request_queue *q = hctx->queue;
+       struct request *rq;
+
+       if (q->poll_nsec == -1)
+               return false;
+
+       if (!blk_qc_t_is_internal(cookie))
+               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+       else {
+               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
+               /*
+                * With scheduling, if the request has completed, we'll
+                * get a NULL return here, as we clear the sched tag when
+                * that happens. The request still remains valid, like always,
+                * so we should be safe with just the NULL check.
+                */
+               if (!rq)
+                       return false;
+       }
+
+       return blk_mq_poll_hybrid_sleep(q, hctx, rq);
+}
+
+/**
+ * blk_poll - poll for IO completions
+ * @q:  the queue
+ * @cookie: cookie passed back at IO submission time
+ * @spin: whether to spin for completions
+ *
+ * Description:
+ *    Poll for completions on the passed in queue. Returns number of
+ *    completed entries found. If @spin is true, then blk_poll will continue
+ *    looping until at least one completion is found, unless the task is
+ *    otherwise marked running (or we need to reschedule).
+ */
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
+{
+       struct blk_mq_hw_ctx *hctx;
        long state;
 
+       if (!blk_qc_t_valid(cookie) ||
+           !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+               return 0;
+
+       if (current->plug)
+               blk_flush_plug_list(current->plug, false);
+
+       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+
        /*
         * If we sleep, have the caller restart the poll loop to reset
         * the state. Like for the other success return cases, the
@@ -3236,63 +3433,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
         * the IO isn't complete, we'll get called again and will go
         * straight to the busy poll loop.
         */
-       if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
-               return true;
+       if (blk_mq_poll_hybrid(q, hctx, cookie))
+               return 1;
 
        hctx->poll_considered++;
 
        state = current->state;
-       while (!need_resched()) {
+       do {
                int ret;
 
                hctx->poll_invoked++;
 
-               ret = q->mq_ops->poll(hctx, rq->tag);
+               ret = q->mq_ops->poll(hctx);
                if (ret > 0) {
                        hctx->poll_success++;
-                       set_current_state(TASK_RUNNING);
-                       return true;
+                       __set_current_state(TASK_RUNNING);
+                       return ret;
                }
 
                if (signal_pending_state(state, current))
-                       set_current_state(TASK_RUNNING);
+                       __set_current_state(TASK_RUNNING);
 
                if (current->state == TASK_RUNNING)
-                       return true;
-               if (ret < 0)
+                       return 1;
+               if (ret < 0 || !spin)
                        break;
                cpu_relax();
-       }
+       } while (!need_resched());
 
        __set_current_state(TASK_RUNNING);
-       return false;
+       return 0;
 }
+EXPORT_SYMBOL_GPL(blk_poll);
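
blk_poll() now returns the number of completions it found and takes a 'spin' argument: a non-spinning caller gets one pass, while a spinning caller keeps polling until something completes, an error is returned, or a reschedule is due. A userspace model of just that loop structure, with stand-in functions:

#include <stdbool.h>
#include <stdio.h>

static int polls_left = 3;      /* pretend the IO completes on the 3rd poll */

static int fake_driver_poll(void)
{
        return --polls_left == 0 ? 1 : 0;   /* 1 completion found, else 0 */
}

static bool fake_need_resched(void)
{
        return false;
}

static int fake_blk_poll(bool spin)
{
        do {
                int ret = fake_driver_poll();

                if (ret > 0)
                        return ret;             /* found completions */
                if (ret < 0 || !spin)
                        break;                  /* error, or caller won't spin */
        } while (!fake_need_resched());

        return 0;
}

int main(void)
{
        printf("spin=0 -> %d completion(s)\n", fake_blk_poll(false));
        printf("spin=1 -> %d completion(s)\n", fake_blk_poll(true));
        return 0;
}
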
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+unsigned int blk_mq_rq_cpu(struct request *rq)
 {
-       struct blk_mq_hw_ctx *hctx;
-       struct request *rq;
-
-       if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-               return false;
-
-       hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-       if (!blk_qc_t_is_internal(cookie))
-               rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-       else {
-               rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-               /*
-                * With scheduling, if the request has completed, we'll
-                * get a NULL return here, as we clear the sched tag when
-                * that happens. The request still remains valid, like always,
-                * so we should be safe with just the NULL check.
-                */
-               if (!rq)
-                       return false;
-       }
-
-       return __blk_mq_poll(hctx, rq);
+       return rq->mq_ctx->cpu;
 }
+EXPORT_SYMBOL(blk_mq_rq_cpu);
 
 static int __init blk_mq_init(void)
 {