Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 29 Jan 2018 19:51:49 +0000 (11:51 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 29 Jan 2018 19:51:49 +0000 (11:51 -0800)
Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal of
     rrpc 1.2 in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.

   - Support in blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.

   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's always been problematic on dm since
     it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it was reused at the wrong time (a
     minimal sketch of the new scheme follows the commit list below).

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.

   - sgl_alloc/free helpers from Bart (a short usage sketch follows
     right after this message).

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"
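
A minimal usage sketch of the sgl_alloc()/sgl_free() helpers mentioned above
(they live in lib/scatterlist.c behind CONFIG_SGL_ALLOC). The demo_fill_sgl()
wrapper and its length argument are invented for illustration; only the
sgl_alloc(), sgl_free() and for_each_sg() calls are the actual API:

	#include <linux/scatterlist.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	/* illustrative only: cover @len bytes with a freshly allocated sgl */
	static int demo_fill_sgl(unsigned long long len)
	{
		struct scatterlist *sgl, *sg;
		unsigned int nents;
		int i;

		/* allocates order-0 pages plus the scatterlist covering @len */
		sgl = sgl_alloc(len, GFP_KERNEL, &nents);
		if (!sgl)
			return -ENOMEM;

		for_each_sg(sgl, sg, nents, i) {
			/* hand sg_page(sg) / sg->length to the device here */
		}

		sgl_free(sgl);	/* frees both the pages and the scatterlist */
		return 0;
	}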

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
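
A minimal sketch of the seqcount-based scheme that the blk-mq timeout rework
in the diff below relies on (struct demo_rq and the demo_* helpers are
illustrative; the real fields are rq->gstate, rq->gstate_seq and
rq->aborted_gstate, initialized in blk_rq_init()/blk_mq_init_request()): the
issue path publishes the new generation and deadline under the seqcount, and
the timeout path reads both back as one coherent snapshot before judging the
request overdue.

	#include <linux/seqlock.h>
	#include <linux/jiffies.h>
	#include <linux/preempt.h>
	#include <linux/types.h>

	struct demo_rq {
		seqcount_t	gstate_seq;	/* orders gstate/deadline updates */
		u64		gstate;		/* generation, bumped on each issue */
		unsigned long	deadline;	/* jiffies at which the rq expires */
	};

	/* issue path: bump the generation and arm the timeout together */
	static void demo_issue(struct demo_rq *rq, unsigned long timeout)
	{
		preempt_disable();
		write_seqcount_begin(&rq->gstate_seq);
		rq->gstate++;
		rq->deadline = jiffies + timeout;
		write_seqcount_end(&rq->gstate_seq);
		preempt_enable();
	}

	/* timeout path: take a coherent (gstate, deadline) snapshot, then test */
	static bool demo_overdue(struct demo_rq *rq, u64 *gstate)
	{
		unsigned int start;
		unsigned long deadline;

		do {
			start = read_seqcount_begin(&rq->gstate_seq);
			*gstate = rq->gstate;
			deadline = rq->deadline;
		} while (read_seqcount_retry(&rq->gstate_seq, start));

		/* the real code also compares *gstate against rq->aborted_gstate */
		return time_after_eq(jiffies, deadline);
	}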

20 files changed:
block/bio.c
block/blk-core.c
block/blk-map.c
block/blk-mq.c
block/blk-throttle.c
block/blk.h
block/bounce.c
drivers/block/null_blk.c
drivers/md/dm-crypt.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fc.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/target/fcloop.c
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
kernel/power/swap.c

diff --combined block/bio.c
index 9ef6cf3addb38cae822d0e5c5ef18ba9e98cd2d7,77993fb4bac682e0ed83d2a7a6bdd5f5d090b0ba..e1708db48258cb9bc8487d732074f476dc1fb520
@@@ -599,8 -599,6 +599,8 @@@ void __bio_clone_fast(struct bio *bio, 
        bio->bi_disk = bio_src->bi_disk;
        bio->bi_partno = bio_src->bi_partno;
        bio_set_flag(bio, BIO_CLONED);
 +      if (bio_flagged(bio_src, BIO_THROTTLED))
 +              bio_set_flag(bio, BIO_THROTTLED);
        bio->bi_opf = bio_src->bi_opf;
        bio->bi_write_hint = bio_src->bi_write_hint;
        bio->bi_iter = bio_src->bi_iter;
@@@ -970,34 -968,6 +970,6 @@@ void bio_advance(struct bio *bio, unsig
  }
  EXPORT_SYMBOL(bio_advance);
  
- /**
-  * bio_alloc_pages - allocates a single page for each bvec in a bio
-  * @bio: bio to allocate pages for
-  * @gfp_mask: flags for allocation
-  *
-  * Allocates pages up to @bio->bi_vcnt.
-  *
-  * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
-  * freed.
-  */
- int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
- {
-       int i;
-       struct bio_vec *bv;
-       bio_for_each_segment_all(bv, bio, i) {
-               bv->bv_page = alloc_page(gfp_mask);
-               if (!bv->bv_page) {
-                       while (--bv >= bio->bi_io_vec)
-                               __free_page(bv->bv_page);
-                       return -ENOMEM;
-               }
-       }
-       return 0;
- }
- EXPORT_SYMBOL(bio_alloc_pages);
  /**
   * bio_copy_data - copy contents of data buffers from one chain of bios to
   * another
@@@ -1838,7 -1808,7 +1810,7 @@@ struct bio *bio_split(struct bio *bio, 
        bio_advance(bio, split->bi_iter.bi_size);
  
        if (bio_flagged(bio, BIO_TRACE_COMPLETION))
-               bio_set_flag(bio, BIO_TRACE_COMPLETION);
+               bio_set_flag(split, BIO_TRACE_COMPLETION);
  
        return split;
  }
diff --combined block/blk-core.c
index 3ba4326a63b59632fad81e686bf8229eee07320b,cdae69be68e9c779ff70028c60ab2bfd44786d6e..a2005a485335b5b42082bf02ffd7b3d1e90b3f3c
@@@ -126,6 -126,8 +126,8 @@@ void blk_rq_init(struct request_queue *
        rq->start_time = jiffies;
        set_start_time_ns(rq);
        rq->part = NULL;
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
  }
  EXPORT_SYMBOL(blk_rq_init);
  
@@@ -562,13 -564,6 +564,13 @@@ static void __blk_drain_queue(struct re
        }
  }
  
 +void blk_drain_queue(struct request_queue *q)
 +{
 +      spin_lock_irq(q->queue_lock);
 +      __blk_drain_queue(q, true);
 +      spin_unlock_irq(q->queue_lock);
 +}
 +
  /**
   * blk_queue_bypass_start - enter queue bypass mode
   * @q: queue of interest
@@@ -696,9 -691,20 +698,18 @@@ void blk_cleanup_queue(struct request_q
         */
        blk_freeze_queue(q);
        spin_lock_irq(lock);
 -      if (!q->mq_ops)
 -              __blk_drain_queue(q, true);
        queue_flag_set(QUEUE_FLAG_DEAD, q);
        spin_unlock_irq(lock);
  
+       /*
+        * make sure all in-progress dispatch are completed because
+        * blk_freeze_queue() can only complete all requests, and
+        * dispatch may still be in-progress since we dispatch requests
+        * from more than one contexts
+        */
+       if (q->mq_ops)
+               blk_mq_quiesce_queue(q);
        /* for synchronous bio-based driver finish in-flight integrity i/o */
        blk_flush_integrity();
  
@@@ -1646,6 -1652,7 +1657,7 @@@ void __blk_put_request(struct request_q
  
        lockdep_assert_held(q->queue_lock);
  
+       blk_req_zone_write_unlock(req);
        blk_pm_put_request(req);
  
        elv_completed_request(q, req);
@@@ -2055,6 -2062,21 +2067,21 @@@ static inline bool should_fail_request(
  
  #endif /* CONFIG_FAIL_MAKE_REQUEST */
  
+ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+ {
+       if (part->policy && op_is_write(bio_op(bio))) {
+               char b[BDEVNAME_SIZE];
+               printk(KERN_ERR
+                      "generic_make_request: Trying to write "
+                       "to read-only block-device %s (partno %d)\n",
+                       bio_devname(bio, b), part->partno);
+               return true;
+       }
+       return false;
+ }
  /*
   * Remap block n of partition p to block n+start(p) of the disk.
   */
@@@ -2063,27 -2085,28 +2090,28 @@@ static inline int blk_partition_remap(s
        struct hd_struct *p;
        int ret = 0;
  
+       rcu_read_lock();
+       p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+       if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+                    bio_check_ro(bio, p))) {
+               ret = -EIO;
+               goto out;
+       }
        /*
         * Zone reset does not include bi_size so bio_sectors() is always 0.
         * Include a test for the reset op code and perform the remap if needed.
         */
-       if (!bio->bi_partno ||
-           (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
-               return 0;
+       if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+               goto out;
  
-       rcu_read_lock();
-       p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-       if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
-               bio->bi_iter.bi_sector += p->start_sect;
-               bio->bi_partno = 0;
-               trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-                               bio->bi_iter.bi_sector - p->start_sect);
-       } else {
-               printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
-               ret = -EIO;
-       }
-       rcu_read_unlock();
+       bio->bi_iter.bi_sector += p->start_sect;
+       bio->bi_partno = 0;
+       trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+                             bio->bi_iter.bi_sector - p->start_sect);
  
+ out:
+       rcu_read_unlock();
        return ret;
  }
  
@@@ -2142,15 -2165,19 +2170,19 @@@ generic_make_request_checks(struct bio 
         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
         * if queue is not a request based queue.
         */
        if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
                goto not_supported;
  
        if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
                goto end_io;
  
-       if (blk_partition_remap(bio))
-               goto end_io;
+       if (!bio->bi_partno) {
+               if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+                       goto end_io;
+       } else {
+               if (blk_partition_remap(bio))
+                       goto end_io;
+       }
  
        if (bio_check_eod(bio, nr_sectors))
                goto end_io;
@@@ -2493,8 -2520,7 +2525,7 @@@ blk_status_t blk_insert_cloned_request(
                 * bypass a potential scheduler on the bottom device for
                 * insert.
                 */
-               blk_mq_request_bypass_insert(rq, true);
-               return BLK_STS_OK;
+               return blk_mq_request_issue_directly(rq);
        }
  
        spin_lock_irqsave(q->queue_lock, flags);
@@@ -2846,7 -2872,7 +2877,7 @@@ void blk_start_request(struct request *
                wbt_issue(req->q->rq_wb, &req->issue_stat);
        }
  
-       BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+       BUG_ON(blk_rq_is_complete(req));
        blk_add_timer(req);
  }
  EXPORT_SYMBOL(blk_start_request);
@@@ -3415,20 -3441,6 +3446,6 @@@ int kblockd_mod_delayed_work_on(int cpu
  }
  EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
  
- int kblockd_schedule_delayed_work(struct delayed_work *dwork,
-                                 unsigned long delay)
- {
-       return queue_delayed_work(kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work);
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
-                                    unsigned long delay)
- {
-       return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
  /**
   * blk_start_plug - initialize blk_plug and track it inside the task_struct
   * @plug:     The &struct blk_plug that needs to be initialized
diff --combined block/blk-map.c
index d3a94719f03fb2af81d6270d6fc9ed58f0dde373,209eb3b45c54d95cb4dfff07cd78a01ac2b4fd91..db9373bd31aca0e9393dd77ec9ef5f404d88923e
  #include "blk.h"
  
  /*
 - * Append a bio to a passthrough request.  Only works can be merged into
 - * the request based on the driver constraints.
 + * Append a bio to a passthrough request.  Only works if the bio can be merged
 + * into the request based on the driver constraints.
   */
 -int blk_rq_append_bio(struct request *rq, struct bio *bio)
 +int blk_rq_append_bio(struct request *rq, struct bio **bio)
  {
 -      blk_queue_bounce(rq->q, &bio);
 +      struct bio *orig_bio = *bio;
 +
 +      blk_queue_bounce(rq->q, bio);
  
        if (!rq->bio) {
 -              blk_rq_bio_prep(rq->q, rq, bio);
 +              blk_rq_bio_prep(rq->q, rq, *bio);
        } else {
 -              if (!ll_back_merge_fn(rq->q, rq, bio))
 +              if (!ll_back_merge_fn(rq->q, rq, *bio)) {
 +                      if (orig_bio != *bio) {
 +                              bio_put(*bio);
 +                              *bio = orig_bio;
 +                      }
                        return -EINVAL;
 +              }
  
 -              rq->biotail->bi_next = bio;
 -              rq->biotail = bio;
 -              rq->__data_len += bio->bi_iter.bi_size;
 +              rq->biotail->bi_next = *bio;
 +              rq->biotail = *bio;
 +              rq->__data_len += (*bio)->bi_iter.bi_size;
        }
  
        return 0;
@@@ -80,12 -73,14 +80,12 @@@ static int __blk_rq_map_user_iov(struc
         * We link the bounce buffer in and could have to traverse it
         * later so we have to get a ref to prevent it from being freed
         */
 -      ret = blk_rq_append_bio(rq, bio);
 -      bio_get(bio);
 +      ret = blk_rq_append_bio(rq, &bio);
        if (ret) {
 -              bio_endio(bio);
                __blk_rq_unmap_user(orig_bio);
 -              bio_put(bio);
                return ret;
        }
 +      bio_get(bio);
  
        return 0;
  }
@@@ -119,7 -114,7 +119,7 @@@ int blk_rq_map_user_iov(struct request_
        unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
        struct bio *bio = NULL;
        struct iov_iter i;
-       int ret;
+       int ret = -EINVAL;
  
        if (!iter_is_iovec(iter))
                goto fail;
@@@ -148,7 -143,7 +148,7 @@@ unmap_rq
        __blk_rq_unmap_user(bio);
  fail:
        rq->bio = NULL;
-       return -EINVAL;
+       return ret;
  }
  EXPORT_SYMBOL(blk_rq_map_user_iov);
  
@@@ -218,7 -213,7 +218,7 @@@ int blk_rq_map_kern(struct request_queu
        int reading = rq_data_dir(rq) == READ;
        unsigned long addr = (unsigned long) kbuf;
        int do_copy = 0;
 -      struct bio *bio;
 +      struct bio *bio, *orig_bio;
        int ret;
  
        if (len > (queue_max_hw_sectors(q) << 9))
        if (do_copy)
                rq->rq_flags |= RQF_COPY_USER;
  
 -      ret = blk_rq_append_bio(rq, bio);
 +      orig_bio = bio;
 +      ret = blk_rq_append_bio(rq, &bio);
        if (unlikely(ret)) {
                /* request is too big */
 -              bio_put(bio);
 +              bio_put(orig_bio);
                return ret;
        }
  
diff --combined block/blk-mq.c
index 3d379732749175ece7ae39e427e115f51621c521,43e7449723e0bf7ffa35c5c0814db261c81824fc..01f271d40825ebfd6ca82fdd2f887d432946799c
@@@ -95,8 -95,7 +95,7 @@@ static void blk_mq_check_inflight(struc
  {
        struct mq_inflight *mi = priv;
  
-       if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
-           !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+       if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
                /*
                 * index[0] counts the specific partition that was asked
                 * for. index[1] counts the ones that are active on the
@@@ -161,8 -160,6 +160,8 @@@ void blk_freeze_queue(struct request_qu
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
 +      if (!q->mq_ops)
 +              blk_drain_queue(q);
        blk_mq_freeze_queue_wait(q);
  }
  
@@@ -222,7 -219,7 +221,7 @@@ void blk_mq_quiesce_queue(struct reques
  
        queue_for_each_hw_ctx(q, hctx, i) {
                if (hctx->flags & BLK_MQ_F_BLOCKING)
-                       synchronize_srcu(hctx->queue_rq_srcu);
+                       synchronize_srcu(hctx->srcu);
                else
                        rcu = true;
        }
@@@ -272,15 -269,14 +271,14 @@@ static struct request *blk_mq_rq_ctx_in
  {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
-       rq->rq_flags = 0;
+       req_flags_t rq_flags = 0;
  
        if (data->flags & BLK_MQ_REQ_INTERNAL) {
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
                if (blk_mq_tag_busy(data->hctx)) {
-                       rq->rq_flags = RQF_MQ_INFLIGHT;
+                       rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
                rq->tag = tag;
                data->hctx->tags->rqs[rq->tag] = rq;
        }
  
-       INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
+       rq->rq_flags = rq_flags;
+       rq->cpu = -1;
        rq->cmd_flags = op;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        if (blk_queue_io_stat(data->q))
                rq->rq_flags |= RQF_IO_STAT;
-       /* do not touch atomic flags, it needs atomic ops against the timer */
-       rq->cpu = -1;
+       INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
        rq->start_time = jiffies;
- #ifdef CONFIG_BLK_CGROUP
-       rq->rl = NULL;
-       set_start_time_ns(rq);
-       rq->io_start_time_ns = 0;
- #endif
        rq->nr_phys_segments = 0;
  #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
        rq->special = NULL;
        /* tag was already set */
        rq->extra_len = 0;
+       rq->__deadline = 0;
  
        INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
  
+ #ifdef CONFIG_BLK_CGROUP
+       rq->rl = NULL;
+       set_start_time_ns(rq);
+       rq->io_start_time_ns = 0;
+ #endif
        data->ctx->rq_dispatched[op_is_sync(op)]++;
        return rq;
  }
@@@ -443,7 -441,7 +443,7 @@@ struct request *blk_mq_alloc_request_hc
                blk_queue_exit(q);
                return ERR_PTR(-EXDEV);
        }
-       cpu = cpumask_first(alloc_data.hctx->cpumask);
+       cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
        alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
  
        rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@@ -485,8 -483,7 +485,7 @@@ void blk_mq_free_request(struct reques
        if (blk_rq_rl(rq))
                blk_put_rl(blk_rq_rl(rq));
  
-       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+       blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
@@@ -532,6 -529,9 +531,9 @@@ static void __blk_mq_complete_request(s
        bool shared = false;
        int cpu;
  
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+       blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
        if (rq->internal_tag != -1)
                blk_mq_sched_completed_request(rq);
        if (rq->rq_flags & RQF_STATS) {
        put_cpu();
  }
  
+ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+       __releases(hctx->srcu)
+ {
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+               rcu_read_unlock();
+       else
+               srcu_read_unlock(hctx->srcu, srcu_idx);
+ }
+ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+       __acquires(hctx->srcu)
+ {
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+               /* shut up gcc false positive */
+               *srcu_idx = 0;
+               rcu_read_lock();
+       } else
+               *srcu_idx = srcu_read_lock(hctx->srcu);
+ }
+ static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+ {
+       unsigned long flags;
+       /*
+        * blk_mq_rq_aborted_gstate() is used from the completion path and
+        * can thus be called from irq context.  u64_stats_fetch in the
+        * middle of update on the same CPU leads to lockup.  Disable irq
+        * while updating.
+        */
+       local_irq_save(flags);
+       u64_stats_update_begin(&rq->aborted_gstate_sync);
+       rq->aborted_gstate = gstate;
+       u64_stats_update_end(&rq->aborted_gstate_sync);
+       local_irq_restore(flags);
+ }
+ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+ {
+       unsigned int start;
+       u64 aborted_gstate;
+       do {
+               start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+               aborted_gstate = rq->aborted_gstate;
+       } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+       return aborted_gstate;
+ }
  /**
   * blk_mq_complete_request - end I/O on a request
   * @rq:               the request being processed
  void blk_mq_complete_request(struct request *rq)
  {
        struct request_queue *q = rq->q;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+       int srcu_idx;
  
        if (unlikely(blk_should_fake_timeout(q)))
                return;
-       if (!blk_mark_rq_complete(rq))
+       /*
+        * If @rq->aborted_gstate equals the current instance, timeout is
+        * claiming @rq and we lost.  This is synchronized through
+        * hctx_lock().  See blk_mq_timeout_work() for details.
+        *
+        * Completion path never blocks and we can directly use RCU here
+        * instead of hctx_lock() which can be either RCU or SRCU.
+        * However, that would complicate paths which want to synchronize
+        * against us.  Let stay in sync with the issue path so that
+        * hctx_lock() covers both issue and completion paths.
+        */
+       hctx_lock(hctx, &srcu_idx);
+       if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
                __blk_mq_complete_request(rq);
+       hctx_unlock(hctx, srcu_idx);
  }
  EXPORT_SYMBOL(blk_mq_complete_request);
  
  int blk_mq_request_started(struct request *rq)
  {
-       return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
  }
  EXPORT_SYMBOL_GPL(blk_mq_request_started);
  
@@@ -598,34 -664,27 +666,27 @@@ void blk_mq_start_request(struct reques
                wbt_issue(q->rq_wb, &rq->issue_stat);
        }
  
-       blk_add_timer(rq);
-       WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
  
        /*
-        * Mark us as started and clear complete. Complete might have been
-        * set if requeue raced with timeout, which then marked it as
-        * complete. So be sure to clear complete again when we start
-        * the request, otherwise we'll ignore the completion event.
+        * Mark @rq in-flight which also advances the generation number,
+        * and register for timeout.  Protect with a seqcount to allow the
+        * timeout path to read both @rq->gstate and @rq->deadline
+        * coherently.
         *
-        * Ensure that ->deadline is visible before we set STARTED, such that
-        * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-        * it observes STARTED.
+        * This is the only place where a request is marked in-flight.  If
+        * the timeout path reads an in-flight @rq->gstate, the
+        * @rq->deadline it reads together under @rq->gstate_seq is
+        * guaranteed to be the matching one.
         */
-       smp_wmb();
-       set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-               /*
-                * Coherence order guarantees these consecutive stores to a
-                * single variable propagate in the specified order. Thus the
-                * clear_bit() is ordered _after_ the set bit. See
-                * blk_mq_check_expired().
-                *
-                * (the bits must be part of the same byte for this to be
-                * true).
-                */
-               clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-       }
+       preempt_disable();
+       write_seqcount_begin(&rq->gstate_seq);
+       blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+       blk_add_timer(rq);
+       write_seqcount_end(&rq->gstate_seq);
+       preempt_enable();
  
        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
  EXPORT_SYMBOL(blk_mq_start_request);
  
  /*
-  * When we reach here because queue is busy, REQ_ATOM_COMPLETE
-  * flag isn't set yet, so there may be race with timeout handler,
-  * but given rq->deadline is just set in .queue_rq() under
-  * this situation, the race won't be possible in reality because
-  * rq->timeout should be set as big enough to cover the window
-  * between blk_mq_start_request() called from .queue_rq() and
-  * clearing REQ_ATOM_STARTED here.
+  * When we reach here because queue is busy, it's safe to change the state
+  * to IDLE without checking @rq->aborted_gstate because we should still be
+  * holding the RCU read lock and thus protected against timeout.
   */
  static void __blk_mq_requeue_request(struct request *rq)
  {
        wbt_requeue(q->rq_wb, &rq->issue_stat);
        blk_mq_sched_requeue_request(rq);
  
-       if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+       if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
+               blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                if (q->dma_drain_size && blk_rq_bytes(rq))
                        rq->nr_phys_segments--;
        }
@@@ -689,13 -745,13 +747,13 @@@ static void blk_mq_requeue_work(struct 
  
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_sched_insert_request(rq, true, false, false, true);
+               blk_mq_sched_insert_request(rq, true, false, false);
        }
  
        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               blk_mq_sched_insert_request(rq, false, false, false, true);
+               blk_mq_sched_insert_request(rq, false, false, false);
        }
  
        blk_mq_run_hw_queues(q, false);
@@@ -729,7 -785,7 +787,7 @@@ EXPORT_SYMBOL(blk_mq_add_to_requeue_lis
  
  void blk_mq_kick_requeue_list(struct request_queue *q)
  {
-       kblockd_schedule_delayed_work(&q->requeue_work, 0);
+       kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
  }
  EXPORT_SYMBOL(blk_mq_kick_requeue_list);
  
@@@ -755,24 -811,15 +813,15 @@@ EXPORT_SYMBOL(blk_mq_tag_to_rq)
  struct blk_mq_timeout_data {
        unsigned long next;
        unsigned int next_set;
+       unsigned int nr_expired;
  };
  
- void blk_mq_rq_timed_out(struct request *req, bool reserved)
+ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
  {
        const struct blk_mq_ops *ops = req->q->mq_ops;
        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
  
-       /*
-        * We know that complete is set at this point. If STARTED isn't set
-        * anymore, then the request isn't active and the "timeout" should
-        * just be ignored. This can happen due to the bitflag ordering.
-        * Timeout first checks if STARTED is set, and if it is, assumes
-        * the request is active. But if we race with completion, then
-        * both flags will get cleared. So check here again, and ignore
-        * a timeout event with a request that isn't active.
-        */
-       if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
-               return;
+       req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
  
        if (ops->timeout)
                ret = ops->timeout(req, reserved);
                __blk_mq_complete_request(req);
                break;
        case BLK_EH_RESET_TIMER:
+               /*
+                * As nothing prevents from completion happening while
+                * ->aborted_gstate is set, this may lead to ignored
+                * completions and further spurious timeouts.
+                */
+               blk_mq_rq_update_aborted_gstate(req, 0);
                blk_add_timer(req);
-               blk_clear_rq_complete(req);
                break;
        case BLK_EH_NOT_HANDLED:
                break;
@@@ -797,50 -849,51 +851,51 @@@ static void blk_mq_check_expired(struc
                struct request *rq, void *priv, bool reserved)
  {
        struct blk_mq_timeout_data *data = priv;
-       unsigned long deadline;
+       unsigned long gstate, deadline;
+       int start;
  
-       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-               return;
+       might_sleep();
  
-       /*
-        * Ensures that if we see STARTED we must also see our
-        * up-to-date deadline, see blk_mq_start_request().
-        */
-       smp_rmb();
+       if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
+               return;
  
-       deadline = READ_ONCE(rq->deadline);
+       /* read coherent snapshots of @rq->state_gen and @rq->deadline */
+       while (true) {
+               start = read_seqcount_begin(&rq->gstate_seq);
+               gstate = READ_ONCE(rq->gstate);
+               deadline = blk_rq_deadline(rq);
+               if (!read_seqcount_retry(&rq->gstate_seq, start))
+                       break;
+               cond_resched();
+       }
  
-       /*
-        * The rq being checked may have been freed and reallocated
-        * out already here, we avoid this race by checking rq->deadline
-        * and REQ_ATOM_COMPLETE flag together:
-        *
-        * - if rq->deadline is observed as new value because of
-        *   reusing, the rq won't be timed out because of timing.
-        * - if rq->deadline is observed as previous value,
-        *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-        *   because we put a barrier between setting rq->deadline
-        *   and clearing the flag in blk_mq_start_request(), so
-        *   this rq won't be timed out too.
-        */
-       if (time_after_eq(jiffies, deadline)) {
-               if (!blk_mark_rq_complete(rq)) {
-                       /*
-                        * Again coherence order ensures that consecutive reads
-                        * from the same variable must be in that order. This
-                        * ensures that if we see COMPLETE clear, we must then
-                        * see STARTED set and we'll ignore this timeout.
-                        *
-                        * (There's also the MB implied by the test_and_clear())
-                        */
-                       blk_mq_rq_timed_out(rq, reserved);
-               }
+       /* if in-flight && overdue, mark for abortion */
+       if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+           time_after_eq(jiffies, deadline)) {
+               blk_mq_rq_update_aborted_gstate(rq, gstate);
+               data->nr_expired++;
+               hctx->nr_expired++;
        } else if (!data->next_set || time_after(data->next, deadline)) {
                data->next = deadline;
                data->next_set = 1;
        }
  }
  
+ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+               struct request *rq, void *priv, bool reserved)
+ {
+       /*
+        * We marked @rq->aborted_gstate and waited for RCU.  If there were
+        * completions that we lost to, they would have finished and
+        * updated @rq->gstate by now; otherwise, the completion path is
+        * now guaranteed to see @rq->aborted_gstate and yield.  If
+        * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+        */
+       if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+           READ_ONCE(rq->gstate) == rq->aborted_gstate)
+               blk_mq_rq_timed_out(rq, reserved);
+ }
  static void blk_mq_timeout_work(struct work_struct *work)
  {
        struct request_queue *q =
        struct blk_mq_timeout_data data = {
                .next           = 0,
                .next_set       = 0,
+               .nr_expired     = 0,
        };
+       struct blk_mq_hw_ctx *hctx;
        int i;
  
        /* A deadlock might occur if a request is stuck requiring a
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;
  
+       /* scan for the expired ones and set their ->aborted_gstate */
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
  
+       if (data.nr_expired) {
+               bool has_rcu = false;
+               /*
+                * Wait till everyone sees ->aborted_gstate.  The
+                * sequential waits for SRCUs aren't ideal.  If this ever
+                * becomes a problem, we can add per-hw_ctx rcu_head and
+                * wait in parallel.
+                */
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->nr_expired)
+                               continue;
+                       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                               has_rcu = true;
+                       else
+                               synchronize_srcu(hctx->srcu);
+                       hctx->nr_expired = 0;
+               }
+               if (has_rcu)
+                       synchronize_rcu();
+               /* terminate the ones we won */
+               blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+       }
        if (data.next_set) {
                data.next = blk_rq_timeout(round_jiffies_up(data.next));
                mod_timer(&q->timeout, data.next);
        } else {
-               struct blk_mq_hw_ctx *hctx;
+               /*
+                * Request timeouts are handled as a forward rolling timer. If
+                * we end up here it means that no requests are pending and
+                * also that no request has been pending for a while. Mark
+                * each hctx as idle.
+                */
                queue_for_each_hw_ctx(q, hctx, i) {
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx))
@@@ -1010,66 -1097,67 +1099,67 @@@ static int blk_mq_dispatch_wake(wait_qu
  
  /*
   * Mark us waiting for a tag. For shared tags, this involves hooking us into
-  * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
-  * restart. For both caes, take care to check the condition again after
+  * the tag wakeups. For non-shared tags, we can simply mark us needing a
+  * restart. For both cases, take care to check the condition again after
   * marking us as waiting.
   */
  static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
                                 struct request *rq)
  {
        struct blk_mq_hw_ctx *this_hctx = *hctx;
-       bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
        struct sbq_wait_state *ws;
        wait_queue_entry_t *wait;
        bool ret;
  
-       if (!shared_tags) {
+       if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
                if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
                        set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
-       } else {
-               wait = &this_hctx->dispatch_wait;
-               if (!list_empty_careful(&wait->entry))
-                       return false;
  
-               spin_lock(&this_hctx->lock);
-               if (!list_empty(&wait->entry)) {
-                       spin_unlock(&this_hctx->lock);
-                       return false;
-               }
+               /*
+                * It's possible that a tag was freed in the window between the
+                * allocation failure and adding the hardware queue to the wait
+                * queue.
+                *
+                * Don't clear RESTART here, someone else could have set it.
+                * At most this will cost an extra queue run.
+                */
+               return blk_mq_get_driver_tag(rq, hctx, false);
+       }
  
-               ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-               add_wait_queue(&ws->wait, wait);
+       wait = &this_hctx->dispatch_wait;
+       if (!list_empty_careful(&wait->entry))
+               return false;
+       spin_lock(&this_hctx->lock);
+       if (!list_empty(&wait->entry)) {
+               spin_unlock(&this_hctx->lock);
+               return false;
        }
  
+       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+       add_wait_queue(&ws->wait, wait);
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
        ret = blk_mq_get_driver_tag(rq, hctx, false);
-       if (!shared_tags) {
-               /*
-                * Don't clear RESTART here, someone else could have set it.
-                * At most this will cost an extra queue run.
-                */
-               return ret;
-       } else {
-               if (!ret) {
-                       spin_unlock(&this_hctx->lock);
-                       return false;
-               }
-               /*
-                * We got a tag, remove ourselves from the wait queue to ensure
-                * someone else gets the wakeup.
-                */
-               spin_lock_irq(&ws->wait.lock);
-               list_del_init(&wait->entry);
-               spin_unlock_irq(&ws->wait.lock);
+       if (!ret) {
                spin_unlock(&this_hctx->lock);
-               return true;
+               return false;
        }
+       /*
+        * We got a tag, remove ourselves from the wait queue to ensure
+        * someone else gets the wakeup.
+        */
+       spin_lock_irq(&ws->wait.lock);
+       list_del_init(&wait->entry);
+       spin_unlock_irq(&ws->wait.lock);
+       spin_unlock(&this_hctx->lock);
+       return true;
  }
  
  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@@ -1206,9 -1294,27 +1296,27 @@@ static void __blk_mq_run_hw_queue(struc
        /*
         * We should be running this queue from one of the CPUs that
         * are mapped to it.
+        *
+        * There are at least two related races now between setting
+        * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+        * __blk_mq_run_hw_queue():
+        *
+        * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+        *   but later it becomes online, then this warning is harmless
+        *   at all
+        *
+        * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+        *   but later it becomes offline, then the warning can't be
+        *   triggered, and we depend on blk-mq timeout handler to
+        *   handle dispatched requests to this hctx
         */
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-               cpu_online(hctx->next_cpu));
+       if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu)) {
+               printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+                       raw_smp_processor_id(),
+                       cpumask_empty(hctx->cpumask) ? "inactive": "active");
+               dump_stack();
+       }
  
        /*
         * We can't run the queue inline with ints disabled. Ensure that
         */
        WARN_ON_ONCE(in_interrupt());
  
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               blk_mq_sched_dispatch_requests(hctx);
-               rcu_read_unlock();
-       } else {
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
  
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               blk_mq_sched_dispatch_requests(hctx);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       blk_mq_sched_dispatch_requests(hctx);
+       hctx_unlock(hctx, srcu_idx);
  }
  
  /*
   */
  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  {
+       bool tried = false;
        if (hctx->queue->nr_hw_queues == 1)
                return WORK_CPU_UNBOUND;
  
        if (--hctx->next_cpu_batch <= 0) {
                int next_cpu;
-               next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+ select_cpu:
+               next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+                               cpu_online_mask);
                if (next_cpu >= nr_cpu_ids)
-                       next_cpu = cpumask_first(hctx->cpumask);
+                       next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
  
-               hctx->next_cpu = next_cpu;
+               /*
+                * No online CPU is found, so have to make sure hctx->next_cpu
+                * is set correctly for not breaking workqueue.
+                */
+               if (next_cpu >= nr_cpu_ids)
+                       hctx->next_cpu = cpumask_first(hctx->cpumask);
+               else
+                       hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
  
+       /*
+        * Do unbound schedule if we can't find a online CPU for this hctx,
+        * and it should only happen in the path of handling CPU DEAD.
+        */
+       if (!cpu_online(hctx->next_cpu)) {
+               if (!tried) {
+                       tried = true;
+                       goto select_cpu;
+               }
+               /*
+                * Make sure to re-select CPU next time once after CPUs
+                * in hctx->cpumask become online again.
+                */
+               hctx->next_cpu_batch = 1;
+               return WORK_CPU_UNBOUND;
+       }
        return hctx->next_cpu;
  }
  
@@@ -1274,9 -1401,8 +1403,8 @@@ static void __blk_mq_delay_run_hw_queue
                put_cpu();
        }
  
-       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-                                        &hctx->run_work,
-                                        msecs_to_jiffies(msecs));
+       kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+                                   msecs_to_jiffies(msecs));
  }
  
  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@@ -1287,7 -1413,23 +1415,23 @@@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue
  
  bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
  {
-       if (blk_mq_hctx_has_pending(hctx)) {
+       int srcu_idx;
+       bool need_run;
+       /*
+        * When queue is quiesced, we may be switching io scheduler, or
+        * updating nr_hw_queues, or other things, and we can't run queue
+        * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+        *
+        * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+        * quiesced.
+        */
+       hctx_lock(hctx, &srcu_idx);
+       need_run = !blk_queue_quiesced(hctx->queue) &&
+               blk_mq_hctx_has_pending(hctx);
+       hctx_unlock(hctx, srcu_idx);
+       if (need_run) {
                __blk_mq_delay_run_hw_queue(hctx, async, 0);
                return true;
        }
@@@ -1595,9 -1737,9 +1739,9 @@@ static blk_qc_t request_to_qc_t(struct 
        return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
  }
  
- static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                       struct request *rq,
-                                       blk_qc_t *cookie, bool may_sleep)
+ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+                                           struct request *rq,
+                                           blk_qc_t *cookie)
  {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
        };
        blk_qc_t new_cookie;
        blk_status_t ret;
+       new_cookie = request_to_qc_t(hctx, rq);
+       /*
+        * For OK queue, we are done. For error, caller may kill it.
+        * Any other error (busy), just add it to our list as we
+        * previously would have done.
+        */
+       ret = q->mq_ops->queue_rq(hctx, &bd);
+       switch (ret) {
+       case BLK_STS_OK:
+               *cookie = new_cookie;
+               break;
+       case BLK_STS_RESOURCE:
+               __blk_mq_requeue_request(rq);
+               break;
+       default:
+               *cookie = BLK_QC_T_NONE;
+               break;
+       }
+       return ret;
+ }
+ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+                                               struct request *rq,
+                                               blk_qc_t *cookie,
+                                               bool bypass_insert)
+ {
+       struct request_queue *q = rq->q;
        bool run_queue = true;
  
-       /* RCU or SRCU read lock is needed before checking quiesced flag */
+       /*
+        * RCU or SRCU read lock is needed before checking quiesced flag.
+        *
+        * When queue is stopped or quiesced, ignore 'bypass_insert' from
+        * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
+        * and avoid driver to try to dispatch again.
+        */
        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
                run_queue = false;
+               bypass_insert = false;
                goto insert;
        }
  
-       if (q->elevator)
+       if (q->elevator && !bypass_insert)
                goto insert;
  
        if (!blk_mq_get_driver_tag(rq, NULL, false))
                goto insert;
        }
  
-       new_cookie = request_to_qc_t(hctx, rq);
-       /*
-        * For OK queue, we are done. For error, kill it. Any other
-        * error (busy), just add it to our list as we previously
-        * would have done
-        */
-       ret = q->mq_ops->queue_rq(hctx, &bd);
-       switch (ret) {
-       case BLK_STS_OK:
-               *cookie = new_cookie;
-               return;
-       case BLK_STS_RESOURCE:
-               __blk_mq_requeue_request(rq);
-               goto insert;
-       default:
-               *cookie = BLK_QC_T_NONE;
-               blk_mq_end_request(rq, ret);
-               return;
-       }
+       return __blk_mq_issue_directly(hctx, rq, cookie);
  insert:
-       blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+       if (bypass_insert)
+               return BLK_STS_RESOURCE;
+       blk_mq_sched_insert_request(rq, false, run_queue, false);
+       return BLK_STS_OK;
  }
  
  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq, blk_qc_t *cookie)
  {
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               __blk_mq_try_issue_directly(hctx, rq, cookie, false);
-               rcu_read_unlock();
-       } else {
-               unsigned int srcu_idx;
+       blk_status_t ret;
+       int srcu_idx;
  
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
  
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               __blk_mq_try_issue_directly(hctx, rq, cookie, true);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+       if (ret == BLK_STS_RESOURCE)
+               blk_mq_sched_insert_request(rq, false, true, false);
+       else if (ret != BLK_STS_OK)
+               blk_mq_end_request(rq, ret);
+       hctx_unlock(hctx, srcu_idx);
+ }
+ blk_status_t blk_mq_request_issue_directly(struct request *rq)
+ {
+       blk_status_t ret;
+       int srcu_idx;
+       blk_qc_t unused_cookie;
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+       hctx_lock(hctx, &srcu_idx);
+       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+       hctx_unlock(hctx, srcu_idx);
+       return ret;
  }
  
  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        } else if (q->elevator) {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_sched_insert_request(rq, false, true, true, true);
+               blk_mq_sched_insert_request(rq, false, true, true);
        } else {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
@@@ -1869,6 -2048,22 +2050,22 @@@ static size_t order_to_size(unsigned in
        return (size_t)PAGE_SIZE << order;
  }
  
+ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                              unsigned int hctx_idx, int node)
+ {
+       int ret;
+       if (set->ops->init_request) {
+               ret = set->ops->init_request(set, rq, hctx_idx, node);
+               if (ret)
+                       return ret;
+       }
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
+       return 0;
+ }
  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx, unsigned int depth)
  {
                        struct request *rq = p;
  
                        tags->static_rqs[i] = rq;
-                       if (set->ops->init_request) {
-                               if (set->ops->init_request(set, rq, hctx_idx,
-                                               node)) {
-                                       tags->static_rqs[i] = NULL;
-                                       goto fail;
-                               }
+                       if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                               tags->static_rqs[i] = NULL;
+                               goto fail;
                        }
  
                        p += rq_size;
@@@ -1994,7 -2186,8 +2188,8 @@@ static void blk_mq_exit_hctx(struct req
  {
        blk_mq_debugfs_unregister_hctx(hctx);
  
-       blk_mq_tag_idle(hctx);
+       if (blk_mq_hw_queue_mapped(hctx))
+               blk_mq_tag_idle(hctx);
  
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
                set->ops->exit_hctx(hctx, hctx_idx);
  
        if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->queue_rq_srcu);
+               cleanup_srcu_struct(hctx->srcu);
  
        blk_mq_remove_cpuhp(hctx);
        blk_free_flush_queue(hctx->fq);
@@@ -2074,13 -2267,11 +2269,11 @@@ static int blk_mq_init_hctx(struct requ
        if (!hctx->fq)
                goto sched_exit_hctx;
  
-       if (set->ops->init_request &&
-           set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                  node))
+       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                goto free_fq;
  
        if (hctx->flags & BLK_MQ_F_BLOCKING)
-               init_srcu_struct(hctx->queue_rq_srcu);
+               init_srcu_struct(hctx->srcu);
  
        blk_mq_debugfs_register_hctx(q, hctx);
  
@@@ -2116,16 -2307,11 +2309,11 @@@ static void blk_mq_init_cpu_queues(stru
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
  
-               /* If the cpu isn't present, the cpu is mapped to first hctx */
-               if (!cpu_present(i))
-                       continue;
-               hctx = blk_mq_map_queue(q, i);
                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
+               hctx = blk_mq_map_queue(q, i);
                if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
                        hctx->numa_node = local_memory_node(cpu_to_node(i));
        }
@@@ -2182,7 -2368,7 +2370,7 @@@ static void blk_mq_map_swqueue(struct r
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
-       for_each_present_cpu(i) {
+       for_each_possible_cpu(i) {
                hctx_idx = q->mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
                if (!set->tags[hctx_idx] &&
                /*
                 * Initialize batch roundrobin counts
                 */
-               hctx->next_cpu = cpumask_first(hctx->cpumask);
+               hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+                               cpu_online_mask);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
  }
@@@ -2369,7 -2556,7 +2558,7 @@@ static int blk_mq_hw_ctx_size(struct bl
  {
        int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
  
-       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
                           __alignof__(struct blk_mq_hw_ctx)) !=
                     sizeof(struct blk_mq_hw_ctx));
  
@@@ -2386,6 -2573,9 +2575,9 @@@ static void blk_mq_realloc_hw_ctxs(stru
        struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
  
        blk_mq_sysfs_unregister(q);
+       /* protect against switching io scheduler  */
+       mutex_lock(&q->sysfs_lock);
        for (i = 0; i < set->nr_hw_queues; i++) {
                int node;
  
                }
        }
        q->nr_hw_queues = i;
+       mutex_unlock(&q->sysfs_lock);
        blk_mq_sysfs_register(q);
  }
  
@@@ -2601,9 -2792,27 +2794,27 @@@ static int blk_mq_alloc_rq_maps(struct 
  
  static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
  {
-       if (set->ops->map_queues)
+       if (set->ops->map_queues) {
+               int cpu;
+               /*
+                * transport .map_queues is usually done in the following
+                * way:
+                *
+                * for (queue = 0; queue < set->nr_hw_queues; queue++) {
+                *      mask = get_cpu_mask(queue)
+                *      for_each_cpu(cpu, mask)
+                *              set->mq_map[cpu] = queue;
+                * }
+                *
+                * When we need to remap, the table has to be cleared for
+                * killing stale mapping since one CPU may not be mapped
+                * to any hw queue.
+                */
+               for_each_possible_cpu(cpu)
+                       set->mq_map[cpu] = 0;
                return set->ops->map_queues(set);
-       else
+       } else
                return blk_mq_map_queues(set);
  }
  
@@@ -2712,6 -2921,7 +2923,7 @@@ int blk_mq_update_nr_requests(struct re
                return -EINVAL;
  
        blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
  
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
        if (!ret)
                q->nr_requests = nr;
  
+       blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);
  
        return ret;
@@@ -2850,7 -3061,7 +3063,7 @@@ static bool blk_mq_poll_hybrid_sleep(st
        unsigned int nsecs;
        ktime_t kt;
  
-       if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+       if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
                return false;
  
        /*
        if (!nsecs)
                return false;
  
-       set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+       rq->rq_flags |= RQF_MQ_POLL_SLEPT;
  
        /*
         * This will be replaced with the stats tracking code, using
  
        hrtimer_init_sleeper(&hs, current);
        do {
-               if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+               if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
                        break;
                set_current_state(TASK_UNINTERRUPTIBLE);
                hrtimer_start_expires(&hs.timer, mode);
@@@ -2970,12 -3181,6 +3183,6 @@@ static bool blk_mq_poll(struct request_
  
  static int __init blk_mq_init(void)
  {
-       /*
-        * See comment in block/blk.h rq_atomic_flags enum
-        */
-       BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                       (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        return 0;
diff --combined block/blk-throttle.c
index d19f416d61012ac032c49608f0afe463c948e8bc,c475f0fe3530667ce70f3771f406f8bc6944d8e7..c5a1316737331ba785a0c569aac1615994cb1996
@@@ -216,9 -216,9 +216,9 @@@ struct throtl_dat
  
        unsigned int scale;
  
-       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-       struct latency_bucket __percpu *latency_buckets;
+       struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets[2];
        unsigned long last_calculate_time;
        unsigned long filtered_latency;
  
@@@ -1510,11 -1510,21 +1510,21 @@@ static struct cftype throtl_legacy_file
                .private = (unsigned long)&blkcg_policy_throtl,
                .seq_show = blkg_print_stat_bytes,
        },
+       {
+               .name = "throttle.io_service_bytes_recursive",
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_bytes_recursive,
+       },
        {
                .name = "throttle.io_serviced",
                .private = (unsigned long)&blkcg_policy_throtl,
                .seq_show = blkg_print_stat_ios,
        },
+       {
+               .name = "throttle.io_serviced_recursive",
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_ios_recursive,
+       },
        { }     /* terminate */
  };
  
@@@ -2040,10 -2050,10 +2050,10 @@@ static void blk_throtl_update_idletime(
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  static void throtl_update_latency_buckets(struct throtl_data *td)
  {
-       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-       int i, cpu;
-       unsigned long last_latency = 0;
-       unsigned long latency;
+       struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+       int i, cpu, rw;
+       unsigned long last_latency[2] = { 0 };
+       unsigned long latency[2];
  
        if (!blk_queue_nonrot(td->queue))
                return;
        td->last_calculate_time = jiffies;
  
        memset(avg_latency, 0, sizeof(avg_latency));
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               struct latency_bucket *tmp = &td->tmp_buckets[i];
-               for_each_possible_cpu(cpu) {
-                       struct latency_bucket *bucket;
-                       /* this isn't race free, but ok in practice */
-                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
-                       tmp->total_latency += bucket[i].total_latency;
-                       tmp->samples += bucket[i].samples;
-                       bucket[i].total_latency = 0;
-                       bucket[i].samples = 0;
-               }
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+                       for_each_possible_cpu(cpu) {
+                               struct latency_bucket *bucket;
+                               /* this isn't race free, but ok in practice */
+                               bucket = per_cpu_ptr(td->latency_buckets[rw],
+                                       cpu);
+                               tmp->total_latency += bucket[i].total_latency;
+                               tmp->samples += bucket[i].samples;
+                               bucket[i].total_latency = 0;
+                               bucket[i].samples = 0;
+                       }
  
-               if (tmp->samples >= 32) {
-                       int samples = tmp->samples;
+                       if (tmp->samples >= 32) {
+                               int samples = tmp->samples;
  
-                       latency = tmp->total_latency;
+                               latency[rw] = tmp->total_latency;
  
-                       tmp->total_latency = 0;
-                       tmp->samples = 0;
-                       latency /= samples;
-                       if (latency == 0)
-                               continue;
-                       avg_latency[i].latency = latency;
+                               tmp->total_latency = 0;
+                               tmp->samples = 0;
+                               latency[rw] /= samples;
+                               if (latency[rw] == 0)
+                                       continue;
+                               avg_latency[rw][i].latency = latency[rw];
+                       }
                }
        }
  
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               if (!avg_latency[i].latency) {
-                       if (td->avg_buckets[i].latency < last_latency)
-                               td->avg_buckets[i].latency = last_latency;
-                       continue;
-               }
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       if (!avg_latency[rw][i].latency) {
+                               if (td->avg_buckets[rw][i].latency < last_latency[rw])
+                                       td->avg_buckets[rw][i].latency =
+                                               last_latency[rw];
+                               continue;
+                       }
  
-               if (!td->avg_buckets[i].valid)
-                       latency = avg_latency[i].latency;
-               else
-                       latency = (td->avg_buckets[i].latency * 7 +
-                               avg_latency[i].latency) >> 3;
+                       if (!td->avg_buckets[rw][i].valid)
+                               latency[rw] = avg_latency[rw][i].latency;
+                       else
+                               latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+                                       avg_latency[rw][i].latency) >> 3;
  
-               td->avg_buckets[i].latency = max(latency, last_latency);
-               td->avg_buckets[i].valid = true;
-               last_latency = td->avg_buckets[i].latency;
+                       td->avg_buckets[rw][i].latency = max(latency[rw],
+                               last_latency[rw]);
+                       td->avg_buckets[rw][i].valid = true;
+                       last_latency[rw] = td->avg_buckets[rw][i].latency;
+               }
        }
  
        for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
                throtl_log(&td->service_queue,
-                       "Latency bucket %d: latency=%ld, valid=%d", i,
-                       td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+                       "Latency bucket %d: read latency=%ld, read valid=%d, "
+                       "write latency=%ld, write valid=%d", i,
+                       td->avg_buckets[READ][i].latency,
+                       td->avg_buckets[READ][i].valid,
+                       td->avg_buckets[WRITE][i].latency,
+                       td->avg_buckets[WRITE][i].valid);
  }
  #else
  static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@@ -2226,7 -2247,13 +2247,7 @@@ again
  out_unlock:
        spin_unlock_irq(q->queue_lock);
  out:
 -      /*
 -       * As multiple blk-throtls may stack in the same issue path, we
 -       * don't want bios to leave with the flag set.  Clear the flag if
 -       * being issued.
 -       */
 -      if (!throttled)
 -              bio_clear_flag(bio, BIO_THROTTLED);
 +      bio_set_flag(bio, BIO_THROTTLED);
  
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
        if (throttled || !td->track_bio_latency)
@@@ -2242,16 -2269,17 +2263,17 @@@ static void throtl_track_latency(struc
        struct latency_bucket *latency;
        int index;
  
-       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+       if (!td || td->limit_index != LIMIT_LOW ||
+           !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
            !blk_queue_nonrot(td->queue))
                return;
  
        index = request_bucket_index(size);
  
-       latency = get_cpu_ptr(td->latency_buckets);
+       latency = get_cpu_ptr(td->latency_buckets[op]);
        latency[index].total_latency += time;
        latency[index].samples++;
-       put_cpu_ptr(td->latency_buckets);
+       put_cpu_ptr(td->latency_buckets[op]);
  }
  
  void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@@ -2270,6 -2298,7 +2292,7 @@@ void blk_throtl_bio_endio(struct bio *b
        unsigned long finish_time;
        unsigned long start_time;
        unsigned long lat;
+       int rw = bio_data_dir(bio);
  
        tg = bio->bi_cg_private;
        if (!tg)
  
                bucket = request_bucket_index(
                        blk_stat_size(&bio->bi_issue_stat));
-               threshold = tg->td->avg_buckets[bucket].latency +
+               threshold = tg->td->avg_buckets[rw][bucket].latency +
                        tg->latency_target;
                if (lat > threshold)
                        tg->bad_bio_cnt++;
@@@ -2391,9 -2420,16 +2414,16 @@@ int blk_throtl_init(struct request_queu
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;
-       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+       td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
                LATENCY_BUCKET_SIZE, __alignof__(u64));
-       if (!td->latency_buckets) {
+       if (!td->latency_buckets[READ]) {
+               kfree(td);
+               return -ENOMEM;
+       }
+       td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets[WRITE]) {
+               free_percpu(td->latency_buckets[READ]);
                kfree(td);
                return -ENOMEM;
        }
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
        if (ret) {
-               free_percpu(td->latency_buckets);
+               free_percpu(td->latency_buckets[READ]);
+               free_percpu(td->latency_buckets[WRITE]);
                kfree(td);
        }
        return ret;
@@@ -2423,7 -2460,8 +2454,8 @@@ void blk_throtl_exit(struct request_que
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
-       free_percpu(q->td->latency_buckets);
+       free_percpu(q->td->latency_buckets[READ]);
+       free_percpu(q->td->latency_buckets[WRITE]);
        kfree(q->td);
  }
  
@@@ -2441,15 -2479,17 +2473,17 @@@ void blk_throtl_register_queue(struct r
        } else {
                td->throtl_slice = DFL_THROTL_SLICE_HD;
                td->filtered_latency = LATENCY_FILTERED_HD;
-               for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
-                       td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+                       td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+               }
        }
  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
        /* if no low limit, use previous default */
        td->throtl_slice = DFL_THROTL_SLICE_HD;
  #endif
  
-       td->track_bio_latency = !q->mq_ops && !q->request_fn;
+       td->track_bio_latency = !queue_is_rq_based(q);
        if (!td->track_bio_latency)
                blk_stat_enable_accounting(q);
  }
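
The blk-throttle hunks above split the LIMIT_LOW latency tracking into separate READ and WRITE bucket tables, indexed by request direction, presumably so that the direction with naturally higher latency does not dominate a shared average. A small user-space model of that bookkeeping, with the kernel's per-CPU machinery and the real request_bucket_index() replaced by illustrative stand-ins:

#include <stdio.h>

#define LATENCY_BUCKET_SIZE	9	/* illustrative bucket count */
enum { READ = 0, WRITE = 1 };		/* mirrors the kernel's data directions */

struct latency_bucket {
	unsigned long total_latency;
	int samples;
};

/* One bucket table per direction, as in the reworked struct throtl_data. */
static struct latency_bucket buckets[2][LATENCY_BUCKET_SIZE];

/* Stand-in for request_bucket_index(): map an I/O size to a bucket. */
static int bucket_index(unsigned int size)
{
	int idx = size >> 12;	/* pretend: one bucket per 4K step */

	return idx < LATENCY_BUCKET_SIZE ? idx : LATENCY_BUCKET_SIZE - 1;
}

static void track_latency(int rw, unsigned int size, unsigned long time)
{
	struct latency_bucket *b = &buckets[rw][bucket_index(size)];

	b->total_latency += time;
	b->samples++;
}

int main(void)
{
	track_latency(READ, 4096, 120);
	track_latency(WRITE, 4096, 480);
	printf("read avg=%lu write avg=%lu\n",
	       buckets[READ][0].total_latency / buckets[READ][0].samples,
	       buckets[WRITE][0].total_latency / buckets[WRITE][0].samples);
	return 0;
}
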
diff --combined block/blk.h
index 442098aa9463a37dad0dfccb1718eea65be6cdb3,b1771851ed92e4da9803f0efdc6f54719d36f39e..46db5dc83dcb4091ea03c045b27968543c0a699b
@@@ -119,34 -119,24 +119,24 @@@ void blk_account_io_start(struct reques
  void blk_account_io_completion(struct request *req, unsigned int bytes);
  void blk_account_io_done(struct request *req);
  
- /*
-  * Internal atomic flags for request handling
-  */
- enum rq_atomic_flags {
-       /*
-        * Keep these two bits first - not because we depend on the
-        * value of them, but we do depend on them being in the same
-        * byte of storage to ensure ordering on writes. Keeping them
-        * first will achieve that nicely.
-        */
-       REQ_ATOM_COMPLETE = 0,
-       REQ_ATOM_STARTED,
-       REQ_ATOM_POLL_SLEPT,
- };
  /*
   * EH timer and IO completion will both attempt to 'grab' the request, make
-  * sure that only one of them succeeds
+  * sure that only one of them succeeds. Steal the bottom bit of the
+  * __deadline field for this.
   */
  static inline int blk_mark_rq_complete(struct request *rq)
  {
-       return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+       return test_and_set_bit(0, &rq->__deadline);
  }
  
  static inline void blk_clear_rq_complete(struct request *rq)
  {
-       clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+       clear_bit(0, &rq->__deadline);
+ }
+ static inline bool blk_rq_is_complete(struct request *rq)
+ {
+       return test_bit(0, &rq->__deadline);
  }
  
  /*
@@@ -172,6 -162,9 +162,9 @@@ static inline void elv_deactivate_rq(st
                e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
  }
  
+ int elv_register_queue(struct request_queue *q);
+ void elv_unregister_queue(struct request_queue *q);
  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
  
  #ifdef CONFIG_FAIL_IO_TIMEOUT
@@@ -245,6 -238,21 +238,21 @@@ static inline void req_set_nomerge(stru
                q->last_merge = NULL;
  }
  
+ /*
+  * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+  * setting the deadline clears the bottom bit, potentially clearing the
+  * completed bit. The user has to be OK with this (current ones are fine).
+  */
+ static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+ {
+       rq->__deadline = time & ~0x1UL;
+ }
+ static inline unsigned long blk_rq_deadline(struct request *rq)
+ {
+       return rq->__deadline & ~0x1UL;
+ }
  /*
   * Internal io_context interface
   */
@@@ -330,6 -338,4 +338,6 @@@ static inline void blk_queue_bounce(str
  }
  #endif /* CONFIG_BOUNCE */
  
 +extern void blk_drain_queue(struct request_queue *q);
 +
  #endif /* BLK_INTERNAL_H */
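
The block/blk.h hunks above retire the per-request atomic flag word and instead steal bit 0 of __deadline: that bit marks completion, and the deadline accessors mask it off. A compact user-space model of the encoding (plain C, no kernel atomics; the real code uses test_and_set_bit()/clear_bit() on the same word):

#include <assert.h>
#include <stdbool.h>

/* Bit 0 of the deadline word doubles as the "completed" marker; the
 * deadline itself is always stored with that bit cleared. */
struct fake_request {
	unsigned long __deadline;
};

static void set_deadline(struct fake_request *rq, unsigned long time)
{
	rq->__deadline = time & ~0x1UL;	/* may clear a stale complete bit */
}

static unsigned long get_deadline(const struct fake_request *rq)
{
	return rq->__deadline & ~0x1UL;
}

static bool mark_complete(struct fake_request *rq)
{
	bool was_set = rq->__deadline & 0x1UL;

	rq->__deadline |= 0x1UL;	/* kernel: test_and_set_bit(0, ...) */
	return was_set;
}

int main(void)
{
	struct fake_request rq = { 0 };

	set_deadline(&rq, 1000);
	assert(!mark_complete(&rq));	/* first grab wins */
	assert(mark_complete(&rq));	/* second grab sees the bit */
	assert(get_deadline(&rq) == 1000);
	return 0;
}
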
diff --combined block/bounce.c
index 1d05c422c932ad56d705f94deed6cce0891ff9d3,c35a3d7f05281e95822e26b98b19eedb275475a7..6a3e68292273b03d3aa12a82a4a5af4979e95d60
@@@ -113,45 -113,50 +113,50 @@@ int init_emergency_isa_pool(void
  static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
  {
        unsigned char *vfrom;
-       struct bio_vec tovec, *fromvec = from->bi_io_vec;
+       struct bio_vec tovec, fromvec;
        struct bvec_iter iter;
+       /*
+        * The bio of @from is created by bounce, so we can iterate
+        * its bvec from start to end, but the @from->bi_iter can't be
+        * trusted because it might be changed by splitting.
+        */
+       struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
  
        bio_for_each_segment(tovec, to, iter) {
-               if (tovec.bv_page != fromvec->bv_page) {
+               fromvec = bio_iter_iovec(from, from_iter);
+               if (tovec.bv_page != fromvec.bv_page) {
                        /*
                         * fromvec->bv_offset and fromvec->bv_len might have
                         * been modified by the block layer, so use the original
                         * copy, bounce_copy_vec already uses tovec->bv_len
                         */
-                       vfrom = page_address(fromvec->bv_page) +
+                       vfrom = page_address(fromvec.bv_page) +
                                tovec.bv_offset;
  
                        bounce_copy_vec(&tovec, vfrom);
                        flush_dcache_page(tovec.bv_page);
                }
-               fromvec++;
+               bio_advance_iter(from, &from_iter, tovec.bv_len);
        }
  }
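
The bounce.c changes stop indexing bi_io_vec directly and instead walk the source bio with a bvec iterator, which is the preparation needed for multipage bvecs. For reference, the standard iterator idiom looks like the sketch below; bounce itself cannot start from from->bi_iter (as the comment above explains), so it seeds a fresh iterator with BVEC_ITER_ALL_INIT instead of using bio_for_each_segment():

/* Minimal sketch of the bvec iterator idiom (kernel context,
 * <linux/bio.h>): visit every segment of @bio without ever touching
 * bio->bi_io_vec directly. */
static unsigned int count_bio_bytes(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int bytes = 0;

	bio_for_each_segment(bv, bio, iter)
		bytes += bv.bv_len;	/* bv is a copy, safe to inspect */

	return bytes;
}
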
  
  static void bounce_end_io(struct bio *bio, mempool_t *pool)
  {
        struct bio *bio_orig = bio->bi_private;
-       struct bio_vec *bvec, *org_vec;
+       struct bio_vec *bvec, orig_vec;
        int i;
-       int start = bio_orig->bi_iter.bi_idx;
+       struct bvec_iter orig_iter = bio_orig->bi_iter;
  
        /*
         * free up bounce indirect pages used
         */
        bio_for_each_segment_all(bvec, bio, i) {
-               org_vec = bio_orig->bi_io_vec + i + start;
-               if (bvec->bv_page == org_vec->bv_page)
-                       continue;
-               dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-               mempool_free(bvec->bv_page, pool);
+               orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+               if (bvec->bv_page != orig_vec.bv_page) {
+                       dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+                       mempool_free(bvec->bv_page, pool);
+               }
+               bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
        }
  
        bio_orig->bi_status = bio->bi_status;
@@@ -200,7 -205,6 +205,7 @@@ static void __blk_queue_bounce(struct r
        unsigned i = 0;
        bool bounce = false;
        int sectors = 0;
 +      bool passthrough = bio_is_passthrough(*bio_orig);
  
        bio_for_each_segment(from, *bio_orig, iter) {
                if (i++ < BIO_MAX_PAGES)
        if (!bounce)
                return;
  
 -      if (sectors < bio_sectors(*bio_orig)) {
 +      if (!passthrough && sectors < bio_sectors(*bio_orig)) {
                bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
                bio_chain(bio, *bio_orig);
                generic_make_request(*bio_orig);
                *bio_orig = bio;
        }
 -      bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
 +      bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
 +                      bounce_bio_set);
  
        bio_for_each_segment_all(to, bio, i) {
                struct page *page = to->bv_page;
diff --combined drivers/block/null_blk.c
index ad0477ae820f040affe54f4368d3a02d9da63350,5b94e530570c44d7552df2075c3ef2aebe11dfc1..6655893a3a7a8365a5feb4f035b65021d38f3847
@@@ -12,9 -12,9 +12,9 @@@
  #include <linux/slab.h>
  #include <linux/blk-mq.h>
  #include <linux/hrtimer.h>
- #include <linux/lightnvm.h>
  #include <linux/configfs.h>
  #include <linux/badblocks.h>
+ #include <linux/fault-inject.h>
  
  #define SECTOR_SHIFT          9
  #define PAGE_SECTORS_SHIFT    (PAGE_SHIFT - SECTOR_SHIFT)
  #define TICKS_PER_SEC         50ULL
  #define TIMER_INTERVAL                (NSEC_PER_SEC / TICKS_PER_SEC)
  
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static DECLARE_FAULT_ATTR(null_timeout_attr);
+ #endif
  static inline u64 mb_per_tick(int mbps)
  {
        return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
  struct nullb_cmd {
        struct list_head list;
        struct llist_node ll_list;
 -      call_single_data_t csd;
 +      struct __call_single_data csd;
        struct request *rq;
        struct bio *bio;
        unsigned int tag;
 +      blk_status_t error;
        struct nullb_queue *nq;
        struct hrtimer timer;
 -      blk_status_t error;
  };
  
  struct nullb_queue {
@@@ -107,7 -111,6 +111,6 @@@ struct nullb_device 
        unsigned int hw_queue_depth; /* queue depth */
        unsigned int index; /* index of the disk, only valid with a disk */
        unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
-       bool use_lightnvm; /* register as a LightNVM device */
        bool blocking; /* blocking blk-mq device */
        bool use_per_node_hctx; /* use per-node allocation for hardware context */
        bool power; /* power on/off the device */
@@@ -121,7 -124,6 +124,6 @@@ struct nullb 
        unsigned int index;
        struct request_queue *q;
        struct gendisk *disk;
-       struct nvm_dev *ndev;
        struct blk_mq_tag_set *tag_set;
        struct blk_mq_tag_set __tag_set;
        unsigned int queue_depth;
@@@ -139,7 -141,6 +141,6 @@@ static LIST_HEAD(nullb_list)
  static struct mutex lock;
  static int null_major;
  static DEFINE_IDA(nullb_indexes);
- static struct kmem_cache *ppa_cache;
  static struct blk_mq_tag_set tag_set;
  
  enum {
@@@ -166,6 -167,11 +167,11 @@@ static int g_home_node = NUMA_NO_NODE
  module_param_named(home_node, g_home_node, int, S_IRUGO);
  MODULE_PARM_DESC(home_node, "Home node for the device");
  
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static char g_timeout_str[80];
+ module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+ #endif
  static int g_queue_mode = NULL_Q_MQ;
  
  static int null_param_store_val(const char *str, int *val, int min, int max)
@@@ -208,10 -214,6 +214,6 @@@ static int nr_devices = 1
  module_param(nr_devices, int, S_IRUGO);
  MODULE_PARM_DESC(nr_devices, "Number of devices to register");
  
- static bool g_use_lightnvm;
- module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
- MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
  static bool g_blocking;
  module_param_named(blocking, g_blocking, bool, S_IRUGO);
  MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@@@ -345,7 -347,6 +347,6 @@@ NULLB_DEVICE_ATTR(blocksize, uint)
  NULLB_DEVICE_ATTR(irqmode, uint);
  NULLB_DEVICE_ATTR(hw_queue_depth, uint);
  NULLB_DEVICE_ATTR(index, uint);
- NULLB_DEVICE_ATTR(use_lightnvm, bool);
  NULLB_DEVICE_ATTR(blocking, bool);
  NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
  NULLB_DEVICE_ATTR(memory_backed, bool);
@@@ -455,7 -456,6 +456,6 @@@ static struct configfs_attribute *nullb
        &nullb_device_attr_irqmode,
        &nullb_device_attr_hw_queue_depth,
        &nullb_device_attr_index,
-       &nullb_device_attr_use_lightnvm,
        &nullb_device_attr_blocking,
        &nullb_device_attr_use_per_node_hctx,
        &nullb_device_attr_power,
@@@ -573,7 -573,6 +573,6 @@@ static struct nullb_device *null_alloc_
        dev->blocksize = g_bs;
        dev->irqmode = g_irqmode;
        dev->hw_queue_depth = g_hw_queue_depth;
-       dev->use_lightnvm = g_use_lightnvm;
        dev->blocking = g_blocking;
        dev->use_per_node_hctx = g_use_per_node_hctx;
        return dev;
@@@ -1352,6 -1351,12 +1351,12 @@@ static blk_qc_t null_queue_bio(struct r
        return BLK_QC_T_NONE;
  }
  
+ static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
+ {
+       pr_info("null: rq %p timed out\n", rq);
+       return BLK_EH_HANDLED;
+ }
  static int null_rq_prep_fn(struct request_queue *q, struct request *req)
  {
        struct nullb *nullb = q->queuedata;
        return BLKPREP_DEFER;
  }
  
+ static bool should_timeout_request(struct request *rq)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (g_timeout_str[0])
+               return should_fail(&null_timeout_attr, 1);
+ #endif
+       return false;
+ }
  static void null_request_fn(struct request_queue *q)
  {
        struct request *rq;
        while ((rq = blk_fetch_request(q)) != NULL) {
                struct nullb_cmd *cmd = rq->special;
  
-               spin_unlock_irq(q->queue_lock);
-               null_handle_cmd(cmd);
-               spin_lock_irq(q->queue_lock);
+               if (!should_timeout_request(rq)) {
+                       spin_unlock_irq(q->queue_lock);
+                       null_handle_cmd(cmd);
+                       spin_lock_irq(q->queue_lock);
+               }
        }
  }
  
+ static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+ {
+       pr_info("null: rq %p timed out\n", rq);
+       return BLK_EH_HANDLED;
+ }
  static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
  {
  
        blk_mq_start_request(bd->rq);
  
-       return null_handle_cmd(cmd);
+       if (!should_timeout_request(bd->rq))
+               return null_handle_cmd(cmd);
+       return BLK_STS_OK;
  }
  
  static const struct blk_mq_ops null_mq_ops = {
        .queue_rq       = null_queue_rq,
        .complete       = null_softirq_done_fn,
+       .timeout        = null_timeout_rq,
  };
  
  static void cleanup_queue(struct nullb_queue *nq)
@@@ -1423,170 -1450,6 +1450,6 @@@ static void cleanup_queues(struct null
        kfree(nullb->queues);
  }
  
- #ifdef CONFIG_NVM
- static void null_lnvm_end_io(struct request *rq, blk_status_t status)
- {
-       struct nvm_rq *rqd = rq->end_io_data;
-       /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
-       rqd->error = status ? -EIO : 0;
-       nvm_end_io(rqd);
-       blk_put_request(rq);
- }
- static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
- {
-       struct request_queue *q = dev->q;
-       struct request *rq;
-       struct bio *bio = rqd->bio;
-       rq = blk_mq_alloc_request(q,
-               op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
-       if (IS_ERR(rq))
-               return -ENOMEM;
-       blk_init_request_from_bio(rq, bio);
-       rq->end_io_data = rqd;
-       blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
-       return 0;
- }
- static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
- {
-       struct nullb *nullb = dev->q->queuedata;
-       sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
-       sector_t blksize;
-       struct nvm_id_group *grp;
-       id->ver_id = 0x1;
-       id->vmnt = 0;
-       id->cap = 0x2;
-       id->dom = 0x1;
-       id->ppaf.blk_offset = 0;
-       id->ppaf.blk_len = 16;
-       id->ppaf.pg_offset = 16;
-       id->ppaf.pg_len = 16;
-       id->ppaf.sect_offset = 32;
-       id->ppaf.sect_len = 8;
-       id->ppaf.pln_offset = 40;
-       id->ppaf.pln_len = 8;
-       id->ppaf.lun_offset = 48;
-       id->ppaf.lun_len = 8;
-       id->ppaf.ch_offset = 56;
-       id->ppaf.ch_len = 8;
-       sector_div(size, nullb->dev->blocksize); /* convert size to pages */
-       size >>= 8; /* concert size to pgs pr blk */
-       grp = &id->grp;
-       grp->mtype = 0;
-       grp->fmtype = 0;
-       grp->num_ch = 1;
-       grp->num_pg = 256;
-       blksize = size;
-       size >>= 16;
-       grp->num_lun = size + 1;
-       sector_div(blksize, grp->num_lun);
-       grp->num_blk = blksize;
-       grp->num_pln = 1;
-       grp->fpg_sz = nullb->dev->blocksize;
-       grp->csecs = nullb->dev->blocksize;
-       grp->trdt = 25000;
-       grp->trdm = 25000;
-       grp->tprt = 500000;
-       grp->tprm = 500000;
-       grp->tbet = 1500000;
-       grp->tbem = 1500000;
-       grp->mpos = 0x010101; /* single plane rwe */
-       grp->cpar = nullb->dev->hw_queue_depth;
-       return 0;
- }
- static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
- {
-       mempool_t *virtmem_pool;
-       virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
-       if (!virtmem_pool) {
-               pr_err("null_blk: Unable to create virtual memory pool\n");
-               return NULL;
-       }
-       return virtmem_pool;
- }
- static void null_lnvm_destroy_dma_pool(void *pool)
- {
-       mempool_destroy(pool);
- }
- static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
-                               gfp_t mem_flags, dma_addr_t *dma_handler)
- {
-       return mempool_alloc(pool, mem_flags);
- }
- static void null_lnvm_dev_dma_free(void *pool, void *entry,
-                                                       dma_addr_t dma_handler)
- {
-       mempool_free(entry, pool);
- }
- static struct nvm_dev_ops null_lnvm_dev_ops = {
-       .identity               = null_lnvm_id,
-       .submit_io              = null_lnvm_submit_io,
-       .create_dma_pool        = null_lnvm_create_dma_pool,
-       .destroy_dma_pool       = null_lnvm_destroy_dma_pool,
-       .dev_dma_alloc          = null_lnvm_dev_dma_alloc,
-       .dev_dma_free           = null_lnvm_dev_dma_free,
-       /* Simulate nvme protocol restriction */
-       .max_phys_sect          = 64,
- };
- static int null_nvm_register(struct nullb *nullb)
- {
-       struct nvm_dev *dev;
-       int rv;
-       dev = nvm_alloc_dev(0);
-       if (!dev)
-               return -ENOMEM;
-       dev->q = nullb->q;
-       memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
-       dev->ops = &null_lnvm_dev_ops;
-       rv = nvm_register(dev);
-       if (rv) {
-               kfree(dev);
-               return rv;
-       }
-       nullb->ndev = dev;
-       return 0;
- }
- static void null_nvm_unregister(struct nullb *nullb)
- {
-       nvm_unregister(nullb->ndev);
- }
- #else
- static int null_nvm_register(struct nullb *nullb)
- {
-       pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
-       return -EINVAL;
- }
- static void null_nvm_unregister(struct nullb *nullb) {}
- #endif /* CONFIG_NVM */
  static void null_del_dev(struct nullb *nullb)
  {
        struct nullb_device *dev = nullb->dev;
  
        list_del_init(&nullb->list);
  
-       if (dev->use_lightnvm)
-               null_nvm_unregister(nullb);
-       else
-               del_gendisk(nullb->disk);
+       del_gendisk(nullb->disk);
  
        if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
                hrtimer_cancel(&nullb->bw_timer);
        if (dev->queue_mode == NULL_Q_MQ &&
            nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);
-       if (!dev->use_lightnvm)
-               put_disk(nullb->disk);
+       put_disk(nullb->disk);
        cleanup_queues(nullb);
        if (null_cache_active(nullb))
                null_free_device_storage(nullb->dev, true);
@@@ -1775,11 -1634,6 +1634,6 @@@ static void null_validate_conf(struct n
  {
        dev->blocksize = round_down(dev->blocksize, 512);
        dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
-       if (dev->use_lightnvm && dev->blocksize != 4096)
-               dev->blocksize = 4096;
-       if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
-               dev->queue_mode = NULL_Q_MQ;
  
        if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
                if (dev->submit_queues != nr_online_nodes)
                dev->mbps = 0;
  }
  
+ static bool null_setup_fault(void)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (!g_timeout_str[0])
+               return true;
+       if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+               return false;
+       null_timeout_attr.verbose = 0;
+ #endif
+       return true;
+ }
  static int null_add_dev(struct nullb_device *dev)
  {
        struct nullb *nullb;
                if (rv)
                        goto out_cleanup_queues;
  
+               if (!null_setup_fault())
+                       goto out_cleanup_queues;
+               nullb->tag_set->timeout = 5 * HZ;
                nullb->q = blk_mq_init_queue(nullb->tag_set);
                if (IS_ERR(nullb->q)) {
                        rv = -ENOMEM;
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                }
+               if (!null_setup_fault())
+                       goto out_cleanup_blk_queue;
                blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
                blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+               blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
+               nullb->q->rq_timeout = 5 * HZ;
                rv = init_driver_queues(nullb);
                if (rv)
                        goto out_cleanup_blk_queue;
  
        sprintf(nullb->disk_name, "nullb%d", nullb->index);
  
-       if (dev->use_lightnvm)
-               rv = null_nvm_register(nullb);
-       else
-               rv = null_gendisk_register(nullb);
+       rv = null_gendisk_register(nullb);
        if (rv)
                goto out_cleanup_blk_queue;
  
@@@ -1938,18 -1812,6 +1812,6 @@@ static int __init null_init(void
                g_bs = PAGE_SIZE;
        }
  
-       if (g_use_lightnvm && g_bs != 4096) {
-               pr_warn("null_blk: LightNVM only supports 4k block size\n");
-               pr_warn("null_blk: defaults block size to 4k\n");
-               g_bs = 4096;
-       }
-       if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
-               pr_warn("null_blk: LightNVM only supported for blk-mq\n");
-               pr_warn("null_blk: defaults queue mode to blk-mq\n");
-               g_queue_mode = NULL_Q_MQ;
-       }
        if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
                if (g_submit_queues != nr_online_nodes) {
                        pr_warn("null_blk: submit_queues param is set to %u.\n",
                goto err_conf;
        }
  
-       if (g_use_lightnvm) {
-               ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
-                                                               0, 0, NULL);
-               if (!ppa_cache) {
-                       pr_err("null_blk: unable to create ppa cache\n");
-                       ret = -ENOMEM;
-                       goto err_ppa;
-               }
-       }
        for (i = 0; i < nr_devices; i++) {
                dev = null_alloc_dev();
                if (!dev) {
@@@ -2015,8 -1867,6 +1867,6 @@@ err_dev
                null_del_dev(nullb);
                null_free_dev(dev);
        }
-       kmem_cache_destroy(ppa_cache);
- err_ppa:
        unregister_blkdev(null_major, "nullb");
  err_conf:
        configfs_unregister_subsystem(&nullb_subsys);
@@@ -2047,8 -1897,6 +1897,6 @@@ static void __exit null_exit(void
  
        if (g_queue_mode == NULL_Q_MQ && shared_tags)
                blk_mq_free_tag_set(&tag_set);
-       kmem_cache_destroy(ppa_cache);
  }
  
  module_init(null_init);
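
The null_blk changes above replace the LightNVM glue with a fault-injection hook for request timeouts: a timeout= module parameter is parsed by setup_fault_attr() and consulted per request via should_fail(). The parameter should take the usual fault-attr tuple (<interval>,<probability>,<space>,<times>), so loading the module with timeout="1,100,0,-1" would make every request time out. A generic sketch of that wiring, with illustrative names rather than the driver's own:

#include <linux/fault-inject.h>
#include <linux/module.h>

/* Sketch of the fault-injection hook pattern; names are illustrative. */
static DECLARE_FAULT_ATTR(example_fail_attr);

static char example_fail_str[80];
module_param_string(fail, example_fail_str, sizeof(example_fail_str), 0444);

static bool example_should_fail(void)
{
	/* Only consult the attribute once it has been configured. */
	if (!example_fail_str[0])
		return false;
	return should_fail(&example_fail_attr, 1);
}

static int __init example_init(void)
{
	if (example_fail_str[0] &&
	    !setup_fault_attr(&example_fail_attr, example_fail_str))
		return -EINVAL;

	pr_info("example: would fail next op: %d\n", example_should_fail());
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");
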
diff --combined drivers/md/dm-crypt.c
index 554d60394c0663980d89c3bb84a48007f365470b,48332666fc38494fe813429b4fe450a2cadb3f2e..2ad429100d25df57974f2981e8b6cdcd587b7af6
@@@ -1446,7 -1446,6 +1446,6 @@@ static void crypt_free_buffer_pages(str
        bio_for_each_segment_all(bv, clone, i) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, cc->page_pool);
-               bv->bv_page = NULL;
        }
  }
  
@@@ -1954,15 -1953,10 +1953,15 @@@ static int crypt_setkey(struct crypt_co
        /* Ignore extra keys (which are used for IV etc) */
        subkey_size = crypt_subkey_size(cc);
  
 -      if (crypt_integrity_hmac(cc))
 +      if (crypt_integrity_hmac(cc)) {
 +              if (subkey_size < cc->key_mac_size)
 +                      return -EINVAL;
 +
                crypt_copy_authenckey(cc->authenc_key, cc->key,
                                      subkey_size - cc->key_mac_size,
                                      cc->key_mac_size);
 +      }
 +
        for (i = 0; i < cc->tfms_count; i++) {
                if (crypt_integrity_hmac(cc))
                        r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
@@@ -2058,6 -2052,9 +2057,6 @@@ static int crypt_set_keyring_key(struc
  
        ret = crypt_setkey(cc);
  
 -      /* wipe the kernel key payload copy in each case */
 -      memset(cc->key, 0, cc->key_size * sizeof(u8));
 -
        if (!ret) {
                set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
                kzfree(cc->key_string);
@@@ -2525,10 -2522,6 +2524,10 @@@ static int crypt_ctr_cipher(struct dm_t
                }
        }
  
 +      /* wipe the kernel key payload copy */
 +      if (cc->key_string)
 +              memset(cc->key, 0, cc->key_size * sizeof(u8));
 +
        return ret;
  }
  
@@@ -2746,7 -2739,6 +2745,7 @@@ static int crypt_ctr(struct dm_target *
                        cc->tag_pool_max_sectors * cc->on_disk_tag_size);
                if (!cc->tag_pool) {
                        ti->error = "Cannot allocate integrity tags mempool";
 +                      ret = -ENOMEM;
                        goto bad;
                }
  
@@@ -2968,9 -2960,6 +2967,9 @@@ static int crypt_message(struct dm_targ
                                return ret;
                        if (cc->iv_gen_ops && cc->iv_gen_ops->init)
                                ret = cc->iv_gen_ops->init(cc);
 +                      /* wipe the kernel key payload copy */
 +                      if (cc->key_string)
 +                              memset(cc->key, 0, cc->key_size * sizeof(u8));
                        return ret;
                }
                if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
@@@ -3017,7 -3006,7 +3016,7 @@@ static void crypt_io_hints(struct dm_ta
  
  static struct target_type crypt_target = {
        .name   = "crypt",
 -      .version = {1, 18, 0},
 +      .version = {1, 18, 1},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
diff --combined drivers/nvme/host/core.c
index 839650e0926af1aaaf21bafbdc1baa79cf907d76,b3af8e914570ab44fad02bcd8ba9bcbb065670d4..e8104871cbbf753a19f4d6915601b81e3350d686
@@@ -29,6 -29,9 +29,9 @@@
  #include <linux/pm_qos.h>
  #include <asm/unaligned.h>
  
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
  #include "nvme.h"
  #include "fabrics.h"
  
@@@ -65,9 -68,26 +68,26 @@@ static bool streams
  module_param(streams, bool, 0644);
  MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
  
+ /*
+  * nvme_wq - hosts nvme related works that are not reset or delete
+  * nvme_reset_wq - hosts nvme reset works
+  * nvme_delete_wq - hosts nvme delete works
+  *
+  * nvme_wq will host works such as scan, aen handling, fw activation,
+  * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+  * runs reset works which also flush works hosted on nvme_wq for
+  * serialization purposes. nvme_delete_wq hosts controller deletion
+  * works which flush reset works for serialization.
+  */
  struct workqueue_struct *nvme_wq;
  EXPORT_SYMBOL_GPL(nvme_wq);
  
+ struct workqueue_struct *nvme_reset_wq;
+ EXPORT_SYMBOL_GPL(nvme_reset_wq);
+ struct workqueue_struct *nvme_delete_wq;
+ EXPORT_SYMBOL_GPL(nvme_delete_wq);
  static DEFINE_IDA(nvme_subsystems_ida);
  static LIST_HEAD(nvme_subsystems);
  static DEFINE_MUTEX(nvme_subsystems_lock);
@@@ -89,13 -109,13 +109,13 @@@ int nvme_reset_ctrl(struct nvme_ctrl *c
  {
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
                return -EBUSY;
-       if (!queue_work(nvme_wq, &ctrl->reset_work))
+       if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
                return -EBUSY;
        return 0;
  }
  EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
  
- static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
  {
        int ret;
  
                flush_work(&ctrl->reset_work);
        return ret;
  }
+ EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
  
  static void nvme_delete_ctrl_work(struct work_struct *work)
  {
@@@ -122,7 -143,7 +143,7 @@@ int nvme_delete_ctrl(struct nvme_ctrl *
  {
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
                return -EBUSY;
-       if (!queue_work(nvme_wq, &ctrl->delete_work))
+       if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
                return -EBUSY;
        return 0;
  }
@@@ -157,13 -178,20 +178,20 @@@ static blk_status_t nvme_error_status(s
                return BLK_STS_OK;
        case NVME_SC_CAP_EXCEEDED:
                return BLK_STS_NOSPC;
+       case NVME_SC_LBA_RANGE:
+               return BLK_STS_TARGET;
+       case NVME_SC_BAD_ATTRIBUTES:
        case NVME_SC_ONCS_NOT_SUPPORTED:
+       case NVME_SC_INVALID_OPCODE:
+       case NVME_SC_INVALID_FIELD:
+       case NVME_SC_INVALID_NS:
                return BLK_STS_NOTSUPP;
        case NVME_SC_WRITE_FAULT:
        case NVME_SC_READ_ERROR:
        case NVME_SC_UNWRITTEN_BLOCK:
        case NVME_SC_ACCESS_DENIED:
        case NVME_SC_READ_ONLY:
+       case NVME_SC_COMPARE_FAILED:
                return BLK_STS_MEDIUM;
        case NVME_SC_GUARD_CHECK:
        case NVME_SC_APPTAG_CHECK:
@@@ -190,8 -218,12 +218,12 @@@ static inline bool nvme_req_needs_retry
  
  void nvme_complete_rq(struct request *req)
  {
-       if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
-               if (nvme_req_needs_failover(req)) {
+       blk_status_t status = nvme_error_status(req);
+       trace_nvme_complete_rq(req);
+       if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
+               if (nvme_req_needs_failover(req, status)) {
                        nvme_failover_req(req);
                        return;
                }
                        return;
                }
        }
-       blk_mq_end_request(req, nvme_error_status(req));
+       blk_mq_end_request(req, status);
  }
  EXPORT_SYMBOL_GPL(nvme_complete_rq);
  
@@@ -232,6 -263,15 +263,15 @@@ bool nvme_change_ctrl_state(struct nvme
  
        old_state = ctrl->state;
        switch (new_state) {
+       case NVME_CTRL_ADMIN_ONLY:
+               switch (old_state) {
+               case NVME_CTRL_RECONNECTING:
+                       changed = true;
+                       /* FALLTHRU */
+               default:
+                       break;
+               }
+               break;
        case NVME_CTRL_LIVE:
                switch (old_state) {
                case NVME_CTRL_NEW:
                switch (old_state) {
                case NVME_CTRL_NEW:
                case NVME_CTRL_LIVE:
+               case NVME_CTRL_ADMIN_ONLY:
                        changed = true;
                        /* FALLTHRU */
                default:
        case NVME_CTRL_DELETING:
                switch (old_state) {
                case NVME_CTRL_LIVE:
+               case NVME_CTRL_ADMIN_ONLY:
                case NVME_CTRL_RESETTING:
                case NVME_CTRL_RECONNECTING:
                        changed = true;
@@@ -591,6 -633,10 +633,10 @@@ blk_status_t nvme_setup_cmd(struct nvme
        }
  
        cmd->common.command_id = req->tag;
+       if (ns)
+               trace_nvme_setup_nvm_cmd(req->q->id, cmd);
+       else
+               trace_nvme_setup_admin_cmd(cmd);
        return ret;
  }
  EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@@ -1217,16 -1263,27 +1263,27 @@@ static int nvme_open(struct block_devic
  #ifdef CONFIG_NVME_MULTIPATH
        /* should never be called due to GENHD_FL_HIDDEN */
        if (WARN_ON_ONCE(ns->head->disk))
-               return -ENXIO;
+               goto fail;
  #endif
        if (!kref_get_unless_zero(&ns->kref))
-               return -ENXIO;
+               goto fail;
+       if (!try_module_get(ns->ctrl->ops->module))
+               goto fail_put_ns;
        return 0;
+ fail_put_ns:
+       nvme_put_ns(ns);
+ fail:
+       return -ENXIO;
  }
  
  static void nvme_release(struct gendisk *disk, fmode_t mode)
  {
-       nvme_put_ns(disk->private_data);
+       struct nvme_ns *ns = disk->private_data;
+       module_put(ns->ctrl->ops->module);
+       nvme_put_ns(ns);
  }
  
  static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@@ -1287,7 -1344,7 +1344,7 @@@ static void nvme_config_discard(struct 
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
  
 -      queue->limits.discard_alignment = size;
 +      queue->limits.discard_alignment = 0;
        queue->limits.discard_granularity = size;
  
        blk_queue_max_discard_sectors(queue, UINT_MAX);
@@@ -1335,7 -1392,6 +1392,7 @@@ static void nvme_update_disk_info(struc
                struct nvme_ns *ns, struct nvme_id_ns *id)
  {
        sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
 +      unsigned short bs = 1 << ns->lba_shift;
        unsigned stream_alignment = 0;
  
        if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
        blk_mq_freeze_queue(disk->queue);
        blk_integrity_unregister(disk);
  
 -      blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
 +      blk_queue_logical_block_size(disk->queue, bs);
 +      blk_queue_physical_block_size(disk->queue, bs);
 +      blk_queue_io_min(disk->queue, bs);
 +
        if (ns->ms && !ns->ext &&
            (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
                nvme_init_integrity(disk, ns->ms, ns->pi_type);
@@@ -1709,8 -1762,7 +1766,8 @@@ static void nvme_set_queue_limits(struc
                blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
                blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
        }
 -      if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
 +      if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 +          is_power_of_2(ctrl->max_hw_sectors))
                blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
        blk_queue_virt_boundary(q, ctrl->page_size - 1);
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
@@@ -2052,6 -2104,22 +2109,22 @@@ static const struct attribute_group *nv
        NULL,
  };
  
+ static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+ {
+       int count = 0;
+       struct nvme_ctrl *ctrl;
+       mutex_lock(&subsys->lock);
+       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+               if (ctrl->state != NVME_CTRL_DELETING &&
+                   ctrl->state != NVME_CTRL_DEAD)
+                       count++;
+       }
+       mutex_unlock(&subsys->lock);
+       return count;
+ }
  static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
  {
        struct nvme_subsystem *subsys, *found;
                 * Verify that the subsystem actually supports multiple
                 * controllers, else bail out.
                 */
-               if (!(id->cmic & (1 << 1))) {
+               if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
                        dev_err(ctrl->device,
                                "ignoring ctrl due to duplicate subnqn (%s).\n",
                                found->subnqn);
@@@ -2257,7 -2325,7 +2330,7 @@@ int nvme_init_identify(struct nvme_ctr
                                                 shutdown_timeout, 60);
  
                if (ctrl->shutdown_timeout != shutdown_timeout)
-                       dev_warn(ctrl->device,
+                       dev_info(ctrl->device,
                                 "Shutdown timeout set to %u seconds\n",
                                 ctrl->shutdown_timeout);
        } else
@@@ -2341,8 -2409,14 +2414,14 @@@ static int nvme_dev_open(struct inode *
        struct nvme_ctrl *ctrl =
                container_of(inode->i_cdev, struct nvme_ctrl, cdev);
  
-       if (ctrl->state != NVME_CTRL_LIVE)
+       switch (ctrl->state) {
+       case NVME_CTRL_LIVE:
+       case NVME_CTRL_ADMIN_ONLY:
+               break;
+       default:
                return -EWOULDBLOCK;
+       }
        file->private_data = ctrl;
        return 0;
  }
@@@ -2606,6 -2680,7 +2685,7 @@@ static ssize_t nvme_sysfs_show_state(st
        static const char *const state_name[] = {
                [NVME_CTRL_NEW]         = "new",
                [NVME_CTRL_LIVE]        = "live",
+               [NVME_CTRL_ADMIN_ONLY]  = "only-admin",
                [NVME_CTRL_RESETTING]   = "resetting",
                [NVME_CTRL_RECONNECTING]= "reconnecting",
                [NVME_CTRL_DELETING]    = "deleting",
@@@ -2874,6 -2949,7 +2954,6 @@@ static void nvme_alloc_ns(struct nvme_c
  
        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
        nvme_set_queue_limits(ctrl, ns->queue);
 -      nvme_setup_streams_ns(ctrl, ns);
  
        id = nvme_identify_ns(ctrl, nsid);
        if (!id)
  
        if (nvme_init_ns_head(ns, nsid, id, &new))
                goto out_free_id;
 +      nvme_setup_streams_ns(ctrl, ns);
        
  #ifdef CONFIG_NVME_MULTIPATH
        /*
@@@ -2970,6 -3045,8 +3050,6 @@@ static void nvme_ns_remove(struct nvme_
                return;
  
        if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 -              if (blk_get_integrity(ns->disk))
 -                      blk_integrity_unregister(ns->disk);
                nvme_mpath_remove_disk_links(ns);
                sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
                                        &nvme_ns_id_attr_group);
                        nvme_nvm_unregister_sysfs(ns);
                del_gendisk(ns->disk);
                blk_cleanup_queue(ns->queue);
 +              if (blk_get_integrity(ns->disk))
 +                      blk_integrity_unregister(ns->disk);
        }
  
        mutex_lock(&ns->ctrl->subsys->lock);
        mutex_unlock(&ns->ctrl->namespaces_mutex);
  
        synchronize_srcu(&ns->head->srcu);
 +      nvme_mpath_check_last_path(ns);
        nvme_put_ns(ns);
  }
  
@@@ -3079,6 -3153,8 +3159,8 @@@ static void nvme_scan_work(struct work_
        if (ctrl->state != NVME_CTRL_LIVE)
                return;
  
+       WARN_ON_ONCE(!ctrl->tagset);
        if (nvme_identify_ctrl(ctrl, &id))
                return;
  
  void nvme_queue_scan(struct nvme_ctrl *ctrl)
  {
        /*
-        * Do not queue new scan work when a controller is reset during
-        * removal.
+        * Only queue new scan work when admin and IO queues are both alive
         */
        if (ctrl->state == NVME_CTRL_LIVE)
                queue_work(nvme_wq, &ctrl->scan_work);
@@@ -3477,16 -3552,26 +3558,26 @@@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset)
  
  int __init nvme_core_init(void)
  {
-       int result;
+       int result = -ENOMEM;
  
        nvme_wq = alloc_workqueue("nvme-wq",
                        WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
        if (!nvme_wq)
-               return -ENOMEM;
+               goto out;
+       nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
+                       WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+       if (!nvme_reset_wq)
+               goto destroy_wq;
+       nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
+                       WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+       if (!nvme_delete_wq)
+               goto destroy_reset_wq;
  
        result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
        if (result < 0)
-               goto destroy_wq;
+               goto destroy_delete_wq;
  
        nvme_class = class_create(THIS_MODULE, "nvme");
        if (IS_ERR(nvme_class)) {
@@@ -3505,8 -3590,13 +3596,13 @@@ destroy_class
        class_destroy(nvme_class);
  unregister_chrdev:
        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+ destroy_delete_wq:
+       destroy_workqueue(nvme_delete_wq);
+ destroy_reset_wq:
+       destroy_workqueue(nvme_reset_wq);
  destroy_wq:
        destroy_workqueue(nvme_wq);
+ out:
        return result;
  }
  
@@@ -3516,6 -3606,8 +3612,8 @@@ void nvme_core_exit(void
        class_destroy(nvme_subsys_class);
        class_destroy(nvme_class);
        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+       destroy_workqueue(nvme_delete_wq);
+       destroy_workqueue(nvme_reset_wq);
        destroy_workqueue(nvme_wq);
  }
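
The nvme core now spreads its work over three dedicated workqueues (see the nvme_wq/nvme_reset_wq/nvme_delete_wq comment near the top of this file's diff): reset work may flush scan/AEN work, and delete work may flush reset work, but nothing ever flushes work queued on its own workqueue. A self-contained sketch of that layering, with illustrative names:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *inner_wq;	/* scan/AEN-style work */
static struct workqueue_struct *outer_wq;	/* reset-style work */

static void inner_fn(struct work_struct *work)
{
	pr_info("inner work ran\n");
}
static DECLARE_WORK(inner_work, inner_fn);

static void outer_fn(struct work_struct *work)
{
	/* Safe: outer work flushes inner work queued on a different wq. */
	flush_work(&inner_work);
	pr_info("outer work ran after inner completed\n");
}
static DECLARE_WORK(outer_work, outer_fn);

static int __init wq_split_init(void)
{
	inner_wq = alloc_workqueue("example-inner",
				   WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!inner_wq)
		return -ENOMEM;
	outer_wq = alloc_workqueue("example-outer",
				   WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!outer_wq) {
		destroy_workqueue(inner_wq);
		return -ENOMEM;
	}
	queue_work(inner_wq, &inner_work);
	queue_work(outer_wq, &outer_work);
	return 0;
}

static void __exit wq_split_exit(void)
{
	destroy_workqueue(outer_wq);
	destroy_workqueue(inner_wq);
}

module_init(wq_split_init);
module_exit(wq_split_exit);
MODULE_LICENSE("GPL");
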
  
diff --combined drivers/nvme/host/fabrics.c
index 894c2ccb3891e0b83e1c839f0b08f4cba5d179ae,9cee72a80472946d02899d92db08d51d021ef57a..5dd4ceefed8fe0d0897aa8dadb1d266174b2eb02
@@@ -74,7 -74,6 +74,7 @@@ static struct nvmf_host *nvmf_host_defa
                return NULL;
  
        kref_init(&host->ref);
 +      uuid_gen(&host->id);
        snprintf(host->nqn, NVMF_NQN_SIZE,
                "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id);
  
@@@ -493,7 -492,7 +493,7 @@@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect
   */
  int nvmf_register_transport(struct nvmf_transport_ops *ops)
  {
-       if (!ops->create_ctrl)
+       if (!ops->create_ctrl || !ops->module)
                return -EINVAL;
  
        down_write(&nvmf_transports_rwsem);
@@@ -739,11 -738,14 +739,14 @@@ static int nvmf_parse_options(struct nv
                                ret = -ENOMEM;
                                goto out;
                        }
-                       if (uuid_parse(p, &hostid)) {
+                       ret = uuid_parse(p, &hostid);
+                       if (ret) {
                                pr_err("Invalid hostid %s\n", p);
                                ret = -EINVAL;
+                               kfree(p);
                                goto out;
                        }
+                       kfree(p);
                        break;
                case NVMF_OPT_DUP_CONNECT:
                        opts->duplicate_connect = true;
@@@ -869,32 -871,41 +872,41 @@@ nvmf_create_ctrl(struct device *dev, co
                goto out_unlock;
        }
  
+       if (!try_module_get(ops->module)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
        ret = nvmf_check_required_opts(opts, ops->required_opts);
        if (ret)
-               goto out_unlock;
+               goto out_module_put;
        ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
                                ops->allowed_opts | ops->required_opts);
        if (ret)
-               goto out_unlock;
+               goto out_module_put;
  
        ctrl = ops->create_ctrl(dev, opts);
        if (IS_ERR(ctrl)) {
                ret = PTR_ERR(ctrl);
-               goto out_unlock;
+               goto out_module_put;
        }
  
        if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
                dev_warn(ctrl->device,
                        "controller returned incorrect NQN: \"%s\".\n",
                        ctrl->subsys->subnqn);
+               module_put(ops->module);
                up_read(&nvmf_transports_rwsem);
                nvme_delete_ctrl_sync(ctrl);
                return ERR_PTR(-EINVAL);
        }
  
+       module_put(ops->module);
        up_read(&nvmf_transports_rwsem);
        return ctrl;
  
+ out_module_put:
+       module_put(ops->module);
  out_unlock:
        up_read(&nvmf_transports_rwsem);
  out_free_opts:
diff --combined drivers/nvme/host/fc.c
index 794e66e4aa20115f4dc3a6b5fc12f706b2040bf4,b76ba4629e02a41b811fe9344d3596a7084427f1..99bf51c7e51325e25ead4f4bb0dfbeff7972e526
@@@ -2921,6 -2921,9 +2921,9 @@@ nvme_fc_delete_association(struct nvme_
        __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
        nvme_fc_free_queue(&ctrl->queues[0]);
  
+       /* re-enable the admin_q so anything new can fast fail */
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_fc_ctlr_inactive_on_rport(ctrl);
  }
  
@@@ -2935,6 -2938,9 +2938,9 @@@ nvme_fc_delete_ctrl(struct nvme_ctrl *n
         * waiting for io to terminate
         */
        nvme_fc_delete_association(ctrl);
+       /* resume the io queues so that things will fast fail */
+       nvme_start_queues(nctrl);
  }
  
  static void
@@@ -3221,6 -3227,7 +3227,6 @@@ nvme_fc_init_ctrl(struct device *dev, s
  
                /* initiate nvme ctrl ref counting teardown */
                nvme_uninit_ctrl(&ctrl->ctrl);
 -              nvme_put_ctrl(&ctrl->ctrl);
  
                /* Remove core ctrl ref. */
                nvme_put_ctrl(&ctrl->ctrl);
@@@ -3380,6 -3387,7 +3386,7 @@@ nvme_fc_create_ctrl(struct device *dev
  
  static struct nvmf_transport_ops nvme_fc_transport = {
        .name           = "fc",
+       .module         = THIS_MODULE,
        .required_opts  = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
        .allowed_opts   = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
        .create_ctrl    = nvme_fc_create_ctrl,
diff --combined drivers/nvme/host/nvme.h
index a00eabd0642738bbdbaf0b11d7f8e1747c996e3c,8e7fc1b041b7b1c9db38b3ae089f9db50fa9be2a..8e4550fa08f8bd775e7e5e8e0c169e287845509a
@@@ -32,6 -32,8 +32,8 @@@ extern unsigned int admin_timeout
  #define NVME_KATO_GRACE               10
  
  extern struct workqueue_struct *nvme_wq;
+ extern struct workqueue_struct *nvme_reset_wq;
+ extern struct workqueue_struct *nvme_delete_wq;
  
  enum {
        NVME_NS_LBA             = 0,
@@@ -119,6 -121,7 +121,7 @@@ static inline struct nvme_request *nvme
  enum nvme_ctrl_state {
        NVME_CTRL_NEW,
        NVME_CTRL_LIVE,
+       NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
        NVME_CTRL_RESETTING,
        NVME_CTRL_RECONNECTING,
        NVME_CTRL_DELETING,
@@@ -393,6 -396,7 +396,7 @@@ int nvme_set_queue_count(struct nvme_ct
  void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
  void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
  int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
  int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
  int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
  
@@@ -401,7 -405,7 +405,7 @@@ extern const struct block_device_operat
  
  #ifdef CONFIG_NVME_MULTIPATH
  void nvme_failover_req(struct request *req);
- bool nvme_req_needs_failover(struct request *req);
+ bool nvme_req_needs_failover(struct request *req, blk_status_t error);
  void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
  int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
  void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@@ -417,20 -421,12 +421,21 @@@ static inline void nvme_mpath_clear_cur
                rcu_assign_pointer(head->current_path, NULL);
  }
  struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 +
 +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 +{
 +      struct nvme_ns_head *head = ns->head;
 +
 +      if (head->disk && list_empty(&head->list))
 +              kblockd_schedule_work(&head->requeue_work);
 +}
 +
  #else
  static inline void nvme_failover_req(struct request *req)
  {
  }
- static inline bool nvme_req_needs_failover(struct request *req)
+ static inline bool nvme_req_needs_failover(struct request *req,
+                                          blk_status_t error)
  {
        return false;
  }
@@@ -455,9 -451,6 +460,9 @@@ static inline void nvme_mpath_remove_di
  {
  }
  static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 +{
 +}
 +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
  {
  }
  #endif /* CONFIG_NVME_MULTIPATH */
diff --combined drivers/nvme/host/pci.c
index 4276ebfff22ba00fd90e8c241cc7edd56deca353,0bc6a9e48c8ea933e4b004787ecf9fafb3639688..6fe7af00a1f42a7dcb3354ac49db499cef6f9c88
@@@ -75,7 -75,7 +75,7 @@@ static void nvme_dev_disable(struct nvm
   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
   */
  struct nvme_dev {
-       struct nvme_queue **queues;
+       struct nvme_queue *queues;
        struct blk_mq_tag_set tagset;
        struct blk_mq_tag_set admin_tagset;
        u32 __iomem *dbs;
@@@ -365,7 -365,7 +365,7 @@@ static int nvme_admin_init_hctx(struct 
                                unsigned int hctx_idx)
  {
        struct nvme_dev *dev = data;
-       struct nvme_queue *nvmeq = dev->queues[0];
+       struct nvme_queue *nvmeq = &dev->queues[0];
  
        WARN_ON(hctx_idx != 0);
        WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@@ -387,7 -387,7 +387,7 @@@ static int nvme_init_hctx(struct blk_mq
                          unsigned int hctx_idx)
  {
        struct nvme_dev *dev = data;
-       struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+       struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
  
        if (!nvmeq->tags)
                nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@@ -403,7 -403,7 +403,7 @@@ static int nvme_init_request(struct blk
        struct nvme_dev *dev = set->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-       struct nvme_queue *nvmeq = dev->queues[queue_idx];
+       struct nvme_queue *nvmeq = &dev->queues[queue_idx];
  
        BUG_ON(!nvmeq);
        iod->nvmeq = nvmeq;
@@@ -448,34 -448,12 +448,34 @@@ static void **nvme_pci_iod_list(struct 
        return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
  }
  
 +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 +{
 +      struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 +      int nseg = blk_rq_nr_phys_segments(req);
 +      unsigned int avg_seg_size;
 +
 +      if (nseg == 0)
 +              return false;
 +
 +      avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
 +
 +      if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
 +              return false;
 +      if (!iod->nvmeq->qid)
 +              return false;
 +      if (!sgl_threshold || avg_seg_size < sgl_threshold)
 +              return false;
 +      return true;
 +}
 +
  static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
        int nseg = blk_rq_nr_phys_segments(rq);
        unsigned int size = blk_rq_payload_bytes(rq);
  
 +      iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 +
        if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
                size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
                                iod->use_sgl);
@@@ -626,6 -604,8 +626,6 @@@ static blk_status_t nvme_pci_setup_prps
        dma_addr_t prp_dma;
        int nprps, i;
  
 -      iod->use_sgl = false;
 -
        length -= (page_size - offset);
        if (length <= 0) {
                iod->first_dma = 0;
@@@ -725,19 -705,22 +725,19 @@@ static void nvme_pci_sgl_set_seg(struc
  }
  
  static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 -              struct request *req, struct nvme_rw_command *cmd)
 +              struct request *req, struct nvme_rw_command *cmd, int entries)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 -      int length = blk_rq_payload_bytes(req);
        struct dma_pool *pool;
        struct nvme_sgl_desc *sg_list;
        struct scatterlist *sg = iod->sg;
 -      int entries = iod->nents, i = 0;
        dma_addr_t sgl_dma;
 -
 -      iod->use_sgl = true;
 +      int i = 0;
  
        /* setting the transfer type as SGL */
        cmd->flags = NVME_CMD_SGL_METABUF;
  
 -      if (length == sg_dma_len(sg)) {
 +      if (entries == 1) {
                nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
                return BLK_STS_OK;
        }
                }
  
                nvme_pci_sgl_set_data(&sg_list[i++], sg);
 -
 -              length -= sg_dma_len(sg);
                sg = sg_next(sg);
 -              entries--;
 -      } while (length > 0);
 +      } while (--entries > 0);
  
 -      WARN_ON(entries > 0);
        return BLK_STS_OK;
  }
  
 -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 -{
 -      struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 -      unsigned int avg_seg_size;
 -
 -      avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
 -                      blk_rq_nr_phys_segments(req));
 -
 -      if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
 -              return false;
 -      if (!iod->nvmeq->qid)
 -              return false;
 -      if (!sgl_threshold || avg_seg_size < sgl_threshold)
 -              return false;
 -      return true;
 -}
 -
  static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct nvme_command *cmnd)
  {
        enum dma_data_direction dma_dir = rq_data_dir(req) ?
                        DMA_TO_DEVICE : DMA_FROM_DEVICE;
        blk_status_t ret = BLK_STS_IOERR;
 +      int nr_mapped;
  
        sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
        iod->nents = blk_rq_map_sg(q, req, iod->sg);
                goto out;
  
        ret = BLK_STS_RESOURCE;
 -      if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
 -                              DMA_ATTR_NO_WARN))
 +      nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
 +                      DMA_ATTR_NO_WARN);
 +      if (!nr_mapped)
                goto out;
  
 -      if (nvme_pci_use_sgls(dev, req))
 -              ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
 +      if (iod->use_sgl)
 +              ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
        else
                ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
  
@@@ -1044,7 -1046,7 +1044,7 @@@ static int nvme_poll(struct blk_mq_hw_c
  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
  {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
-       struct nvme_queue *nvmeq = dev->queues[0];
+       struct nvme_queue *nvmeq = &dev->queues[0];
        struct nvme_command c;
  
        memset(&c, 0, sizeof(c));
@@@ -1138,9 -1140,14 +1138,14 @@@ static bool nvme_should_reset(struct nv
         */
        bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
  
-       /* If there is a reset ongoing, we shouldn't reset again. */
-       if (dev->ctrl.state == NVME_CTRL_RESETTING)
+       /* If there is a reset/reinit ongoing, we shouldn't reset again. */
+       switch (dev->ctrl.state) {
+       case NVME_CTRL_RESETTING:
+       case NVME_CTRL_RECONNECTING:
                return false;
+       default:
+               break;
+       }
  
        /* We shouldn't reset unless the controller is on fatal error state
         * _or_ if we lost the communication with it.
@@@ -1280,7 -1287,6 +1285,6 @@@ static void nvme_free_queue(struct nvme
        if (nvmeq->sq_cmds)
                dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
                                        nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-       kfree(nvmeq);
  }
  
  static void nvme_free_queues(struct nvme_dev *dev, int lowest)
        int i;
  
        for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
-               struct nvme_queue *nvmeq = dev->queues[i];
                dev->ctrl.queue_count--;
-               dev->queues[i] = NULL;
-               nvme_free_queue(nvmeq);
+               nvme_free_queue(&dev->queues[i]);
        }
  }
  
@@@ -1323,12 -1327,7 +1325,7 @@@ static int nvme_suspend_queue(struct nv
  
  static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
  {
-       struct nvme_queue *nvmeq = dev->queues[0];
-       if (!nvmeq)
-               return;
-       if (nvme_suspend_queue(nvmeq))
-               return;
+       struct nvme_queue *nvmeq = &dev->queues[0];
  
        if (shutdown)
                nvme_shutdown_ctrl(&dev->ctrl);
@@@ -1367,7 -1366,7 +1364,7 @@@ static int nvme_cmb_qdepth(struct nvme_
  static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
                                int qid, int depth)
  {
-       if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+       if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
                unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
                                                      dev->ctrl.page_size);
                nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
        return 0;
  }
  
- static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
-                                                       int depth, int node)
+ static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
+               int depth, int node)
  {
-       struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
-                                                       node);
-       if (!nvmeq)
-               return NULL;
+       struct nvme_queue *nvmeq = &dev->queues[qid];
+       if (dev->ctrl.queue_count > qid)
+               return 0;
  
        nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
                                          &nvmeq->cq_dma_addr, GFP_KERNEL);
        nvmeq->q_depth = depth;
        nvmeq->qid = qid;
        nvmeq->cq_vector = -1;
-       dev->queues[qid] = nvmeq;
        dev->ctrl.queue_count++;
  
-       return nvmeq;
+       return 0;
  
   free_cqdma:
        dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
                                                        nvmeq->cq_dma_addr);
   free_nvmeq:
-       kfree(nvmeq);
-       return NULL;
+       return -ENOMEM;
  }
  
  static int queue_request_irq(struct nvme_queue *nvmeq)
@@@ -1590,14 -1587,12 +1585,12 @@@ static int nvme_pci_configure_admin_que
        if (result < 0)
                return result;
  
-       nvmeq = dev->queues[0];
-       if (!nvmeq) {
-               nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
-                                       dev_to_node(dev->dev));
-               if (!nvmeq)
-                       return -ENOMEM;
-       }
+       result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+                       dev_to_node(dev->dev));
+       if (result)
+               return result;
  
+       nvmeq = &dev->queues[0];
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;
  
@@@ -1627,7 -1622,7 +1620,7 @@@ static int nvme_create_io_queues(struc
  
        for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
                /* vector == qid - 1, match nvme_create_queue */
-               if (!nvme_alloc_queue(dev, i, dev->q_depth,
+               if (nvme_alloc_queue(dev, i, dev->q_depth,
                     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
                        ret = -ENOMEM;
                        break;
  
        max = min(dev->max_qid, dev->ctrl.queue_count - 1);
        for (i = dev->online_queues; i <= max; i++) {
-               ret = nvme_create_queue(dev->queues[i], i);
+               ret = nvme_create_queue(&dev->queues[i], i);
                if (ret)
                        break;
        }
  
        /*
         * Ignore failing Create SQ/CQ commands, we can continue with less
-        * than the desired aount of queues, and even a controller without
-        * I/O queues an still be used to issue admin commands.  This might
+        * than the desired amount of queues, and even a controller without
+        * I/O queues can still be used to issue admin commands.  This might
         * be useful to upgrade a buggy firmware for example.
         */
        return ret >= 0 ? 0 : ret;
@@@ -1661,30 -1656,40 +1654,40 @@@ static ssize_t nvme_cmb_show(struct dev
  }
  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
  
- static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+ static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
  {
-       u64 szu, size, offset;
+       u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
+       return 1ULL << (12 + 4 * szu);
+ }
+ static u32 nvme_cmb_size(struct nvme_dev *dev)
+ {
+       return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
+ }
+ static void nvme_map_cmb(struct nvme_dev *dev)
+ {
+       u64 size, offset;
        resource_size_t bar_size;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
-       void __iomem *cmb;
        int bar;
  
        dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
-       if (!(NVME_CMB_SZ(dev->cmbsz)))
-               return NULL;
+       if (!dev->cmbsz)
+               return;
        dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
  
        if (!use_cmb_sqes)
-               return NULL;
+               return;
  
-       szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
-       size = szu * NVME_CMB_SZ(dev->cmbsz);
-       offset = szu * NVME_CMB_OFST(dev->cmbloc);
+       size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
+       offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
        bar = NVME_CMB_BIR(dev->cmbloc);
        bar_size = pci_resource_len(pdev, bar);
  
        if (offset > bar_size)
-               return NULL;
+               return;
  
        /*
         * Controllers may support a CMB size larger than their BAR,
        if (size > bar_size - offset)
                size = bar_size - offset;
  
-       cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
-       if (!cmb)
-               return NULL;
+       dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
+       if (!dev->cmb)
+               return;
        dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
        dev->cmb_size = size;
-       return cmb;
+       if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
+                                   &dev_attr_cmb.attr, NULL))
+               dev_warn(dev->ctrl.device,
+                        "failed to add sysfs attribute for CMB\n");
  }
  
  static inline void nvme_release_cmb(struct nvme_dev *dev)
@@@ -1768,7 -1776,7 +1774,7 @@@ static int __nvme_alloc_host_mem(struc
        dma_addr_t descs_dma;
        int i = 0;
        void **bufs;
-       u64 size = 0, tmp;
+       u64 size, tmp;
  
        tmp = (preferred + chunk_size - 1);
        do_div(tmp, chunk_size);
@@@ -1851,7 -1859,7 +1857,7 @@@ static int nvme_setup_host_mem(struct n
        u64 preferred = (u64)dev->ctrl.hmpre * 4096;
        u64 min = (u64)dev->ctrl.hmmin * 4096;
        u32 enable_bits = NVME_HOST_MEM_ENABLE;
-       int ret = 0;
+       int ret;
  
        preferred = min(preferred, max);
        if (min > max) {
  
  static int nvme_setup_io_queues(struct nvme_dev *dev)
  {
-       struct nvme_queue *adminq = dev->queues[0];
+       struct nvme_queue *adminq = &dev->queues[0];
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        int result, nr_io_queues;
        unsigned long size;
        if (nr_io_queues == 0)
                return 0;
  
-       if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+       if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
                result = nvme_cmb_qdepth(dev, nr_io_queues,
                                sizeof(struct nvme_command));
                if (result > 0)
@@@ -2005,9 -2013,9 +2011,9 @@@ static int nvme_delete_queue(struct nvm
        return 0;
  }
  
- static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
+ static void nvme_disable_io_queues(struct nvme_dev *dev)
  {
-       int pass;
+       int pass, queues = dev->online_queues - 1;
        unsigned long timeout;
        u8 opcode = nvme_admin_delete_sq;
  
   retry:
                timeout = ADMIN_TIMEOUT;
                for (; i > 0; i--, sent++)
-                       if (nvme_delete_queue(dev->queues[i], opcode))
+                       if (nvme_delete_queue(&dev->queues[i], opcode))
                                break;
  
                while (sent--) {
  }
  
  /*
-  * Return: error value if an error occurred setting up the queues or calling
-  * Identify Device.  0 if these succeeded, even if adding some of the
-  * namespaces failed.  At the moment, these failures are silent.  TBD which
-  * failures should be reported.
+  * return error value only when tagset allocation failed
   */
  static int nvme_dev_add(struct nvme_dev *dev)
  {
+       int ret;
        if (!dev->ctrl.tagset) {
                dev->tagset.ops = &nvme_mq_ops;
                dev->tagset.nr_hw_queues = dev->online_queues - 1;
                dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
                dev->tagset.driver_data = dev;
  
-               if (blk_mq_alloc_tag_set(&dev->tagset))
-                       return 0;
+               ret = blk_mq_alloc_tag_set(&dev->tagset);
+               if (ret) {
+                       dev_warn(dev->ctrl.device,
+                               "IO queues tagset allocation failed %d\n", ret);
+                       return ret;
+               }
                dev->ctrl.tagset = &dev->tagset;
  
                nvme_dbbuf_set(dev);
@@@ -2122,22 -2133,7 +2131,7 @@@ static int nvme_pci_enable(struct nvme_
                          "set queue depth=%u\n", dev->q_depth);
        }
  
-       /*
-        * CMBs can currently only exist on >=1.2 PCIe devices. We only
-        * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
-        * has no name we can pass NULL as final argument to
-        * sysfs_add_file_to_group.
-        */
-       if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
-               dev->cmb = nvme_map_cmb(dev);
-               if (dev->cmb) {
-                       if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
-                                                   &dev_attr_cmb.attr, NULL))
-                               dev_warn(dev->ctrl.device,
-                                        "failed to add sysfs attribute for CMB\n");
-               }
-       }
+       nvme_map_cmb(dev);
  
        pci_enable_pcie_error_reporting(pdev);
        pci_save_state(pdev);
@@@ -2170,7 -2166,7 +2164,7 @@@ static void nvme_pci_disable(struct nvm
  
  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
  {
-       int i, queues;
+       int i;
        bool dead = true;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
  
        }
        nvme_stop_queues(&dev->ctrl);
  
-       queues = dev->online_queues - 1;
-       for (i = dev->ctrl.queue_count - 1; i > 0; i--)
-               nvme_suspend_queue(dev->queues[i]);
-       if (dead) {
-               /* A device might become IO incapable very soon during
-                * probe, before the admin queue is configured. Thus,
-                * queue_count can be 0 here.
-                */
-               if (dev->ctrl.queue_count)
-                       nvme_suspend_queue(dev->queues[0]);
-       } else {
-               nvme_disable_io_queues(dev, queues);
+       if (!dead) {
+               nvme_disable_io_queues(dev);
                nvme_disable_admin_queue(dev, shutdown);
        }
+       for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
+               nvme_suspend_queue(&dev->queues[i]);
        nvme_pci_disable(dev);
  
        blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@@@ -2289,6 -2277,7 +2275,7 @@@ static void nvme_reset_work(struct work
                container_of(work, struct nvme_dev, ctrl.reset_work);
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result = -ENODEV;
+       enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
  
        if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
                goto out;
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
  
+       /*
+        * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
+        * initializing procedure here.
+        */
+       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
+               dev_warn(dev->ctrl.device,
+                       "failed to mark controller RECONNECTING\n");
+               goto out;
+       }
        result = nvme_pci_enable(dev);
        if (result)
                goto out;
                dev_warn(dev->ctrl.device, "IO queues not created\n");
                nvme_kill_queues(&dev->ctrl);
                nvme_remove_namespaces(&dev->ctrl);
+               new_state = NVME_CTRL_ADMIN_ONLY;
        } else {
                nvme_start_queues(&dev->ctrl);
                nvme_wait_freeze(&dev->ctrl);
-               nvme_dev_add(dev);
+               /* hit this only when allocate tagset fails */
+               if (nvme_dev_add(dev))
+                       new_state = NVME_CTRL_ADMIN_ONLY;
                nvme_unfreeze(&dev->ctrl);
        }
  
-       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
-               dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+       /*
+        * If only admin queue live, keep it to do further investigation or
+        * recovery.
+        */
+       if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+               dev_warn(dev->ctrl.device,
+                       "failed to mark controller state %d\n", new_state);
                goto out;
        }
  
@@@ -2468,8 -2475,9 +2473,9 @@@ static int nvme_probe(struct pci_dev *p
        dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
        if (!dev)
                return -ENOMEM;
-       dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
-                                                       GFP_KERNEL, node);
+       dev->queues = kcalloc_node(num_possible_cpus() + 1,
+                       sizeof(struct nvme_queue), GFP_KERNEL, node);
        if (!dev->queues)
                goto free;
  
        if (result)
                goto release_pools;
  
-       nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
  
-       queue_work(nvme_wq, &dev->ctrl.reset_work);
+       nvme_reset_ctrl(&dev->ctrl);
        return 0;
  
   release_pools:
@@@ -2523,7 -2531,7 +2529,7 @@@ static void nvme_reset_prepare(struct p
  static void nvme_reset_done(struct pci_dev *pdev)
  {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
-       nvme_reset_ctrl(&dev->ctrl);
+       nvme_reset_ctrl_sync(&dev->ctrl);
  }
  
  static void nvme_shutdown(struct pci_dev *pdev)
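
Most of the pci.c churn above comes from one data-structure change: dev->queues goes from an array of separately kzalloc'd pointers to a single flat array of struct nvme_queue allocated once at probe, so lookups become &dev->queues[i] and nvme_alloc_queue() no longer allocates or frees the structure itself. A condensed illustration of the new layout; the _sketch names and the stub queue structure are stand-ins, not the driver's definitions:

/* Stand-in structures; the real struct nvme_queue has many more fields. */
struct nvme_queue_sketch {
        int qid;
};

struct nvme_dev_sketch {
        struct nvme_queue_sketch *queues;       /* flat array, not array of pointers */
};

static int probe_alloc_sketch(struct nvme_dev_sketch *dev, int node)
{
        dev->queues = kcalloc_node(num_possible_cpus() + 1,
                                   sizeof(struct nvme_queue_sketch),
                                   GFP_KERNEL, node);
        return dev->queues ? 0 : -ENOMEM;
}

static struct nvme_queue_sketch *nth_queue_sketch(struct nvme_dev_sketch *dev, int qid)
{
        return &dev->queues[qid];               /* previously dev->queues[qid] */
}
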
diff --combined drivers/nvme/host/rdma.c
index 2a0bba7f50cf43bb76e9d1f3073e24ba1edd9e1c,6c2fdfa4c86a1d51ede0df382ef58d6e1d7364a1..2bc059f7d73c7da7ea13273aa9a0b92d1cbf2b63
@@@ -66,7 -66,6 +66,6 @@@ struct nvme_rdma_request 
        struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
        u32                     num_sge;
        int                     nents;
-       bool                    inline_data;
        struct ib_reg_wr        reg_wr;
        struct ib_cqe           reg_cqe;
        struct nvme_rdma_queue  *queue;
@@@ -974,18 -973,12 +973,18 @@@ static void nvme_rdma_error_recovery_wo
        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_start_queues(&ctrl->ctrl);
  
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
 +              /* state change failure should never happen */
 +              WARN_ON_ONCE(1);
 +              return;
 +      }
 +
        nvme_rdma_reconnect_or_remove(ctrl);
  }
  
  static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
  {
 -      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                return;
  
        queue_work(nvme_wq, &ctrl->err_work);
@@@ -1092,7 -1085,6 +1091,6 @@@ static int nvme_rdma_map_sg_inline(stru
        sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
  
-       req->inline_data = true;
        req->num_sge++;
        return 0;
  }
@@@ -1164,7 -1156,6 +1162,6 @@@ static int nvme_rdma_map_data(struct nv
        int count, ret;
  
        req->num_sge = 1;
-       req->inline_data = false;
        refcount_set(&req->ref, 2); /* send and recv completions */
  
        c->common.flags |= NVME_CMD_SGL_METABUF;
@@@ -1759,12 -1750,6 +1756,12 @@@ static void nvme_rdma_reset_ctrl_work(s
        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl, false);
  
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
 +              /* state change failure should never happen */
 +              WARN_ON_ONCE(1);
 +              return;
 +      }
 +
        ret = nvme_rdma_configure_admin_queue(ctrl, false);
        if (ret)
                goto out_fail;
@@@ -2018,6 -2003,7 +2015,7 @@@ out_free_ctrl
  
  static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
+       .module         = THIS_MODULE,
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@@@ -2040,7 -2026,7 +2038,7 @@@ static void nvme_rdma_remove_one(struc
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);
  
-       flush_workqueue(nvme_wq);
+       flush_workqueue(nvme_delete_wq);
  }
  
  static struct ib_client nvme_rdma_ib_client = {
diff --combined drivers/nvme/target/fcloop.c
index 6a018a0bd6ce851306dd82e5c21e680c626f99d5,9f8a6726df91502b551c02db7018cb34469cc060..34712def81b15a566bb16a7e320ec7012796cf27
@@@ -204,6 -204,10 +204,10 @@@ struct fcloop_lport 
        struct completion unreg_done;
  };
  
+ struct fcloop_lport_priv {
+       struct fcloop_lport *lport;
+ };
  struct fcloop_rport {
        struct nvme_fc_remote_port *remoteport;
        struct nvmet_fc_target_port *targetport;
@@@ -238,21 -242,32 +242,32 @@@ struct fcloop_lsreq 
        int                             status;
  };
  
+ enum {
+       INI_IO_START            = 0,
+       INI_IO_ACTIVE           = 1,
+       INI_IO_ABORTED          = 2,
+       INI_IO_COMPLETED        = 3,
+ };
  struct fcloop_fcpreq {
        struct fcloop_tport             *tport;
        struct nvmefc_fcp_req           *fcpreq;
        spinlock_t                      reqlock;
        u16                             status;
+       u32                             inistate;
        bool                            active;
        bool                            aborted;
-       struct work_struct              work;
+       struct kref                     ref;
+       struct work_struct              fcp_rcv_work;
+       struct work_struct              abort_rcv_work;
+       struct work_struct              tio_done_work;
        struct nvmefc_tgt_fcp_req       tgt_fcp_req;
  };
  
  struct fcloop_ini_fcpreq {
        struct nvmefc_fcp_req           *fcpreq;
        struct fcloop_fcpreq            *tfcp_req;
-       struct work_struct              iniwork;
+       spinlock_t                      inilock;
  };
  
  static inline struct fcloop_lsreq *
@@@ -343,17 -358,122 +358,122 @@@ fcloop_xmt_ls_rsp(struct nvmet_fc_targe
        return 0;
  }
  
- /*
-  * FCP IO operation done by initiator abort.
-  * call back up initiator "done" flows.
-  */
  static void
- fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+ fcloop_tfcp_req_free(struct kref *ref)
  {
-       struct fcloop_ini_fcpreq *inireq =
-               container_of(work, struct fcloop_ini_fcpreq, iniwork);
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(ref, struct fcloop_fcpreq, ref);
  
-       inireq->fcpreq->done(inireq->fcpreq);
+       kfree(tfcp_req);
+ }
+ static void
+ fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
+ {
+       kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
+ }
+ static int
+ fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
+ {
+       return kref_get_unless_zero(&tfcp_req->ref);
+ }
+ static void
+ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
+                       struct fcloop_fcpreq *tfcp_req, int status)
+ {
+       struct fcloop_ini_fcpreq *inireq = NULL;
+       if (fcpreq) {
+               inireq = fcpreq->private;
+               spin_lock(&inireq->inilock);
+               inireq->tfcp_req = NULL;
+               spin_unlock(&inireq->inilock);
+               fcpreq->status = status;
+               fcpreq->done(fcpreq);
+       }
+       /* release original io reference on tgt struct */
+       fcloop_tfcp_req_put(tfcp_req);
+ }
+ static void
+ fcloop_fcp_recv_work(struct work_struct *work)
+ {
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+       struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+       int ret = 0;
+       bool aborted = false;
+       spin_lock(&tfcp_req->reqlock);
+       switch (tfcp_req->inistate) {
+       case INI_IO_START:
+               tfcp_req->inistate = INI_IO_ACTIVE;
+               break;
+       case INI_IO_ABORTED:
+               aborted = true;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
+       spin_unlock(&tfcp_req->reqlock);
+       if (unlikely(aborted))
+               ret = -ECANCELED;
+       else
+               ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+                               &tfcp_req->tgt_fcp_req,
+                               fcpreq->cmdaddr, fcpreq->cmdlen);
+       if (ret)
+               fcloop_call_host_done(fcpreq, tfcp_req, ret);
+       return;
+ }
+ static void
+ fcloop_fcp_abort_recv_work(struct work_struct *work)
+ {
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(work, struct fcloop_fcpreq, abort_rcv_work);
+       struct nvmefc_fcp_req *fcpreq;
+       bool completed = false;
+       spin_lock(&tfcp_req->reqlock);
+       fcpreq = tfcp_req->fcpreq;
+       switch (tfcp_req->inistate) {
+       case INI_IO_ABORTED:
+               break;
+       case INI_IO_COMPLETED:
+               completed = true;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
+       spin_unlock(&tfcp_req->reqlock);
+       if (unlikely(completed)) {
+               /* remove reference taken in original abort downcall */
+               fcloop_tfcp_req_put(tfcp_req);
+               return;
+       }
+       if (tfcp_req->tport->targetport)
+               nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+                                       &tfcp_req->tgt_fcp_req);
+       spin_lock(&tfcp_req->reqlock);
+       tfcp_req->fcpreq = NULL;
+       spin_unlock(&tfcp_req->reqlock);
+       fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+       /* call_host_done releases reference for abort downcall */
  }
  
  /*
@@@ -364,20 -484,15 +484,15 @@@ static voi
  fcloop_tgt_fcprqst_done_work(struct work_struct *work)
  {
        struct fcloop_fcpreq *tfcp_req =
-               container_of(work, struct fcloop_fcpreq, work);
-       struct fcloop_tport *tport = tfcp_req->tport;
+               container_of(work, struct fcloop_fcpreq, tio_done_work);
        struct nvmefc_fcp_req *fcpreq;
  
        spin_lock(&tfcp_req->reqlock);
        fcpreq = tfcp_req->fcpreq;
+       tfcp_req->inistate = INI_IO_COMPLETED;
        spin_unlock(&tfcp_req->reqlock);
  
-       if (tport->remoteport && fcpreq) {
-               fcpreq->status = tfcp_req->status;
-               fcpreq->done(fcpreq);
-       }
-       kfree(tfcp_req);
+       fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
  }
  
  
@@@ -390,7 -505,6 +505,6 @@@ fcloop_fcp_req(struct nvme_fc_local_por
        struct fcloop_rport *rport = remoteport->private;
        struct fcloop_ini_fcpreq *inireq = fcpreq->private;
        struct fcloop_fcpreq *tfcp_req;
-       int ret = 0;
  
        if (!rport->targetport)
                return -ECONNREFUSED;
  
        inireq->fcpreq = fcpreq;
        inireq->tfcp_req = tfcp_req;
-       INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
+       spin_lock_init(&inireq->inilock);
        tfcp_req->fcpreq = fcpreq;
        tfcp_req->tport = rport->targetport->private;
+       tfcp_req->inistate = INI_IO_START;
        spin_lock_init(&tfcp_req->reqlock);
-       INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+       INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
+       INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
+       INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
+       kref_init(&tfcp_req->ref);
  
-       ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
-                                fcpreq->cmdaddr, fcpreq->cmdlen);
+       schedule_work(&tfcp_req->fcp_rcv_work);
  
-       return ret;
+       return 0;
  }
  
  static void
@@@ -589,7 -707,7 +707,7 @@@ fcloop_fcp_req_release(struct nvmet_fc_
  {
        struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
  
-       schedule_work(&tfcp_req->work);
+       schedule_work(&tfcp_req->tio_done_work);
  }
  
  static void
@@@ -605,27 -723,47 +723,47 @@@ fcloop_fcp_abort(struct nvme_fc_local_p
                        void *hw_queue_handle,
                        struct nvmefc_fcp_req *fcpreq)
  {
-       struct fcloop_rport *rport = remoteport->private;
        struct fcloop_ini_fcpreq *inireq = fcpreq->private;
-       struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+       struct fcloop_fcpreq *tfcp_req;
+       bool abortio = true;
+       spin_lock(&inireq->inilock);
+       tfcp_req = inireq->tfcp_req;
+       if (tfcp_req)
+               fcloop_tfcp_req_get(tfcp_req);
+       spin_unlock(&inireq->inilock);
  
        if (!tfcp_req)
                /* abort has already been called */
                return;
  
-       if (rport->targetport)
-               nvmet_fc_rcv_fcp_abort(rport->targetport,
-                                       &tfcp_req->tgt_fcp_req);
        /* break initiator/target relationship for io */
        spin_lock(&tfcp_req->reqlock);
-       inireq->tfcp_req = NULL;
-       tfcp_req->fcpreq = NULL;
+       switch (tfcp_req->inistate) {
+       case INI_IO_START:
+       case INI_IO_ACTIVE:
+               tfcp_req->inistate = INI_IO_ABORTED;
+               break;
+       case INI_IO_COMPLETED:
+               abortio = false;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
        spin_unlock(&tfcp_req->reqlock);
  
-       /* post the aborted io completion */
-       fcpreq->status = -ECANCELED;
-       schedule_work(&inireq->iniwork);
+       if (abortio)
+               /* leave the reference while the work item is scheduled */
+               WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+       else  {
+               /*
+                * as the io has already had the done callback made,
+                * nothing more to do. So release the reference taken above
+                */
+               fcloop_tfcp_req_put(tfcp_req);
+       }
  }
  
  static void
@@@ -657,7 -795,8 +795,8 @@@ fcloop_nport_get(struct fcloop_nport *n
  static void
  fcloop_localport_delete(struct nvme_fc_local_port *localport)
  {
-       struct fcloop_lport *lport = localport->private;
+       struct fcloop_lport_priv *lport_priv = localport->private;
+       struct fcloop_lport *lport = lport_priv->lport;
  
        /* release any threads waiting for the unreg to complete */
        complete(&lport->unreg_done);
@@@ -697,7 -836,7 +836,7 @@@ static struct nvme_fc_port_template fct
        .max_dif_sgl_segments   = FCLOOP_SGL_SEGS,
        .dma_boundary           = FCLOOP_DMABOUND_4G,
        /* sizes of additional private data for data structures */
-       .local_priv_sz          = sizeof(struct fcloop_lport),
+       .local_priv_sz          = sizeof(struct fcloop_lport_priv),
        .remote_priv_sz         = sizeof(struct fcloop_rport),
        .lsrqst_priv_sz         = sizeof(struct fcloop_lsreq),
        .fcprqst_priv_sz        = sizeof(struct fcloop_ini_fcpreq),
@@@ -714,8 -853,7 +853,7 @@@ static struct nvmet_fc_target_template 
        .max_dif_sgl_segments   = FCLOOP_SGL_SEGS,
        .dma_boundary           = FCLOOP_DMABOUND_4G,
        /* optional features */
-       .target_features        = NVMET_FCTGTFEAT_CMD_IN_ISR |
-                                 NVMET_FCTGTFEAT_OPDONE_IN_ISR,
+       .target_features        = 0,
        /* sizes of additional private data for data structures */
        .target_priv_sz         = sizeof(struct fcloop_tport),
  };
@@@ -728,11 -866,17 +866,17 @@@ fcloop_create_local_port(struct device 
        struct fcloop_ctrl_options *opts;
        struct nvme_fc_local_port *localport;
        struct fcloop_lport *lport;
-       int ret;
+       struct fcloop_lport_priv *lport_priv;
+       unsigned long flags;
+       int ret = -ENOMEM;
+       lport = kzalloc(sizeof(*lport), GFP_KERNEL);
+       if (!lport)
+               return -ENOMEM;
  
        opts = kzalloc(sizeof(*opts), GFP_KERNEL);
        if (!opts)
-               return -ENOMEM;
+               goto out_free_lport;
  
        ret = fcloop_parse_options(opts, buf);
        if (ret)
  
        ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
        if (!ret) {
-               unsigned long flags;
                /* success */
-               lport = localport->private;
+               lport_priv = localport->private;
+               lport_priv->lport = lport;
                lport->localport = localport;
                INIT_LIST_HEAD(&lport->lport_list);
  
                spin_lock_irqsave(&fcloop_lock, flags);
                list_add_tail(&lport->lport_list, &fcloop_lports);
                spin_unlock_irqrestore(&fcloop_lock, flags);
-               /* mark all of the input buffer consumed */
-               ret = count;
        }
  
  out_free_opts:
        kfree(opts);
+ out_free_lport:
+       /* free only if we're going to fail */
+       if (ret)
+               kfree(lport);
        return ret ? ret : count;
  }
  
@@@ -790,6 -936,8 +936,8 @@@ __wait_localport_unreg(struct fcloop_lp
  
        wait_for_completion(&lport->unreg_done);
  
+       kfree(lport);
        return ret;
  }
  
@@@ -1085,7 -1233,7 +1233,7 @@@ fcloop_delete_target_port(struct devic
                const char *buf, size_t count)
  {
        struct fcloop_nport *nport = NULL, *tmpport;
 -      struct fcloop_tport *tport;
 +      struct fcloop_tport *tport = NULL;
        u64 nodename, portname;
        unsigned long flags;
        int ret;
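
The fcloop rework above gives each target-side request a kref and an explicit initiator state (INI_IO_START/ACTIVE/ABORTED/COMPLETED), so the abort path can take an extra reference before scheduling its work item and drop it when that work finishes. A generic sketch of that kref idiom, separate from the fcloop specifics; all io_req_sketch names below are illustrative:

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Generic request with a reference count and an async abort handler. */
struct io_req_sketch {
        struct kref ref;
        struct work_struct abort_work;
};

static void io_req_free(struct kref *ref)
{
        kfree(container_of(ref, struct io_req_sketch, ref));
}

static void abort_work_fn(struct work_struct *work)
{
        struct io_req_sketch *req =
                container_of(work, struct io_req_sketch, abort_work);

        /* ... deliver the abort ... */
        kref_put(&req->ref, io_req_free);       /* drop the work's reference */
}

static struct io_req_sketch *io_req_alloc(void)
{
        struct io_req_sketch *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req)
                return NULL;
        kref_init(&req->ref);                   /* original I/O reference */
        INIT_WORK(&req->abort_work, abort_work_fn);
        return req;
}

static void start_abort(struct io_req_sketch *req)
{
        if (!kref_get_unless_zero(&req->ref))   /* request already freed */
                return;
        schedule_work(&req->abort_work);        /* reference travels with the work */
}
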
diff --combined include/linux/bio.h
index 23d29b39f71e83e8a6a25540adc2e3f28702aec7,367a979fd4a6f250c6778b0dfe29b6b6450ddc6d..d0eb659fa733eb91b57a135932f45b1eea8d9975
@@@ -300,6 -300,29 +300,29 @@@ static inline void bio_get_last_bvec(st
                bv->bv_len = iter.bi_bvec_done;
  }
  
+ static inline unsigned bio_pages_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return bio->bi_vcnt;
+ }
+ static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return bio->bi_io_vec;
+ }
+ static inline struct page *bio_first_page_all(struct bio *bio)
+ {
+       return bio_first_bvec_all(bio)->bv_page;
+ }
+ static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return &bio->bi_io_vec[bio->bi_vcnt - 1];
+ }
  enum bip_flags {
        BIP_BLOCK_INTEGRITY     = 1 << 0, /* block layer owns integrity data */
        BIP_MAPPED_INTEGRITY    = 1 << 1, /* ref tag has been remapped */
@@@ -477,7 -500,6 +500,6 @@@ static inline void bio_flush_dcache_pag
  #endif
  
  extern void bio_copy_data(struct bio *dst, struct bio *src);
- extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
  extern void bio_free_pages(struct bio *bio);
  
  extern struct bio *bio_copy_user_iov(struct request_queue *,
@@@ -492,8 -514,6 +514,8 @@@ extern unsigned int bvec_nr_vecs(unsign
  
  #define bio_set_dev(bio, bdev)                        \
  do {                                          \
 +      if ((bio)->bi_disk != (bdev)->bd_disk)  \
 +              bio_clear_flag(bio, BIO_THROTTLED);\
        (bio)->bi_disk = (bdev)->bd_disk;       \
        (bio)->bi_partno = (bdev)->bd_partno;   \
  } while (0)
diff --combined include/linux/blk_types.h
index 9e7d8bd776d227d2ba92b137af7230300f5b1d4a,2d973ac54b09f084c35e499c15302fd3066f4823..c5d3db0d83f8ac1adf177f0f92c7bf3ed0e2c261
@@@ -39,6 -39,34 +39,34 @@@ typedef u8 __bitwise blk_status_t
  
  #define BLK_STS_AGAIN         ((__force blk_status_t)12)
  
+ /**
+  * blk_path_error - returns true if error may be path related
+  * @error: status the request was completed with
+  *
+  * Description:
+  *     This classifies block error status into non-retryable errors and ones
+  *     that may be successful if retried on a failover path.
+  *
+  * Return:
+  *     %false - retrying failover path will not help
+  *     %true  - may succeed if retried
+  */
+ static inline bool blk_path_error(blk_status_t error)
+ {
+       switch (error) {
+       case BLK_STS_NOTSUPP:
+       case BLK_STS_NOSPC:
+       case BLK_STS_TARGET:
+       case BLK_STS_NEXUS:
+       case BLK_STS_MEDIUM:
+       case BLK_STS_PROTECTION:
+               return false;
+       }
+       /* Anything else could be a path failure, so should be retried */
+       return true;
+ }
  struct blk_issue_stat {
        u64 stat;
  };
@@@ -50,6 -78,8 +78,6 @@@
  struct bio {
        struct bio              *bi_next;       /* request queue link */
        struct gendisk          *bi_disk;
 -      u8                      bi_partno;
 -      blk_status_t            bi_status;
        unsigned int            bi_opf;         /* bottom bits req flags,
                                                 * top bits REQ_OP. Use
                                                 * accessors.
@@@ -57,8 -87,8 +85,8 @@@
        unsigned short          bi_flags;       /* status, etc and bvec pool number */
        unsigned short          bi_ioprio;
        unsigned short          bi_write_hint;
 -
 -      struct bvec_iter        bi_iter;
 +      blk_status_t            bi_status;
 +      u8                      bi_partno;
  
        /* Number of segments in this BIO after
         * physical address coalescing is performed.
        unsigned int            bi_seg_front_size;
        unsigned int            bi_seg_back_size;
  
 -      atomic_t                __bi_remaining;
 +      struct bvec_iter        bi_iter;
  
 +      atomic_t                __bi_remaining;
        bio_end_io_t            *bi_end_io;
  
        void                    *bi_private;
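
blk_path_error() above gives multipath consumers one predicate for "could this status succeed on another path". A hedged sketch of how a failover check might use it together with the blk_status_t now passed to nvme_req_needs_failover() in the nvme.h hunk; the REQ_NVME_MPATH flag follows the nvme multipath code but is not shown in this diff, and the _sketch body is illustrative rather than the core.c implementation:

/* Illustrative failover predicate, not the actual nvme core body. */
static bool needs_failover_sketch(struct request *req, blk_status_t error)
{
        if (!(req->cmd_flags & REQ_NVME_MPATH))         /* only multipath requests */
                return false;
        return blk_path_error(error);                   /* retryable on another path? */
}
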
diff --combined include/linux/blkdev.h
index 0ce8a372d5069a7aca7810429a968d20e923d3d1,afc43fb63c1604009f9539fd81ae55502a7a4a16..4f3df807cf8f73076ca6e735b901b14360528aa6
@@@ -27,6 -27,8 +27,8 @@@
  #include <linux/percpu-refcount.h>
  #include <linux/scatterlist.h>
  #include <linux/blkzoned.h>
+ #include <linux/seqlock.h>
+ #include <linux/u64_stats_sync.h>
  
  struct module;
  struct scsi_ioctl_command;
@@@ -121,6 -123,12 +123,12 @@@ typedef __u32 __bitwise req_flags_t
  /* Look at ->special_vec for the actual data payload instead of the
     bio chain. */
  #define RQF_SPECIAL_PAYLOAD   ((__force req_flags_t)(1 << 18))
+ /* The per-zone write lock is held for this request */
+ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+ /* timeout is expired */
+ #define RQF_MQ_TIMEOUT_EXPIRED        ((__force req_flags_t)(1 << 20))
+ /* already slept for hybrid poll */
+ #define RQF_MQ_POLL_SLEPT     ((__force req_flags_t)(1 << 21))
  
  /* flags that prevent us from merging requests: */
  #define RQF_NOMERGE_FLAGS \
   * especially blk_mq_rq_ctx_init() to take care of the added fields.
   */
  struct request {
-       struct list_head queuelist;
-       union {
-               struct __call_single_data csd;
-               u64 fifo_time;
-       };
        struct request_queue *q;
        struct blk_mq_ctx *mq_ctx;
  
  
        int internal_tag;
  
-       unsigned long atomic_flags;
        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
        int tag;
        struct bio *bio;
        struct bio *biotail;
  
+       struct list_head queuelist;
        /*
         * The hash is used inside the scheduler, and killed once the
         * request reaches the dispatch list. The ipi_list is only used
        struct hd_struct *part;
        unsigned long start_time;
        struct blk_issue_stat issue_stat;
- #ifdef CONFIG_BLK_CGROUP
-       struct request_list *rl;                /* rl this rq is alloced from */
-       unsigned long long start_time_ns;
-       unsigned long long io_start_time_ns;    /* when passed to hardware */
- #endif
        /* Number of scatter-gather DMA addr+len pairs after
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
  #if defined(CONFIG_BLK_DEV_INTEGRITY)
        unsigned short nr_integrity_segments;
  #endif
  
+       unsigned short write_hint;
        unsigned short ioprio;
  
        unsigned int timeout;
  
        unsigned int extra_len; /* length of alignment and padding */
  
-       unsigned short write_hint;
+       /*
+        * On blk-mq, the lower bits of ->gstate (generation number and
+        * state) carry the MQ_RQ_* state value and the upper bits the
+        * generation number which is monotonically incremented and used to
+        * distinguish the reuse instances.
+        *
+        * ->gstate_seq allows updates to ->gstate and other fields
+        * (currently ->deadline) during request start to be read
+        * atomically from the timeout path, so that it can operate on a
+        * coherent set of information.
+        */
+       seqcount_t gstate_seq;
+       u64 gstate;
+       /*
+        * ->aborted_gstate is used by the timeout to claim a specific
+        * recycle instance of this request.  See blk_mq_timeout_work().
+        */
+       struct u64_stats_sync aborted_gstate_sync;
+       u64 aborted_gstate;
+       /* access through blk_rq_set_deadline, blk_rq_deadline */
+       unsigned long __deadline;
  
-       unsigned long deadline;
        struct list_head timeout_list;
  
 -              call_single_data_t csd;
+       union {
++              struct __call_single_data csd;
+               u64 fifo_time;
+       };
        /*
         * completion callback.
         */
  
        /* for bidi */
        struct request *next_rq;
+ #ifdef CONFIG_BLK_CGROUP
+       struct request_list *rl;                /* rl this rq is alloced from */
+       unsigned long long start_time_ns;
+       unsigned long long io_start_time_ns;    /* when passed to hardware */
+ #endif
  };
  
 +static inline bool blk_op_is_scsi(unsigned int op)
 +{
 +      return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
 +}
 +
 +static inline bool blk_op_is_private(unsigned int op)
 +{
 +      return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 +}
 +
  static inline bool blk_rq_is_scsi(struct request *rq)
  {
 -      return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
 +      return blk_op_is_scsi(req_op(rq));
  }
  
  static inline bool blk_rq_is_private(struct request *rq)
  {
 -      return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
 +      return blk_op_is_private(req_op(rq));
  }
  
  static inline bool blk_rq_is_passthrough(struct request *rq)
        return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
  }
  
 +static inline bool bio_is_passthrough(struct bio *bio)
 +{
 +      unsigned op = bio_op(bio);
 +
 +      return blk_op_is_scsi(op) || blk_op_is_private(op);
 +}
 +
  static inline unsigned short req_get_ioprio(struct request *req)
  {
        return req->ioprio;
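
The new ->gstate, ->gstate_seq and ->aborted_gstate fields above carry the reworked blk-mq timeout state: the start path updates generation/state and deadline under ->gstate_seq, the timeout path reads them as one coherent snapshot, and the completion path compares against ->aborted_gstate (protected by ->aborted_gstate_sync) to decide whether the timeout handler has claimed this recycle instance. A rough sketch of the two read-side idioms, with illustrative _sketch names; the deadline read is simplified here, the struct comment says to go through blk_rq_deadline():

/* Read-side idioms for the seqcount- and u64_stats-protected fields above. */
static void timeout_snapshot_sketch(struct request *rq, u64 *gstate,
                                    unsigned long *deadline)
{
        unsigned int start;

        do {
                start = read_seqcount_begin(&rq->gstate_seq);
                *gstate = rq->gstate;
                *deadline = rq->__deadline;     /* simplified; see blk_rq_deadline() */
        } while (read_seqcount_retry(&rq->gstate_seq, start));
}

static u64 aborted_gstate_sketch(struct request *rq)
{
        unsigned int start;
        u64 aborted;

        do {
                start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
                aborted = rq->aborted_gstate;
        } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));

        return aborted;
}
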
@@@ -563,6 -577,22 +594,22 @@@ struct request_queue 
  
        struct queue_limits     limits;
  
+       /*
+        * Zoned block device information for request dispatch control.
+        * nr_zones is the total number of zones of the device. This is always
+        * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+        * bits which indicates if a zone is conventional (bit clear) or
+        * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+        * bits which indicates if a zone is write locked, that is, if a write
+        * request targeting the zone was dispatched. All three fields are
+        * initialized by the low level device driver (e.g. scsi/sd.c).
+        * Stacking drivers (device mappers) may or may not initialize
+        * these fields.
+        */
+       unsigned int            nr_zones;
+       unsigned long           *seq_zones_bitmap;
+       unsigned long           *seq_zones_wlock;
        /*
         * sg stuff
         */
@@@ -807,6 -837,27 +854,27 @@@ static inline unsigned int blk_queue_zo
        return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
  }
  
+ static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+ {
+       return q->nr_zones;
+ }
+ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+                                            sector_t sector)
+ {
+       if (!blk_queue_is_zoned(q))
+               return 0;
+       return sector >> ilog2(q->limits.chunk_sectors);
+ }
+ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+                                        sector_t sector)
+ {
+       if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+               return false;
+       return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+ }
  static inline bool rq_is_sync(struct request *rq)
  {
        return op_is_sync(rq->cmd_flags);
@@@ -965,7 -1016,7 +1033,7 @@@ extern int blk_rq_prep_clone(struct req
  extern void blk_rq_unprep_clone(struct request *rq);
  extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
                                     struct request *rq);
 -extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
 +extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
  extern void blk_delay_queue(struct request_queue *, unsigned long);
  extern void blk_queue_split(struct request_queue *, struct bio **);
  extern void blk_recount_segments(struct request_queue *, struct bio *);
@@@ -1046,6 -1097,16 +1114,16 @@@ static inline unsigned int blk_rq_cur_s
        return blk_rq_cur_bytes(rq) >> 9;
  }
  
+ static inline unsigned int blk_rq_zone_no(struct request *rq)
+ {
+       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+ }
+ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+ {
+       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+ }
  /*
   * Some commands like WRITE SAME have a payload or data transfer size which
   * is different from the size of the request.  Any driver that supports such
@@@ -1595,7 -1656,15 +1673,15 @@@ static inline unsigned int bdev_zone_se
  
        if (q)
                return blk_queue_zone_sectors(q);
+       return 0;
+ }
+ static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+ {
+       struct request_queue *q = bdev_get_queue(bdev);
  
+       if (q)
+               return blk_queue_nr_zones(q);
        return 0;
  }
  
@@@ -1731,8 -1800,6 +1817,6 @@@ static inline bool req_gap_front_merge(
  
  int kblockd_schedule_work(struct work_struct *work);
  int kblockd_schedule_work_on(int cpu, struct work_struct *work);
- int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
  int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
  
  #ifdef CONFIG_BLK_CGROUP
@@@ -1971,6 -2038,60 +2055,60 @@@ extern int __blkdev_driver_ioctl(struc
  extern int bdev_read_page(struct block_device *, sector_t, struct page *);
  extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
+ #ifdef CONFIG_BLK_DEV_ZONED
+ bool blk_req_needs_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_unlock(struct request *rq);
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+       if (blk_req_needs_zone_write_lock(rq))
+               __blk_req_zone_write_lock(rq);
+ }
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+               __blk_req_zone_write_unlock(rq);
+ }
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+       return rq->q->seq_zones_wlock &&
+               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+ }
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+       if (!blk_req_needs_zone_write_lock(rq))
+               return true;
+       return !blk_req_zone_is_write_locked(rq);
+ }
+ #else
+ static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+ {
+       return false;
+ }
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+ }
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+ }
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+       return false;
+ }
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+       return true;
+ }
+ #endif /* CONFIG_BLK_DEV_ZONED */
  #else /* CONFIG_BLOCK */
  
  struct block_device;
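
The CONFIG_BLK_DEV_ZONED helpers above let an I/O scheduler serialize writes per sequential zone: a request whose zone is already write-locked is held back, and the lock is taken when the request is actually dispatched and released on completion or requeue. A hedged sketch of how a dispatch path might use them; the _sketch functions are illustrative, not the scheduler changes themselves:

/* Illustrative zone write-lock usage around dispatch and completion. */
static struct request *dispatch_sketch(struct request *rq)
{
        if (!blk_req_can_dispatch_to_zone(rq))  /* zone write-locked: hold back */
                return NULL;
        blk_req_zone_write_lock(rq);            /* no-op unless a zoned write */
        return rq;
}

static void finish_sketch(struct request *rq)
{
        blk_req_zone_write_unlock(rq);          /* release at completion/requeue */
}
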
diff --combined kernel/power/swap.c
index a46be1261c095e74fa49f363b917ae7996a29cbd,96c736313faab8e536257fc191a800753166a7c2..11b4282c2d2031d8208d53917f78d3aec5323bf0
@@@ -240,7 -240,7 +240,7 @@@ static void hib_init_batch(struct hib_b
  static void hib_end_io(struct bio *bio)
  {
        struct hib_bio_batch *hb = bio->bi_private;
-       struct page *page = bio->bi_io_vec[0].bv_page;
+       struct page *page = bio_first_page_all(bio);
  
        if (bio->bi_status) {
                pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
@@@ -879,7 -879,7 +879,7 @@@ out_clean
   *    space avaiable from the resume partition.
   */
  
 -static int enough_swap(unsigned int nr_pages, unsigned int flags)
 +static int enough_swap(unsigned int nr_pages)
  {
        unsigned int free_swap = count_swap_pages(root_swap, 1);
        unsigned int required;
@@@ -915,7 -915,7 +915,7 @@@ int swsusp_write(unsigned int flags
                return error;
        }
        if (flags & SF_NOCOMPRESS_MODE) {
 -              if (!enough_swap(pages, flags)) {
 +              if (!enough_swap(pages)) {
                        pr_err("Not enough free swap\n");
                        error = -ENOSPC;
                        goto out_finish;