Merge branch 'for-4.16/block' of git://git.kernel.dk/linux-block
author     Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 29 Jan 2018 19:51:49 +0000 (11:51 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Mon, 29 Jan 2018 19:51:49 +0000 (11:51 -0800)
Pull block updates from Jens Axboe:
 "This is the main pull request for block IO related changes for the
  4.16 kernel. Nothing major in this pull request, but a good amount of
  improvements and fixes all over the map. This contains:

   - BFQ improvements, fixes, and cleanups from Angelo, Chiara, and
     Paolo.

   - Support for SMR zones for deadline and mq-deadline from Damien and
     Christoph.

   - Set of fixes for bcache by way of Michael Lyle, including fixes
     from himself, Kent, Rui, Tang, and Coly.

   - Series from Matias for lightnvm with fixes from Hans Holmberg,
     Javier, and Matias. Mostly centered around pblk, and the removal of
     rrpc 1.2 in preparation for supporting 2.0.

   - A couple of NVMe pull requests from Christoph. Nothing major in
     here, just fixes and cleanups, and support for command tracing from
     Johannes.

   - Support in blk-throttle for tracking reads and writes separately.
     From Joseph Qi. A few cleanups/fixes also for blk-throttle from
     Weiping.

   - Series from Mike Snitzer that enables dm to register its queue more
     logically, something that's always been problematic on dm since
     it's a stacked device.

   - Series from Ming cleaning up some of the bio accessor use, in
     preparation for supporting multipage bvecs.

   - Various fixes from Ming closing up holes around queue mapping and
     quiescing.

   - BSD partition fix from Richard Narron, fixing a problem where we
     can't mount newer (10/11) FreeBSD partitions.

   - Series from Tejun reworking blk-mq timeout handling. The previous
     scheme relied on atomic bits, but it had races where we would think
     a request had timed out if it was reused at the wrong time (a
     minimal sketch of the new scheme follows the commit list below).

   - null_blk now supports faking timeouts, to enable us to better
     exercise and test that functionality separately. From me.

   - Kill the separate atomic poll bit in the request struct. After
     this, we don't use the atomic bits on blk-mq anymore at all. From
     me.

   - sgl_alloc/free helpers from Bart (a short usage sketch follows
     right after this message).

   - Heavily contended tag case scalability improvement from me.

   - Various little fixes and cleanups from Arnd, Bart, Corentin,
     Douglas, Eryu, Goldwyn, and myself"
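
A minimal usage sketch of the sgl_alloc()/sgl_free() helpers mentioned above
(they live in lib/scatterlist.c behind CONFIG_SGL_ALLOC). The demo_fill_sgl()
wrapper and its length argument are invented for illustration; only the
sgl_alloc(), sgl_free() and for_each_sg() calls are the actual API:

	#include <linux/scatterlist.h>
	#include <linux/gfp.h>
	#include <linux/errno.h>

	/* illustrative only: cover @len bytes with a freshly allocated sgl */
	static int demo_fill_sgl(unsigned long long len)
	{
		struct scatterlist *sgl, *sg;
		unsigned int nents;
		int i;

		/* allocates order-0 pages plus the scatterlist covering @len */
		sgl = sgl_alloc(len, GFP_KERNEL, &nents);
		if (!sgl)
			return -ENOMEM;

		for_each_sg(sgl, sg, nents, i) {
			/* hand sg_page(sg) / sg->length to the device here */
		}

		sgl_free(sgl);	/* frees both the pages and the scatterlist */
		return 0;
	}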

* 'for-4.16/block' of git://git.kernel.dk/linux-block: (186 commits)
  block: remove smart1,2.h
  nvme: add tracepoint for nvme_complete_rq
  nvme: add tracepoint for nvme_setup_cmd
  nvme-pci: introduce RECONNECTING state to mark initializing procedure
  nvme-rdma: remove redundant boolean for inline_data
  nvme: don't free uuid pointer before printing it
  nvme-pci: Suspend queues after deleting them
  bsg: use pr_debug instead of hand crafted macros
  blk-mq-debugfs: don't allow write on attributes with seq_operations set
  nvme-pci: Fix queue double allocations
  block: Set BIO_TRACE_COMPLETION on new bio during split
  blk-throttle: use queue_is_rq_based
  block: Remove kblockd_schedule_delayed_work{,_on}()
  blk-mq: Avoid that blk_mq_delay_run_hw_queue() introduces unintended delays
  blk-mq: Rename blk_mq_request_direct_issue() into blk_mq_request_issue_directly()
  lib/scatterlist: Fix chaining support in sgl_alloc_order()
  blk-throttle: track read and write request individually
  block: add bdev_read_only() checks to common helpers
  block: fail op_is_write() requests to read-only partitions
  blk-throttle: export io_serviced_recursive, io_service_bytes_recursive
  ...
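
A minimal sketch of the seqcount-based scheme that the blk-mq timeout rework
in the diff below relies on (struct demo_rq and the demo_* helpers are
illustrative; the real fields are rq->gstate, rq->gstate_seq and
rq->aborted_gstate, initialized in blk_rq_init()/blk_mq_init_request()): the
issue path publishes the new generation and deadline under the seqcount, and
the timeout path reads both back as one coherent snapshot before judging the
request overdue.

	#include <linux/seqlock.h>
	#include <linux/jiffies.h>
	#include <linux/preempt.h>
	#include <linux/types.h>

	struct demo_rq {
		seqcount_t	gstate_seq;	/* orders gstate/deadline updates */
		u64		gstate;		/* generation, bumped on each issue */
		unsigned long	deadline;	/* jiffies at which the rq expires */
	};

	/* issue path: bump the generation and arm the timeout together */
	static void demo_issue(struct demo_rq *rq, unsigned long timeout)
	{
		preempt_disable();
		write_seqcount_begin(&rq->gstate_seq);
		rq->gstate++;
		rq->deadline = jiffies + timeout;
		write_seqcount_end(&rq->gstate_seq);
		preempt_enable();
	}

	/* timeout path: take a coherent (gstate, deadline) snapshot, then test */
	static bool demo_overdue(struct demo_rq *rq, u64 *gstate)
	{
		unsigned int start;
		unsigned long deadline;

		do {
			start = read_seqcount_begin(&rq->gstate_seq);
			*gstate = rq->gstate;
			deadline = rq->deadline;
		} while (read_seqcount_retry(&rq->gstate_seq, start));

		/* the real code also compares *gstate against rq->aborted_gstate */
		return time_after_eq(jiffies, deadline);
	}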

20 files changed:
block/bio.c
block/blk-core.c
block/blk-map.c
block/blk-mq.c
block/blk-throttle.c
block/blk.h
block/bounce.c
drivers/block/null_blk.c
drivers/md/dm-crypt.c
drivers/nvme/host/core.c
drivers/nvme/host/fabrics.c
drivers/nvme/host/fc.c
drivers/nvme/host/nvme.h
drivers/nvme/host/pci.c
drivers/nvme/host/rdma.c
drivers/nvme/target/fcloop.c
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
kernel/power/swap.c

diff --combined block/bio.c
index 9ef6cf3addb38cae822d0e5c5ef18ba9e98cd2d7,77993fb4bac682e0ed83d2a7a6bdd5f5d090b0ba..e1708db48258cb9bc8487d732074f476dc1fb520
@@@ -599,8 -599,6 +599,8 @@@ void __bio_clone_fast(struct bio *bio, 
        bio->bi_disk = bio_src->bi_disk;
        bio->bi_partno = bio_src->bi_partno;
        bio_set_flag(bio, BIO_CLONED);
 +      if (bio_flagged(bio_src, BIO_THROTTLED))
 +              bio_set_flag(bio, BIO_THROTTLED);
        bio->bi_opf = bio_src->bi_opf;
        bio->bi_write_hint = bio_src->bi_write_hint;
        bio->bi_iter = bio_src->bi_iter;
@@@ -970,34 -968,6 +970,6 @@@ void bio_advance(struct bio *bio, unsig
  }
  EXPORT_SYMBOL(bio_advance);
  
- /**
-  * bio_alloc_pages - allocates a single page for each bvec in a bio
-  * @bio: bio to allocate pages for
-  * @gfp_mask: flags for allocation
-  *
-  * Allocates pages up to @bio->bi_vcnt.
-  *
-  * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
-  * freed.
-  */
- int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
- {
-       int i;
-       struct bio_vec *bv;
-       bio_for_each_segment_all(bv, bio, i) {
-               bv->bv_page = alloc_page(gfp_mask);
-               if (!bv->bv_page) {
-                       while (--bv >= bio->bi_io_vec)
-                               __free_page(bv->bv_page);
-                       return -ENOMEM;
-               }
-       }
-       return 0;
- }
- EXPORT_SYMBOL(bio_alloc_pages);
  /**
   * bio_copy_data - copy contents of data buffers from one chain of bios to
   * another
@@@ -1838,7 -1808,7 +1810,7 @@@ struct bio *bio_split(struct bio *bio, 
        bio_advance(bio, split->bi_iter.bi_size);
  
        if (bio_flagged(bio, BIO_TRACE_COMPLETION))
-               bio_set_flag(bio, BIO_TRACE_COMPLETION);
+               bio_set_flag(split, BIO_TRACE_COMPLETION);
  
        return split;
  }
diff --combined block/blk-core.c
index 3ba4326a63b59632fad81e686bf8229eee07320b,cdae69be68e9c779ff70028c60ab2bfd44786d6e..a2005a485335b5b42082bf02ffd7b3d1e90b3f3c
@@@ -126,6 -126,8 +126,8 @@@ void blk_rq_init(struct request_queue *
        rq->start_time = jiffies;
        set_start_time_ns(rq);
        rq->part = NULL;
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
  }
  EXPORT_SYMBOL(blk_rq_init);
  
@@@ -562,13 -564,6 +564,13 @@@ static void __blk_drain_queue(struct re
        }
  }
  
 +void blk_drain_queue(struct request_queue *q)
 +{
 +      spin_lock_irq(q->queue_lock);
 +      __blk_drain_queue(q, true);
 +      spin_unlock_irq(q->queue_lock);
 +}
 +
  /**
   * blk_queue_bypass_start - enter queue bypass mode
   * @q: queue of interest
@@@ -696,9 -691,20 +698,18 @@@ void blk_cleanup_queue(struct request_q
         */
        blk_freeze_queue(q);
        spin_lock_irq(lock);
 -      if (!q->mq_ops)
 -              __blk_drain_queue(q, true);
        queue_flag_set(QUEUE_FLAG_DEAD, q);
        spin_unlock_irq(lock);
  
+       /*
+        * make sure all in-progress dispatch are completed because
+        * blk_freeze_queue() can only complete all requests, and
+        * dispatch may still be in-progress since we dispatch requests
+        * from more than one contexts
+        */
+       if (q->mq_ops)
+               blk_mq_quiesce_queue(q);
        /* for synchronous bio-based driver finish in-flight integrity i/o */
        blk_flush_integrity();
  
@@@ -1646,6 -1652,7 +1657,7 @@@ void __blk_put_request(struct request_q
  
        lockdep_assert_held(q->queue_lock);
  
+       blk_req_zone_write_unlock(req);
        blk_pm_put_request(req);
  
        elv_completed_request(q, req);
@@@ -2055,6 -2062,21 +2067,21 @@@ static inline bool should_fail_request(
  
  #endif /* CONFIG_FAIL_MAKE_REQUEST */
  
+ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+ {
+       if (part->policy && op_is_write(bio_op(bio))) {
+               char b[BDEVNAME_SIZE];
+               printk(KERN_ERR
+                      "generic_make_request: Trying to write "
+                       "to read-only block-device %s (partno %d)\n",
+                       bio_devname(bio, b), part->partno);
+               return true;
+       }
+       return false;
+ }
  /*
   * Remap block n of partition p to block n+start(p) of the disk.
   */
@@@ -2063,27 -2085,28 +2090,28 @@@ static inline int blk_partition_remap(s
        struct hd_struct *p;
        int ret = 0;
  
+       rcu_read_lock();
+       p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+       if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+                    bio_check_ro(bio, p))) {
+               ret = -EIO;
+               goto out;
+       }
        /*
         * Zone reset does not include bi_size so bio_sectors() is always 0.
         * Include a test for the reset op code and perform the remap if needed.
         */
-       if (!bio->bi_partno ||
-           (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
-               return 0;
+       if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+               goto out;
  
-       rcu_read_lock();
-       p = __disk_get_part(bio->bi_disk, bio->bi_partno);
-       if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
-               bio->bi_iter.bi_sector += p->start_sect;
-               bio->bi_partno = 0;
-               trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
-                               bio->bi_iter.bi_sector - p->start_sect);
-       } else {
-               printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
-               ret = -EIO;
-       }
-       rcu_read_unlock();
+       bio->bi_iter.bi_sector += p->start_sect;
+       bio->bi_partno = 0;
+       trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+                             bio->bi_iter.bi_sector - p->start_sect);
  
+ out:
+       rcu_read_unlock();
        return ret;
  }
  
@@@ -2142,15 -2165,19 +2170,19 @@@ generic_make_request_checks(struct bio 
         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
         * if queue is not a request based queue.
         */
        if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
                goto not_supported;
  
        if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
                goto end_io;
  
-       if (blk_partition_remap(bio))
-               goto end_io;
+       if (!bio->bi_partno) {
+               if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+                       goto end_io;
+       } else {
+               if (blk_partition_remap(bio))
+                       goto end_io;
+       }
  
        if (bio_check_eod(bio, nr_sectors))
                goto end_io;
@@@ -2493,8 -2520,7 +2525,7 @@@ blk_status_t blk_insert_cloned_request(
                 * bypass a potential scheduler on the bottom device for
                 * insert.
                 */
-               blk_mq_request_bypass_insert(rq, true);
-               return BLK_STS_OK;
+               return blk_mq_request_issue_directly(rq);
        }
  
        spin_lock_irqsave(q->queue_lock, flags);
@@@ -2846,7 -2872,7 +2877,7 @@@ void blk_start_request(struct request *
                wbt_issue(req->q->rq_wb, &req->issue_stat);
        }
  
-       BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+       BUG_ON(blk_rq_is_complete(req));
        blk_add_timer(req);
  }
  EXPORT_SYMBOL(blk_start_request);
@@@ -3415,20 -3441,6 +3446,6 @@@ int kblockd_mod_delayed_work_on(int cpu
  }
  EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
  
- int kblockd_schedule_delayed_work(struct delayed_work *dwork,
-                                 unsigned long delay)
- {
-       return queue_delayed_work(kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work);
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
-                                    unsigned long delay)
- {
-       return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
  /**
   * blk_start_plug - initialize blk_plug and track it inside the task_struct
   * @plug:     The &struct blk_plug that needs to be initialized
diff --combined block/blk-map.c
index d3a94719f03fb2af81d6270d6fc9ed58f0dde373,209eb3b45c54d95cb4dfff07cd78a01ac2b4fd91..db9373bd31aca0e9393dd77ec9ef5f404d88923e
  #include "blk.h"
  
  /*
 - * Append a bio to a passthrough request.  Only works can be merged into
 - * the request based on the driver constraints.
 + * Append a bio to a passthrough request.  Only works if the bio can be merged
 + * into the request based on the driver constraints.
   */
 -int blk_rq_append_bio(struct request *rq, struct bio *bio)
 +int blk_rq_append_bio(struct request *rq, struct bio **bio)
  {
 -      blk_queue_bounce(rq->q, &bio);
 +      struct bio *orig_bio = *bio;
 +
 +      blk_queue_bounce(rq->q, bio);
  
        if (!rq->bio) {
 -              blk_rq_bio_prep(rq->q, rq, bio);
 +              blk_rq_bio_prep(rq->q, rq, *bio);
        } else {
 -              if (!ll_back_merge_fn(rq->q, rq, bio))
 +              if (!ll_back_merge_fn(rq->q, rq, *bio)) {
 +                      if (orig_bio != *bio) {
 +                              bio_put(*bio);
 +                              *bio = orig_bio;
 +                      }
                        return -EINVAL;
 +              }
  
 -              rq->biotail->bi_next = bio;
 -              rq->biotail = bio;
 -              rq->__data_len += bio->bi_iter.bi_size;
 +              rq->biotail->bi_next = *bio;
 +              rq->biotail = *bio;
 +              rq->__data_len += (*bio)->bi_iter.bi_size;
        }
  
        return 0;
@@@ -80,12 -73,14 +80,12 @@@ static int __blk_rq_map_user_iov(struc
         * We link the bounce buffer in and could have to traverse it
         * later so we have to get a ref to prevent it from being freed
         */
 -      ret = blk_rq_append_bio(rq, bio);
 -      bio_get(bio);
 +      ret = blk_rq_append_bio(rq, &bio);
        if (ret) {
 -              bio_endio(bio);
                __blk_rq_unmap_user(orig_bio);
 -              bio_put(bio);
                return ret;
        }
 +      bio_get(bio);
  
        return 0;
  }
@@@ -119,7 -114,7 +119,7 @@@ int blk_rq_map_user_iov(struct request_
        unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
        struct bio *bio = NULL;
        struct iov_iter i;
-       int ret;
+       int ret = -EINVAL;
  
        if (!iter_is_iovec(iter))
                goto fail;
@@@ -148,7 -143,7 +148,7 @@@ unmap_rq
        __blk_rq_unmap_user(bio);
  fail:
        rq->bio = NULL;
-       return -EINVAL;
+       return ret;
  }
  EXPORT_SYMBOL(blk_rq_map_user_iov);
  
@@@ -218,7 -213,7 +218,7 @@@ int blk_rq_map_kern(struct request_queu
        int reading = rq_data_dir(rq) == READ;
        unsigned long addr = (unsigned long) kbuf;
        int do_copy = 0;
 -      struct bio *bio;
 +      struct bio *bio, *orig_bio;
        int ret;
  
        if (len > (queue_max_hw_sectors(q) << 9))
        if (do_copy)
                rq->rq_flags |= RQF_COPY_USER;
  
 -      ret = blk_rq_append_bio(rq, bio);
 +      orig_bio = bio;
 +      ret = blk_rq_append_bio(rq, &bio);
        if (unlikely(ret)) {
                /* request is too big */
 -              bio_put(bio);
 +              bio_put(orig_bio);
                return ret;
        }
  
diff --combined block/blk-mq.c
index 3d379732749175ece7ae39e427e115f51621c521,43e7449723e0bf7ffa35c5c0814db261c81824fc..01f271d40825ebfd6ca82fdd2f887d432946799c
@@@ -95,8 -95,7 +95,7 @@@ static void blk_mq_check_inflight(struc
  {
        struct mq_inflight *mi = priv;
  
-       if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
-           !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+       if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
                /*
                 * index[0] counts the specific partition that was asked
                 * for. index[1] counts the ones that are active on the
@@@ -161,8 -160,6 +160,8 @@@ void blk_freeze_queue(struct request_qu
         * exported to drivers as the only user for unfreeze is blk_mq.
         */
        blk_freeze_queue_start(q);
 +      if (!q->mq_ops)
 +              blk_drain_queue(q);
        blk_mq_freeze_queue_wait(q);
  }
  
@@@ -222,7 -219,7 +221,7 @@@ void blk_mq_quiesce_queue(struct reques
  
        queue_for_each_hw_ctx(q, hctx, i) {
                if (hctx->flags & BLK_MQ_F_BLOCKING)
-                       synchronize_srcu(hctx->queue_rq_srcu);
+                       synchronize_srcu(hctx->srcu);
                else
                        rcu = true;
        }
@@@ -272,15 -269,14 +271,14 @@@ static struct request *blk_mq_rq_ctx_in
  {
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
-       rq->rq_flags = 0;
+       req_flags_t rq_flags = 0;
  
        if (data->flags & BLK_MQ_REQ_INTERNAL) {
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
                if (blk_mq_tag_busy(data->hctx)) {
-                       rq->rq_flags = RQF_MQ_INFLIGHT;
+                       rq_flags = RQF_MQ_INFLIGHT;
                        atomic_inc(&data->hctx->nr_active);
                }
                rq->tag = tag;
                data->hctx->tags->rqs[rq->tag] = rq;
        }
  
-       INIT_LIST_HEAD(&rq->queuelist);
        /* csd/requeue_work/fifo_time is initialized before use */
        rq->q = data->q;
        rq->mq_ctx = data->ctx;
+       rq->rq_flags = rq_flags;
+       rq->cpu = -1;
        rq->cmd_flags = op;
        if (data->flags & BLK_MQ_REQ_PREEMPT)
                rq->rq_flags |= RQF_PREEMPT;
        if (blk_queue_io_stat(data->q))
                rq->rq_flags |= RQF_IO_STAT;
-       /* do not touch atomic flags, it needs atomic ops against the timer */
-       rq->cpu = -1;
+       INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);
        rq->rq_disk = NULL;
        rq->part = NULL;
        rq->start_time = jiffies;
- #ifdef CONFIG_BLK_CGROUP
-       rq->rl = NULL;
-       set_start_time_ns(rq);
-       rq->io_start_time_ns = 0;
- #endif
        rq->nr_phys_segments = 0;
  #if defined(CONFIG_BLK_DEV_INTEGRITY)
        rq->nr_integrity_segments = 0;
        rq->special = NULL;
        /* tag was already set */
        rq->extra_len = 0;
+       rq->__deadline = 0;
  
        INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;
        rq->end_io_data = NULL;
        rq->next_rq = NULL;
  
+ #ifdef CONFIG_BLK_CGROUP
+       rq->rl = NULL;
+       set_start_time_ns(rq);
+       rq->io_start_time_ns = 0;
+ #endif
        data->ctx->rq_dispatched[op_is_sync(op)]++;
        return rq;
  }
@@@ -443,7 -441,7 +443,7 @@@ struct request *blk_mq_alloc_request_hc
                blk_queue_exit(q);
                return ERR_PTR(-EXDEV);
        }
-       cpu = cpumask_first(alloc_data.hctx->cpumask);
+       cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
        alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
  
        rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@@ -485,8 -483,7 +485,7 @@@ void blk_mq_free_request(struct reques
        if (blk_rq_rl(rq))
                blk_put_rl(blk_rq_rl(rq));
  
-       clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+       blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
        if (rq->tag != -1)
                blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
        if (sched_tag != -1)
@@@ -532,6 -529,9 +531,9 @@@ static void __blk_mq_complete_request(s
        bool shared = false;
        int cpu;
  
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+       blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
        if (rq->internal_tag != -1)
                blk_mq_sched_completed_request(rq);
        if (rq->rq_flags & RQF_STATS) {
        put_cpu();
  }
  
+ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+       __releases(hctx->srcu)
+ {
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+               rcu_read_unlock();
+       else
+               srcu_read_unlock(hctx->srcu, srcu_idx);
+ }
+ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+       __acquires(hctx->srcu)
+ {
+       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+               /* shut up gcc false positive */
+               *srcu_idx = 0;
+               rcu_read_lock();
+       } else
+               *srcu_idx = srcu_read_lock(hctx->srcu);
+ }
+ static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+ {
+       unsigned long flags;
+       /*
+        * blk_mq_rq_aborted_gstate() is used from the completion path and
+        * can thus be called from irq context.  u64_stats_fetch in the
+        * middle of update on the same CPU leads to lockup.  Disable irq
+        * while updating.
+        */
+       local_irq_save(flags);
+       u64_stats_update_begin(&rq->aborted_gstate_sync);
+       rq->aborted_gstate = gstate;
+       u64_stats_update_end(&rq->aborted_gstate_sync);
+       local_irq_restore(flags);
+ }
+ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+ {
+       unsigned int start;
+       u64 aborted_gstate;
+       do {
+               start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+               aborted_gstate = rq->aborted_gstate;
+       } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+       return aborted_gstate;
+ }
  /**
   * blk_mq_complete_request - end I/O on a request
   * @rq:               the request being processed
  void blk_mq_complete_request(struct request *rq)
  {
        struct request_queue *q = rq->q;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+       int srcu_idx;
  
        if (unlikely(blk_should_fake_timeout(q)))
                return;
-       if (!blk_mark_rq_complete(rq))
+       /*
+        * If @rq->aborted_gstate equals the current instance, timeout is
+        * claiming @rq and we lost.  This is synchronized through
+        * hctx_lock().  See blk_mq_timeout_work() for details.
+        *
+        * Completion path never blocks and we can directly use RCU here
+        * instead of hctx_lock() which can be either RCU or SRCU.
+        * However, that would complicate paths which want to synchronize
+        * against us.  Let stay in sync with the issue path so that
+        * hctx_lock() covers both issue and completion paths.
+        */
+       hctx_lock(hctx, &srcu_idx);
+       if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
                __blk_mq_complete_request(rq);
+       hctx_unlock(hctx, srcu_idx);
  }
  EXPORT_SYMBOL(blk_mq_complete_request);
  
  int blk_mq_request_started(struct request *rq)
  {
-       return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+       return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
  }
  EXPORT_SYMBOL_GPL(blk_mq_request_started);
  
@@@ -598,34 -664,27 +666,27 @@@ void blk_mq_start_request(struct reques
                wbt_issue(q->rq_wb, &rq->issue_stat);
        }
  
-       blk_add_timer(rq);
-       WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
  
        /*
-        * Mark us as started and clear complete. Complete might have been
-        * set if requeue raced with timeout, which then marked it as
-        * complete. So be sure to clear complete again when we start
-        * the request, otherwise we'll ignore the completion event.
+        * Mark @rq in-flight which also advances the generation number,
+        * and register for timeout.  Protect with a seqcount to allow the
+        * timeout path to read both @rq->gstate and @rq->deadline
+        * coherently.
         *
-        * Ensure that ->deadline is visible before we set STARTED, such that
-        * blk_mq_check_expired() is guaranteed to observe our ->deadline when
-        * it observes STARTED.
+        * This is the only place where a request is marked in-flight.  If
+        * the timeout path reads an in-flight @rq->gstate, the
+        * @rq->deadline it reads together under @rq->gstate_seq is
+        * guaranteed to be the matching one.
         */
-       smp_wmb();
-       set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
-       if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
-               /*
-                * Coherence order guarantees these consecutive stores to a
-                * single variable propagate in the specified order. Thus the
-                * clear_bit() is ordered _after_ the set bit. See
-                * blk_mq_check_expired().
-                *
-                * (the bits must be part of the same byte for this to be
-                * true).
-                */
-               clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
-       }
+       preempt_disable();
+       write_seqcount_begin(&rq->gstate_seq);
+       blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+       blk_add_timer(rq);
+       write_seqcount_end(&rq->gstate_seq);
+       preempt_enable();
  
        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
  EXPORT_SYMBOL(blk_mq_start_request);
  
  /*
-  * When we reach here because queue is busy, REQ_ATOM_COMPLETE
-  * flag isn't set yet, so there may be race with timeout handler,
-  * but given rq->deadline is just set in .queue_rq() under
-  * this situation, the race won't be possible in reality because
-  * rq->timeout should be set as big enough to cover the window
-  * between blk_mq_start_request() called from .queue_rq() and
-  * clearing REQ_ATOM_STARTED here.
+  * When we reach here because queue is busy, it's safe to change the state
+  * to IDLE without checking @rq->aborted_gstate because we should still be
+  * holding the RCU read lock and thus protected against timeout.
   */
  static void __blk_mq_requeue_request(struct request *rq)
  {
        wbt_requeue(q->rq_wb, &rq->issue_stat);
        blk_mq_sched_requeue_request(rq);
  
-       if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+       if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
+               blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
                if (q->dma_drain_size && blk_rq_bytes(rq))
                        rq->nr_phys_segments--;
        }
@@@ -689,13 -745,13 +747,13 @@@ static void blk_mq_requeue_work(struct 
  
                rq->rq_flags &= ~RQF_SOFTBARRIER;
                list_del_init(&rq->queuelist);
-               blk_mq_sched_insert_request(rq, true, false, false, true);
+               blk_mq_sched_insert_request(rq, true, false, false);
        }
  
        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
-               blk_mq_sched_insert_request(rq, false, false, false, true);
+               blk_mq_sched_insert_request(rq, false, false, false);
        }
  
        blk_mq_run_hw_queues(q, false);
@@@ -729,7 -785,7 +787,7 @@@ EXPORT_SYMBOL(blk_mq_add_to_requeue_lis
  
  void blk_mq_kick_requeue_list(struct request_queue *q)
  {
-       kblockd_schedule_delayed_work(&q->requeue_work, 0);
+       kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
  }
  EXPORT_SYMBOL(blk_mq_kick_requeue_list);
  
@@@ -755,24 -811,15 +813,15 @@@ EXPORT_SYMBOL(blk_mq_tag_to_rq)
  struct blk_mq_timeout_data {
        unsigned long next;
        unsigned int next_set;
+       unsigned int nr_expired;
  };
  
- void blk_mq_rq_timed_out(struct request *req, bool reserved)
+ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
  {
        const struct blk_mq_ops *ops = req->q->mq_ops;
        enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
  
-       /*
-        * We know that complete is set at this point. If STARTED isn't set
-        * anymore, then the request isn't active and the "timeout" should
-        * just be ignored. This can happen due to the bitflag ordering.
-        * Timeout first checks if STARTED is set, and if it is, assumes
-        * the request is active. But if we race with completion, then
-        * both flags will get cleared. So check here again, and ignore
-        * a timeout event with a request that isn't active.
-        */
-       if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
-               return;
+       req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
  
        if (ops->timeout)
                ret = ops->timeout(req, reserved);
                __blk_mq_complete_request(req);
                break;
        case BLK_EH_RESET_TIMER:
+               /*
+                * As nothing prevents from completion happening while
+                * ->aborted_gstate is set, this may lead to ignored
+                * completions and further spurious timeouts.
+                */
+               blk_mq_rq_update_aborted_gstate(req, 0);
                blk_add_timer(req);
-               blk_clear_rq_complete(req);
                break;
        case BLK_EH_NOT_HANDLED:
                break;
@@@ -797,50 -849,51 +851,51 @@@ static void blk_mq_check_expired(struc
                struct request *rq, void *priv, bool reserved)
  {
        struct blk_mq_timeout_data *data = priv;
-       unsigned long deadline;
+       unsigned long gstate, deadline;
+       int start;
  
-       if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
-               return;
+       might_sleep();
  
-       /*
-        * Ensures that if we see STARTED we must also see our
-        * up-to-date deadline, see blk_mq_start_request().
-        */
-       smp_rmb();
+       if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
+               return;
  
-       deadline = READ_ONCE(rq->deadline);
+       /* read coherent snapshots of @rq->state_gen and @rq->deadline */
+       while (true) {
+               start = read_seqcount_begin(&rq->gstate_seq);
+               gstate = READ_ONCE(rq->gstate);
+               deadline = blk_rq_deadline(rq);
+               if (!read_seqcount_retry(&rq->gstate_seq, start))
+                       break;
+               cond_resched();
+       }
  
-       /*
-        * The rq being checked may have been freed and reallocated
-        * out already here, we avoid this race by checking rq->deadline
-        * and REQ_ATOM_COMPLETE flag together:
-        *
-        * - if rq->deadline is observed as new value because of
-        *   reusing, the rq won't be timed out because of timing.
-        * - if rq->deadline is observed as previous value,
-        *   REQ_ATOM_COMPLETE flag won't be cleared in reuse path
-        *   because we put a barrier between setting rq->deadline
-        *   and clearing the flag in blk_mq_start_request(), so
-        *   this rq won't be timed out too.
-        */
-       if (time_after_eq(jiffies, deadline)) {
-               if (!blk_mark_rq_complete(rq)) {
-                       /*
-                        * Again coherence order ensures that consecutive reads
-                        * from the same variable must be in that order. This
-                        * ensures that if we see COMPLETE clear, we must then
-                        * see STARTED set and we'll ignore this timeout.
-                        *
-                        * (There's also the MB implied by the test_and_clear())
-                        */
-                       blk_mq_rq_timed_out(rq, reserved);
-               }
+       /* if in-flight && overdue, mark for abortion */
+       if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+           time_after_eq(jiffies, deadline)) {
+               blk_mq_rq_update_aborted_gstate(rq, gstate);
+               data->nr_expired++;
+               hctx->nr_expired++;
        } else if (!data->next_set || time_after(data->next, deadline)) {
                data->next = deadline;
                data->next_set = 1;
        }
  }
  
+ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+               struct request *rq, void *priv, bool reserved)
+ {
+       /*
+        * We marked @rq->aborted_gstate and waited for RCU.  If there were
+        * completions that we lost to, they would have finished and
+        * updated @rq->gstate by now; otherwise, the completion path is
+        * now guaranteed to see @rq->aborted_gstate and yield.  If
+        * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+        */
+       if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+           READ_ONCE(rq->gstate) == rq->aborted_gstate)
+               blk_mq_rq_timed_out(rq, reserved);
+ }
  static void blk_mq_timeout_work(struct work_struct *work)
  {
        struct request_queue *q =
        struct blk_mq_timeout_data data = {
                .next           = 0,
                .next_set       = 0,
+               .nr_expired     = 0,
        };
+       struct blk_mq_hw_ctx *hctx;
        int i;
  
        /* A deadlock might occur if a request is stuck requiring a
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;
  
+       /* scan for the expired ones and set their ->aborted_gstate */
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
  
+       if (data.nr_expired) {
+               bool has_rcu = false;
+               /*
+                * Wait till everyone sees ->aborted_gstate.  The
+                * sequential waits for SRCUs aren't ideal.  If this ever
+                * becomes a problem, we can add per-hw_ctx rcu_head and
+                * wait in parallel.
+                */
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->nr_expired)
+                               continue;
+                       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                               has_rcu = true;
+                       else
+                               synchronize_srcu(hctx->srcu);
+                       hctx->nr_expired = 0;
+               }
+               if (has_rcu)
+                       synchronize_rcu();
+               /* terminate the ones we won */
+               blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+       }
        if (data.next_set) {
                data.next = blk_rq_timeout(round_jiffies_up(data.next));
                mod_timer(&q->timeout, data.next);
        } else {
-               struct blk_mq_hw_ctx *hctx;
+               /*
+                * Request timeouts are handled as a forward rolling timer. If
+                * we end up here it means that no requests are pending and
+                * also that no request has been pending for a while. Mark
+                * each hctx as idle.
+                */
                queue_for_each_hw_ctx(q, hctx, i) {
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx))
@@@ -1010,66 -1097,67 +1099,67 @@@ static int blk_mq_dispatch_wake(wait_qu
  
  /*
   * Mark us waiting for a tag. For shared tags, this involves hooking us into
-  * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
-  * restart. For both caes, take care to check the condition again after
+  * the tag wakeups. For non-shared tags, we can simply mark us needing a
+  * restart. For both cases, take care to check the condition again after
   * marking us as waiting.
   */
  static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
                                 struct request *rq)
  {
        struct blk_mq_hw_ctx *this_hctx = *hctx;
-       bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
        struct sbq_wait_state *ws;
        wait_queue_entry_t *wait;
        bool ret;
  
-       if (!shared_tags) {
+       if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
                if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
                        set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
-       } else {
-               wait = &this_hctx->dispatch_wait;
-               if (!list_empty_careful(&wait->entry))
-                       return false;
  
-               spin_lock(&this_hctx->lock);
-               if (!list_empty(&wait->entry)) {
-                       spin_unlock(&this_hctx->lock);
-                       return false;
-               }
+               /*
+                * It's possible that a tag was freed in the window between the
+                * allocation failure and adding the hardware queue to the wait
+                * queue.
+                *
+                * Don't clear RESTART here, someone else could have set it.
+                * At most this will cost an extra queue run.
+                */
+               return blk_mq_get_driver_tag(rq, hctx, false);
+       }
  
-               ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
-               add_wait_queue(&ws->wait, wait);
+       wait = &this_hctx->dispatch_wait;
+       if (!list_empty_careful(&wait->entry))
+               return false;
+       spin_lock(&this_hctx->lock);
+       if (!list_empty(&wait->entry)) {
+               spin_unlock(&this_hctx->lock);
+               return false;
        }
  
+       ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+       add_wait_queue(&ws->wait, wait);
        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
        ret = blk_mq_get_driver_tag(rq, hctx, false);
-       if (!shared_tags) {
-               /*
-                * Don't clear RESTART here, someone else could have set it.
-                * At most this will cost an extra queue run.
-                */
-               return ret;
-       } else {
-               if (!ret) {
-                       spin_unlock(&this_hctx->lock);
-                       return false;
-               }
-               /*
-                * We got a tag, remove ourselves from the wait queue to ensure
-                * someone else gets the wakeup.
-                */
-               spin_lock_irq(&ws->wait.lock);
-               list_del_init(&wait->entry);
-               spin_unlock_irq(&ws->wait.lock);
+       if (!ret) {
                spin_unlock(&this_hctx->lock);
-               return true;
+               return false;
        }
+       /*
+        * We got a tag, remove ourselves from the wait queue to ensure
+        * someone else gets the wakeup.
+        */
+       spin_lock_irq(&ws->wait.lock);
+       list_del_init(&wait->entry);
+       spin_unlock_irq(&ws->wait.lock);
+       spin_unlock(&this_hctx->lock);
+       return true;
  }
  
  bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
@@@ -1206,9 -1294,27 +1296,27 @@@ static void __blk_mq_run_hw_queue(struc
        /*
         * We should be running this queue from one of the CPUs that
         * are mapped to it.
+        *
+        * There are at least two related races now between setting
+        * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+        * __blk_mq_run_hw_queue():
+        *
+        * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+        *   but later it becomes online, then this warning is harmless
+        *   at all
+        *
+        * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+        *   but later it becomes offline, then the warning can't be
+        *   triggered, and we depend on blk-mq timeout handler to
+        *   handle dispatched requests to this hctx
         */
-       WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
-               cpu_online(hctx->next_cpu));
+       if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+               cpu_online(hctx->next_cpu)) {
+               printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+                       raw_smp_processor_id(),
+                       cpumask_empty(hctx->cpumask) ? "inactive": "active");
+               dump_stack();
+       }
  
        /*
         * We can't run the queue inline with ints disabled. Ensure that
         */
        WARN_ON_ONCE(in_interrupt());
  
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               blk_mq_sched_dispatch_requests(hctx);
-               rcu_read_unlock();
-       } else {
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
  
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               blk_mq_sched_dispatch_requests(hctx);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       blk_mq_sched_dispatch_requests(hctx);
+       hctx_unlock(hctx, srcu_idx);
  }
  
  /*
   */
  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
  {
+       bool tried = false;
        if (hctx->queue->nr_hw_queues == 1)
                return WORK_CPU_UNBOUND;
  
        if (--hctx->next_cpu_batch <= 0) {
                int next_cpu;
-               next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+ select_cpu:
+               next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+                               cpu_online_mask);
                if (next_cpu >= nr_cpu_ids)
-                       next_cpu = cpumask_first(hctx->cpumask);
+                       next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
  
-               hctx->next_cpu = next_cpu;
+               /*
+                * No online CPU is found, so have to make sure hctx->next_cpu
+                * is set correctly for not breaking workqueue.
+                */
+               if (next_cpu >= nr_cpu_ids)
+                       hctx->next_cpu = cpumask_first(hctx->cpumask);
+               else
+                       hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
  
+       /*
+        * Do unbound schedule if we can't find a online CPU for this hctx,
+        * and it should only happen in the path of handling CPU DEAD.
+        */
+       if (!cpu_online(hctx->next_cpu)) {
+               if (!tried) {
+                       tried = true;
+                       goto select_cpu;
+               }
+               /*
+                * Make sure to re-select CPU next time once after CPUs
+                * in hctx->cpumask become online again.
+                */
+               hctx->next_cpu_batch = 1;
+               return WORK_CPU_UNBOUND;
+       }
        return hctx->next_cpu;
  }
  
@@@ -1274,9 -1401,8 +1403,8 @@@ static void __blk_mq_delay_run_hw_queue
                put_cpu();
        }
  
-       kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-                                        &hctx->run_work,
-                                        msecs_to_jiffies(msecs));
+       kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+                                   msecs_to_jiffies(msecs));
  }
  
  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
@@@ -1287,7 -1413,23 +1415,23 @@@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue
  
  bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
  {
-       if (blk_mq_hctx_has_pending(hctx)) {
+       int srcu_idx;
+       bool need_run;
+       /*
+        * When queue is quiesced, we may be switching io scheduler, or
+        * updating nr_hw_queues, or other things, and we can't run queue
+        * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+        *
+        * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+        * quiesced.
+        */
+       hctx_lock(hctx, &srcu_idx);
+       need_run = !blk_queue_quiesced(hctx->queue) &&
+               blk_mq_hctx_has_pending(hctx);
+       hctx_unlock(hctx, srcu_idx);
+       if (need_run) {
                __blk_mq_delay_run_hw_queue(hctx, async, 0);
                return true;
        }
@@@ -1595,9 -1737,9 +1739,9 @@@ static blk_qc_t request_to_qc_t(struct 
        return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
  }
  
- static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-                                       struct request *rq,
-                                       blk_qc_t *cookie, bool may_sleep)
+ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+                                           struct request *rq,
+                                           blk_qc_t *cookie)
  {
        struct request_queue *q = rq->q;
        struct blk_mq_queue_data bd = {
        };
        blk_qc_t new_cookie;
        blk_status_t ret;
+       new_cookie = request_to_qc_t(hctx, rq);
+       /*
+        * For OK queue, we are done. For error, caller may kill it.
+        * Any other error (busy), just add it to our list as we
+        * previously would have done.
+        */
+       ret = q->mq_ops->queue_rq(hctx, &bd);
+       switch (ret) {
+       case BLK_STS_OK:
+               *cookie = new_cookie;
+               break;
+       case BLK_STS_RESOURCE:
+               __blk_mq_requeue_request(rq);
+               break;
+       default:
+               *cookie = BLK_QC_T_NONE;
+               break;
+       }
+       return ret;
+ }
+ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+                                               struct request *rq,
+                                               blk_qc_t *cookie,
+                                               bool bypass_insert)
+ {
+       struct request_queue *q = rq->q;
        bool run_queue = true;
  
-       /* RCU or SRCU read lock is needed before checking quiesced flag */
+       /*
+        * RCU or SRCU read lock is needed before checking quiesced flag.
+        *
+        * When queue is stopped or quiesced, ignore 'bypass_insert' from
+        * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
+        * and avoid driver to try to dispatch again.
+        */
        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
                run_queue = false;
+               bypass_insert = false;
                goto insert;
        }
  
-       if (q->elevator)
+       if (q->elevator && !bypass_insert)
                goto insert;
  
        if (!blk_mq_get_driver_tag(rq, NULL, false))
                goto insert;
        }
  
-       new_cookie = request_to_qc_t(hctx, rq);
-       /*
-        * For OK queue, we are done. For error, kill it. Any other
-        * error (busy), just add it to our list as we previously
-        * would have done
-        */
-       ret = q->mq_ops->queue_rq(hctx, &bd);
-       switch (ret) {
-       case BLK_STS_OK:
-               *cookie = new_cookie;
-               return;
-       case BLK_STS_RESOURCE:
-               __blk_mq_requeue_request(rq);
-               goto insert;
-       default:
-               *cookie = BLK_QC_T_NONE;
-               blk_mq_end_request(rq, ret);
-               return;
-       }
+       return __blk_mq_issue_directly(hctx, rq, cookie);
  insert:
-       blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+       if (bypass_insert)
+               return BLK_STS_RESOURCE;
+       blk_mq_sched_insert_request(rq, false, run_queue, false);
+       return BLK_STS_OK;
  }
  
  static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq, blk_qc_t *cookie)
  {
-       if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
-               rcu_read_lock();
-               __blk_mq_try_issue_directly(hctx, rq, cookie, false);
-               rcu_read_unlock();
-       } else {
-               unsigned int srcu_idx;
+       blk_status_t ret;
+       int srcu_idx;
  
-               might_sleep();
+       might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
  
-               srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
-               __blk_mq_try_issue_directly(hctx, rq, cookie, true);
-               srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
-       }
+       hctx_lock(hctx, &srcu_idx);
+       ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+       if (ret == BLK_STS_RESOURCE)
+               blk_mq_sched_insert_request(rq, false, true, false);
+       else if (ret != BLK_STS_OK)
+               blk_mq_end_request(rq, ret);
+       hctx_unlock(hctx, srcu_idx);
+ }
+ blk_status_t blk_mq_request_issue_directly(struct request *rq)
+ {
+       blk_status_t ret;
+       int srcu_idx;
+       blk_qc_t unused_cookie;
+       struct blk_mq_ctx *ctx = rq->mq_ctx;
+       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+       hctx_lock(hctx, &srcu_idx);
+       ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+       hctx_unlock(hctx, srcu_idx);
+       return ret;
  }
  
  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
        } else if (q->elevator) {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
-               blk_mq_sched_insert_request(rq, false, true, true, true);
+               blk_mq_sched_insert_request(rq, false, true, true);
        } else {
                blk_mq_put_ctx(data.ctx);
                blk_mq_bio_to_request(rq, bio);
@@@ -1869,6 -2048,22 +2050,22 @@@ static size_t order_to_size(unsigned in
        return (size_t)PAGE_SIZE << order;
  }
  
+ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+                              unsigned int hctx_idx, int node)
+ {
+       int ret;
+       if (set->ops->init_request) {
+               ret = set->ops->init_request(set, rq, hctx_idx, node);
+               if (ret)
+                       return ret;
+       }
+       seqcount_init(&rq->gstate_seq);
+       u64_stats_init(&rq->aborted_gstate_sync);
+       return 0;
+ }
  int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx, unsigned int depth)
  {
                        struct request *rq = p;
  
                        tags->static_rqs[i] = rq;
-                       if (set->ops->init_request) {
-                               if (set->ops->init_request(set, rq, hctx_idx,
-                                               node)) {
-                                       tags->static_rqs[i] = NULL;
-                                       goto fail;
-                               }
+                       if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+                               tags->static_rqs[i] = NULL;
+                               goto fail;
                        }
  
                        p += rq_size;
@@@ -1994,7 -2186,8 +2188,8 @@@ static void blk_mq_exit_hctx(struct req
  {
        blk_mq_debugfs_unregister_hctx(hctx);
  
-       blk_mq_tag_idle(hctx);
+       if (blk_mq_hw_queue_mapped(hctx))
+               blk_mq_tag_idle(hctx);
  
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
                set->ops->exit_hctx(hctx, hctx_idx);
  
        if (hctx->flags & BLK_MQ_F_BLOCKING)
-               cleanup_srcu_struct(hctx->queue_rq_srcu);
+               cleanup_srcu_struct(hctx->srcu);
  
        blk_mq_remove_cpuhp(hctx);
        blk_free_flush_queue(hctx->fq);
@@@ -2074,13 -2267,11 +2269,11 @@@ static int blk_mq_init_hctx(struct requ
        if (!hctx->fq)
                goto sched_exit_hctx;
  
-       if (set->ops->init_request &&
-           set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
-                                  node))
+       if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
                goto free_fq;
  
        if (hctx->flags & BLK_MQ_F_BLOCKING)
-               init_srcu_struct(hctx->queue_rq_srcu);
+               init_srcu_struct(hctx->srcu);
  
        blk_mq_debugfs_register_hctx(q, hctx);
  
@@@ -2116,16 -2307,11 +2309,11 @@@ static void blk_mq_init_cpu_queues(stru
                INIT_LIST_HEAD(&__ctx->rq_list);
                __ctx->queue = q;
  
-               /* If the cpu isn't present, the cpu is mapped to first hctx */
-               if (!cpu_present(i))
-                       continue;
-               hctx = blk_mq_map_queue(q, i);
                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
+               hctx = blk_mq_map_queue(q, i);
                if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
                        hctx->numa_node = local_memory_node(cpu_to_node(i));
        }
@@@ -2182,7 -2368,7 +2370,7 @@@ static void blk_mq_map_swqueue(struct r
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
-       for_each_present_cpu(i) {
+       for_each_possible_cpu(i) {
                hctx_idx = q->mq_map[i];
                /* unmapped hw queue can be remapped after CPU topo changed */
                if (!set->tags[hctx_idx] &&
                /*
                 * Initialize batch roundrobin counts
                 */
-               hctx->next_cpu = cpumask_first(hctx->cpumask);
+               hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+                               cpu_online_mask);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }
  }
@@@ -2369,7 -2556,7 +2558,7 @@@ static int blk_mq_hw_ctx_size(struct bl
  {
        int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
  
-       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+       BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
                           __alignof__(struct blk_mq_hw_ctx)) !=
                     sizeof(struct blk_mq_hw_ctx));
  
@@@ -2386,6 -2573,9 +2575,9 @@@ static void blk_mq_realloc_hw_ctxs(stru
        struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
  
        blk_mq_sysfs_unregister(q);
+       /* protect against switching io scheduler  */
+       mutex_lock(&q->sysfs_lock);
        for (i = 0; i < set->nr_hw_queues; i++) {
                int node;
  
                }
        }
        q->nr_hw_queues = i;
+       mutex_unlock(&q->sysfs_lock);
        blk_mq_sysfs_register(q);
  }
  
@@@ -2601,9 -2792,27 +2794,27 @@@ static int blk_mq_alloc_rq_maps(struct 
  
  static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
  {
-       if (set->ops->map_queues)
+       if (set->ops->map_queues) {
+               int cpu;
+               /*
+                * transport .map_queues is usually done in the following
+                * way:
+                *
+                * for (queue = 0; queue < set->nr_hw_queues; queue++) {
+                *      mask = get_cpu_mask(queue)
+                *      for_each_cpu(cpu, mask)
+                *              set->mq_map[cpu] = queue;
+                * }
+                *
+                * When we need to remap, the table has to be cleared for
+                * killing stale mapping since one CPU may not be mapped
+                * to any hw queue.
+                */
+               for_each_possible_cpu(cpu)
+                       set->mq_map[cpu] = 0;
                return set->ops->map_queues(set);
-       else
+       } else
                return blk_mq_map_queues(set);
  }
  
@@@ -2712,6 -2921,7 +2923,7 @@@ int blk_mq_update_nr_requests(struct re
                return -EINVAL;
  
        blk_mq_freeze_queue(q);
+       blk_mq_quiesce_queue(q);
  
        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
        if (!ret)
                q->nr_requests = nr;
  
+       blk_mq_unquiesce_queue(q);
        blk_mq_unfreeze_queue(q);
  
        return ret;
@@@ -2850,7 -3061,7 +3063,7 @@@ static bool blk_mq_poll_hybrid_sleep(st
        unsigned int nsecs;
        ktime_t kt;
  
-       if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+       if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
                return false;
  
        /*
        if (!nsecs)
                return false;
  
-       set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+       rq->rq_flags |= RQF_MQ_POLL_SLEPT;
  
        /*
         * This will be replaced with the stats tracking code, using
  
        hrtimer_init_sleeper(&hs, current);
        do {
-               if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+               if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
                        break;
                set_current_state(TASK_UNINTERRUPTIBLE);
                hrtimer_start_expires(&hs.timer, mode);
@@@ -2970,12 -3181,6 +3183,6 @@@ static bool blk_mq_poll(struct request_
  
  static int __init blk_mq_init(void)
  {
-       /*
-        * See comment in block/blk.h rq_atomic_flags enum
-        */
-       BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
-                       (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        return 0;
diff --combined block/blk-throttle.c
index d19f416d61012ac032c49608f0afe463c948e8bc,c475f0fe3530667ce70f3771f406f8bc6944d8e7..c5a1316737331ba785a0c569aac1615994cb1996
@@@ -216,9 -216,9 +216,9 @@@ struct throtl_dat
  
        unsigned int scale;
  
-       struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
-       struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
-       struct latency_bucket __percpu *latency_buckets;
+       struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+       struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+       struct latency_bucket __percpu *latency_buckets[2];
        unsigned long last_calculate_time;
        unsigned long filtered_latency;
  
@@@ -1510,11 -1510,21 +1510,21 @@@ static struct cftype throtl_legacy_file
                .private = (unsigned long)&blkcg_policy_throtl,
                .seq_show = blkg_print_stat_bytes,
        },
+       {
+               .name = "throttle.io_service_bytes_recursive",
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_bytes_recursive,
+       },
        {
                .name = "throttle.io_serviced",
                .private = (unsigned long)&blkcg_policy_throtl,
                .seq_show = blkg_print_stat_ios,
        },
+       {
+               .name = "throttle.io_serviced_recursive",
+               .private = (unsigned long)&blkcg_policy_throtl,
+               .seq_show = blkg_print_stat_ios_recursive,
+       },
        { }     /* terminate */
  };
  
@@@ -2040,10 -2050,10 +2050,10 @@@ static void blk_throtl_update_idletime(
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  static void throtl_update_latency_buckets(struct throtl_data *td)
  {
-       struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
-       int i, cpu;
-       unsigned long last_latency = 0;
-       unsigned long latency;
+       struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+       int i, cpu, rw;
+       unsigned long last_latency[2] = { 0 };
+       unsigned long latency[2];
  
        if (!blk_queue_nonrot(td->queue))
                return;
        td->last_calculate_time = jiffies;
  
        memset(avg_latency, 0, sizeof(avg_latency));
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               struct latency_bucket *tmp = &td->tmp_buckets[i];
-               for_each_possible_cpu(cpu) {
-                       struct latency_bucket *bucket;
-                       /* this isn't race free, but ok in practice */
-                       bucket = per_cpu_ptr(td->latency_buckets, cpu);
-                       tmp->total_latency += bucket[i].total_latency;
-                       tmp->samples += bucket[i].samples;
-                       bucket[i].total_latency = 0;
-                       bucket[i].samples = 0;
-               }
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+                       for_each_possible_cpu(cpu) {
+                               struct latency_bucket *bucket;
+                               /* this isn't race free, but ok in practice */
+                               bucket = per_cpu_ptr(td->latency_buckets[rw],
+                                       cpu);
+                               tmp->total_latency += bucket[i].total_latency;
+                               tmp->samples += bucket[i].samples;
+                               bucket[i].total_latency = 0;
+                               bucket[i].samples = 0;
+                       }
  
-               if (tmp->samples >= 32) {
-                       int samples = tmp->samples;
+                       if (tmp->samples >= 32) {
+                               int samples = tmp->samples;
  
-                       latency = tmp->total_latency;
+                               latency[rw] = tmp->total_latency;
  
-                       tmp->total_latency = 0;
-                       tmp->samples = 0;
-                       latency /= samples;
-                       if (latency == 0)
-                               continue;
-                       avg_latency[i].latency = latency;
+                               tmp->total_latency = 0;
+                               tmp->samples = 0;
+                               latency[rw] /= samples;
+                               if (latency[rw] == 0)
+                                       continue;
+                               avg_latency[rw][i].latency = latency[rw];
+                       }
                }
        }
  
-       for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
-               if (!avg_latency[i].latency) {
-                       if (td->avg_buckets[i].latency < last_latency)
-                               td->avg_buckets[i].latency = last_latency;
-                       continue;
-               }
+       for (rw = READ; rw <= WRITE; rw++) {
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       if (!avg_latency[rw][i].latency) {
+                               if (td->avg_buckets[rw][i].latency < last_latency[rw])
+                                       td->avg_buckets[rw][i].latency =
+                                               last_latency[rw];
+                               continue;
+                       }
  
-               if (!td->avg_buckets[i].valid)
-                       latency = avg_latency[i].latency;
-               else
-                       latency = (td->avg_buckets[i].latency * 7 +
-                               avg_latency[i].latency) >> 3;
+                       if (!td->avg_buckets[rw][i].valid)
+                               latency[rw] = avg_latency[rw][i].latency;
+                       else
+                               latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+                                       avg_latency[rw][i].latency) >> 3;
  
-               td->avg_buckets[i].latency = max(latency, last_latency);
-               td->avg_buckets[i].valid = true;
-               last_latency = td->avg_buckets[i].latency;
+                       td->avg_buckets[rw][i].latency = max(latency[rw],
+                               last_latency[rw]);
+                       td->avg_buckets[rw][i].valid = true;
+                       last_latency[rw] = td->avg_buckets[rw][i].latency;
+               }
        }
  
        for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
                throtl_log(&td->service_queue,
-                       "Latency bucket %d: latency=%ld, valid=%d", i,
-                       td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+                       "Latency bucket %d: read latency=%ld, read valid=%d, "
+                       "write latency=%ld, write valid=%d", i,
+                       td->avg_buckets[READ][i].latency,
+                       td->avg_buckets[READ][i].valid,
+                       td->avg_buckets[WRITE][i].latency,
+                       td->avg_buckets[WRITE][i].valid);
  }
  #else
  static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@@ -2226,7 -2247,13 +2247,7 @@@ again
  out_unlock:
        spin_unlock_irq(q->queue_lock);
  out:
 -      /*
 -       * As multiple blk-throtls may stack in the same issue path, we
 -       * don't want bios to leave with the flag set.  Clear the flag if
 -       * being issued.
 -       */
 -      if (!throttled)
 -              bio_clear_flag(bio, BIO_THROTTLED);
 +      bio_set_flag(bio, BIO_THROTTLED);
  
  #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
        if (throttled || !td->track_bio_latency)
@@@ -2242,16 -2269,17 +2263,17 @@@ static void throtl_track_latency(struc
        struct latency_bucket *latency;
        int index;
  
-       if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+       if (!td || td->limit_index != LIMIT_LOW ||
+           !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
            !blk_queue_nonrot(td->queue))
                return;
  
        index = request_bucket_index(size);
  
-       latency = get_cpu_ptr(td->latency_buckets);
+       latency = get_cpu_ptr(td->latency_buckets[op]);
        latency[index].total_latency += time;
        latency[index].samples++;
-       put_cpu_ptr(td->latency_buckets);
+       put_cpu_ptr(td->latency_buckets[op]);
  }
  
  void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@@ -2270,6 -2298,7 +2292,7 @@@ void blk_throtl_bio_endio(struct bio *b
        unsigned long finish_time;
        unsigned long start_time;
        unsigned long lat;
+       int rw = bio_data_dir(bio);
  
        tg = bio->bi_cg_private;
        if (!tg)
  
                bucket = request_bucket_index(
                        blk_stat_size(&bio->bi_issue_stat));
-               threshold = tg->td->avg_buckets[bucket].latency +
+               threshold = tg->td->avg_buckets[rw][bucket].latency +
                        tg->latency_target;
                if (lat > threshold)
                        tg->bad_bio_cnt++;
@@@ -2391,9 -2420,16 +2414,16 @@@ int blk_throtl_init(struct request_queu
        td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
        if (!td)
                return -ENOMEM;
-       td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+       td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
                LATENCY_BUCKET_SIZE, __alignof__(u64));
-       if (!td->latency_buckets) {
+       if (!td->latency_buckets[READ]) {
+               kfree(td);
+               return -ENOMEM;
+       }
+       td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+               LATENCY_BUCKET_SIZE, __alignof__(u64));
+       if (!td->latency_buckets[WRITE]) {
+               free_percpu(td->latency_buckets[READ]);
                kfree(td);
                return -ENOMEM;
        }
        /* activate policy */
        ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
        if (ret) {
-               free_percpu(td->latency_buckets);
+               free_percpu(td->latency_buckets[READ]);
+               free_percpu(td->latency_buckets[WRITE]);
                kfree(td);
        }
        return ret;
@@@ -2423,7 -2460,8 +2454,8 @@@ void blk_throtl_exit(struct request_que
        BUG_ON(!q->td);
        throtl_shutdown_wq(q);
        blkcg_deactivate_policy(q, &blkcg_policy_throtl);
-       free_percpu(q->td->latency_buckets);
+       free_percpu(q->td->latency_buckets[READ]);
+       free_percpu(q->td->latency_buckets[WRITE]);
        kfree(q->td);
  }
  
@@@ -2441,15 -2479,17 +2473,17 @@@ void blk_throtl_register_queue(struct r
        } else {
                td->throtl_slice = DFL_THROTL_SLICE_HD;
                td->filtered_latency = LATENCY_FILTERED_HD;
-               for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
-                       td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+               for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+                       td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+                       td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+               }
        }
  #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
        /* if no low limit, use previous default */
        td->throtl_slice = DFL_THROTL_SLICE_HD;
  #endif
  
-       td->track_bio_latency = !q->mq_ops && !q->request_fn;
+       td->track_bio_latency = !queue_is_rq_based(q);
        if (!td->track_bio_latency)
                blk_stat_enable_accounting(q);
  }
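
The blk-throttle hunks above split the LIMIT_LOW latency tracking into separate READ and WRITE bucket tables, indexed by request direction, presumably so that the direction with naturally higher latency does not dominate a shared average. A small user-space model of that bookkeeping, with the kernel's per-CPU machinery and the real request_bucket_index() replaced by illustrative stand-ins:

#include <stdio.h>

#define LATENCY_BUCKET_SIZE	9	/* illustrative bucket count */
enum { READ = 0, WRITE = 1 };		/* mirrors the kernel's data directions */

struct latency_bucket {
	unsigned long total_latency;
	int samples;
};

/* One bucket table per direction, as in the reworked struct throtl_data. */
static struct latency_bucket buckets[2][LATENCY_BUCKET_SIZE];

/* Stand-in for request_bucket_index(): map an I/O size to a bucket. */
static int bucket_index(unsigned int size)
{
	int idx = size >> 12;	/* pretend: one bucket per 4K step */

	return idx < LATENCY_BUCKET_SIZE ? idx : LATENCY_BUCKET_SIZE - 1;
}

static void track_latency(int rw, unsigned int size, unsigned long time)
{
	struct latency_bucket *b = &buckets[rw][bucket_index(size)];

	b->total_latency += time;
	b->samples++;
}

int main(void)
{
	track_latency(READ, 4096, 120);
	track_latency(WRITE, 4096, 480);
	printf("read avg=%lu write avg=%lu\n",
	       buckets[READ][0].total_latency / buckets[READ][0].samples,
	       buckets[WRITE][0].total_latency / buckets[WRITE][0].samples);
	return 0;
}
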
diff --combined block/blk.h
index 442098aa9463a37dad0dfccb1718eea65be6cdb3,b1771851ed92e4da9803f0efdc6f54719d36f39e..46db5dc83dcb4091ea03c045b27968543c0a699b
@@@ -119,34 -119,24 +119,24 @@@ void blk_account_io_start(struct reques
  void blk_account_io_completion(struct request *req, unsigned int bytes);
  void blk_account_io_done(struct request *req);
  
- /*
-  * Internal atomic flags for request handling
-  */
- enum rq_atomic_flags {
-       /*
-        * Keep these two bits first - not because we depend on the
-        * value of them, but we do depend on them being in the same
-        * byte of storage to ensure ordering on writes. Keeping them
-        * first will achieve that nicely.
-        */
-       REQ_ATOM_COMPLETE = 0,
-       REQ_ATOM_STARTED,
-       REQ_ATOM_POLL_SLEPT,
- };
  /*
   * EH timer and IO completion will both attempt to 'grab' the request, make
-  * sure that only one of them succeeds
+  * sure that only one of them succeeds. Steal the bottom bit of the
+  * __deadline field for this.
   */
  static inline int blk_mark_rq_complete(struct request *rq)
  {
-       return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+       return test_and_set_bit(0, &rq->__deadline);
  }
  
  static inline void blk_clear_rq_complete(struct request *rq)
  {
-       clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+       clear_bit(0, &rq->__deadline);
+ }
+ static inline bool blk_rq_is_complete(struct request *rq)
+ {
+       return test_bit(0, &rq->__deadline);
  }
  
  /*
@@@ -172,6 -162,9 +162,9 @@@ static inline void elv_deactivate_rq(st
                e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
  }
  
+ int elv_register_queue(struct request_queue *q);
+ void elv_unregister_queue(struct request_queue *q);
  struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
  
  #ifdef CONFIG_FAIL_IO_TIMEOUT
@@@ -245,6 -238,21 +238,21 @@@ static inline void req_set_nomerge(stru
                q->last_merge = NULL;
  }
  
+ /*
+  * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+  * setting the deadline clears the bottom bit, potentially clearing the
+  * completed bit. The user has to be OK with this (current ones are fine).
+  */
+ static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+ {
+       rq->__deadline = time & ~0x1UL;
+ }
+ static inline unsigned long blk_rq_deadline(struct request *rq)
+ {
+       return rq->__deadline & ~0x1UL;
+ }
  /*
   * Internal io_context interface
   */
@@@ -330,6 -338,4 +338,6 @@@ static inline void blk_queue_bounce(str
  }
  #endif /* CONFIG_BOUNCE */
  
 +extern void blk_drain_queue(struct request_queue *q);
 +
  #endif /* BLK_INTERNAL_H */
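
The block/blk.h hunks above retire the per-request atomic flag word and instead steal bit 0 of __deadline: that bit marks completion, and the deadline accessors mask it off. A compact user-space model of the encoding (plain C, no kernel atomics; the real code uses test_and_set_bit()/clear_bit() on the same word):

#include <assert.h>
#include <stdbool.h>

/* Bit 0 of the deadline word doubles as the "completed" marker; the
 * deadline itself is always stored with that bit cleared. */
struct fake_request {
	unsigned long __deadline;
};

static void set_deadline(struct fake_request *rq, unsigned long time)
{
	rq->__deadline = time & ~0x1UL;	/* may clear a stale complete bit */
}

static unsigned long get_deadline(const struct fake_request *rq)
{
	return rq->__deadline & ~0x1UL;
}

static bool mark_complete(struct fake_request *rq)
{
	bool was_set = rq->__deadline & 0x1UL;

	rq->__deadline |= 0x1UL;	/* kernel: test_and_set_bit(0, ...) */
	return was_set;
}

int main(void)
{
	struct fake_request rq = { 0 };

	set_deadline(&rq, 1000);
	assert(!mark_complete(&rq));	/* first grab wins */
	assert(mark_complete(&rq));	/* second grab sees the bit */
	assert(get_deadline(&rq) == 1000);
	return 0;
}
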
diff --combined block/bounce.c
index 1d05c422c932ad56d705f94deed6cce0891ff9d3,c35a3d7f05281e95822e26b98b19eedb275475a7..6a3e68292273b03d3aa12a82a4a5af4979e95d60
@@@ -113,45 -113,50 +113,50 @@@ int init_emergency_isa_pool(void
  static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
  {
        unsigned char *vfrom;
-       struct bio_vec tovec, *fromvec = from->bi_io_vec;
+       struct bio_vec tovec, fromvec;
        struct bvec_iter iter;
+       /*
+        * The bio of @from is created by bounce, so we can iterate
+        * its bvec from start to end, but the @from->bi_iter can't be
+        * trusted because it might be changed by splitting.
+        */
+       struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
  
        bio_for_each_segment(tovec, to, iter) {
-               if (tovec.bv_page != fromvec->bv_page) {
+               fromvec = bio_iter_iovec(from, from_iter);
+               if (tovec.bv_page != fromvec.bv_page) {
                        /*
                         * fromvec->bv_offset and fromvec->bv_len might have
                         * been modified by the block layer, so use the original
                         * copy, bounce_copy_vec already uses tovec->bv_len
                         */
-                       vfrom = page_address(fromvec->bv_page) +
+                       vfrom = page_address(fromvec.bv_page) +
                                tovec.bv_offset;
  
                        bounce_copy_vec(&tovec, vfrom);
                        flush_dcache_page(tovec.bv_page);
                }
-               fromvec++;
+               bio_advance_iter(from, &from_iter, tovec.bv_len);
        }
  }
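
The bounce.c changes stop indexing bi_io_vec directly and instead walk the source bio with a bvec iterator, which is the preparation needed for multipage bvecs. For reference, the standard iterator idiom looks like the sketch below; bounce itself cannot start from from->bi_iter (as the comment above explains), so it seeds a fresh iterator with BVEC_ITER_ALL_INIT instead of using bio_for_each_segment():

/* Minimal sketch of the bvec iterator idiom (kernel context,
 * <linux/bio.h>): visit every segment of @bio without ever touching
 * bio->bi_io_vec directly. */
static unsigned int count_bio_bytes(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned int bytes = 0;

	bio_for_each_segment(bv, bio, iter)
		bytes += bv.bv_len;	/* bv is a copy, safe to inspect */

	return bytes;
}
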
  
  static void bounce_end_io(struct bio *bio, mempool_t *pool)
  {
        struct bio *bio_orig = bio->bi_private;
-       struct bio_vec *bvec, *org_vec;
+       struct bio_vec *bvec, orig_vec;
        int i;
-       int start = bio_orig->bi_iter.bi_idx;
+       struct bvec_iter orig_iter = bio_orig->bi_iter;
  
        /*
         * free up bounce indirect pages used
         */
        bio_for_each_segment_all(bvec, bio, i) {
-               org_vec = bio_orig->bi_io_vec + i + start;
-               if (bvec->bv_page == org_vec->bv_page)
-                       continue;
-               dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
-               mempool_free(bvec->bv_page, pool);
+               orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+               if (bvec->bv_page != orig_vec.bv_page) {
+                       dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+                       mempool_free(bvec->bv_page, pool);
+               }
+               bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
        }
  
        bio_orig->bi_status = bio->bi_status;
@@@ -200,7 -205,6 +205,7 @@@ static void __blk_queue_bounce(struct r
        unsigned i = 0;
        bool bounce = false;
        int sectors = 0;
 +      bool passthrough = bio_is_passthrough(*bio_orig);
  
        bio_for_each_segment(from, *bio_orig, iter) {
                if (i++ < BIO_MAX_PAGES)
        if (!bounce)
                return;
  
 -      if (sectors < bio_sectors(*bio_orig)) {
 +      if (!passthrough && sectors < bio_sectors(*bio_orig)) {
                bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
                bio_chain(bio, *bio_orig);
                generic_make_request(*bio_orig);
                *bio_orig = bio;
        }
 -      bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
 +      bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
 +                      bounce_bio_set);
  
        bio_for_each_segment_all(to, bio, i) {
                struct page *page = to->bv_page;
diff --combined drivers/block/null_blk.c
index ad0477ae820f040affe54f4368d3a02d9da63350,5b94e530570c44d7552df2075c3ef2aebe11dfc1..6655893a3a7a8365a5feb4f035b65021d38f3847
@@@ -12,9 -12,9 +12,9 @@@
  #include <linux/slab.h>
  #include <linux/blk-mq.h>
  #include <linux/hrtimer.h>
- #include <linux/lightnvm.h>
  #include <linux/configfs.h>
  #include <linux/badblocks.h>
+ #include <linux/fault-inject.h>
  
  #define SECTOR_SHIFT          9
  #define PAGE_SECTORS_SHIFT    (PAGE_SHIFT - SECTOR_SHIFT)
  #define TICKS_PER_SEC         50ULL
  #define TIMER_INTERVAL                (NSEC_PER_SEC / TICKS_PER_SEC)
  
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static DECLARE_FAULT_ATTR(null_timeout_attr);
+ #endif
  static inline u64 mb_per_tick(int mbps)
  {
        return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
  struct nullb_cmd {
        struct list_head list;
        struct llist_node ll_list;
 -      call_single_data_t csd;
 +      struct __call_single_data csd;
        struct request *rq;
        struct bio *bio;
        unsigned int tag;
 +      blk_status_t error;
        struct nullb_queue *nq;
        struct hrtimer timer;
 -      blk_status_t error;
  };
  
  struct nullb_queue {
@@@ -107,7 -111,6 +111,6 @@@ struct nullb_device 
        unsigned int hw_queue_depth; /* queue depth */
        unsigned int index; /* index of the disk, only valid with a disk */
        unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
-       bool use_lightnvm; /* register as a LightNVM device */
        bool blocking; /* blocking blk-mq device */
        bool use_per_node_hctx; /* use per-node allocation for hardware context */
        bool power; /* power on/off the device */
@@@ -121,7 -124,6 +124,6 @@@ struct nullb 
        unsigned int index;
        struct request_queue *q;
        struct gendisk *disk;
-       struct nvm_dev *ndev;
        struct blk_mq_tag_set *tag_set;
        struct blk_mq_tag_set __tag_set;
        unsigned int queue_depth;
@@@ -139,7 -141,6 +141,6 @@@ static LIST_HEAD(nullb_list)
  static struct mutex lock;
  static int null_major;
  static DEFINE_IDA(nullb_indexes);
- static struct kmem_cache *ppa_cache;
  static struct blk_mq_tag_set tag_set;
  
  enum {
@@@ -166,6 -167,11 +167,11 @@@ static int g_home_node = NUMA_NO_NODE
  module_param_named(home_node, g_home_node, int, S_IRUGO);
  MODULE_PARM_DESC(home_node, "Home node for the device");
  
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static char g_timeout_str[80];
+ module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+ #endif
  static int g_queue_mode = NULL_Q_MQ;
  
  static int null_param_store_val(const char *str, int *val, int min, int max)
@@@ -208,10 -214,6 +214,6 @@@ static int nr_devices = 1
  module_param(nr_devices, int, S_IRUGO);
  MODULE_PARM_DESC(nr_devices, "Number of devices to register");
  
- static bool g_use_lightnvm;
- module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
- MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
  static bool g_blocking;
  module_param_named(blocking, g_blocking, bool, S_IRUGO);
  MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@@@ -345,7 -347,6 +347,6 @@@ NULLB_DEVICE_ATTR(blocksize, uint)
  NULLB_DEVICE_ATTR(irqmode, uint);
  NULLB_DEVICE_ATTR(hw_queue_depth, uint);
  NULLB_DEVICE_ATTR(index, uint);
- NULLB_DEVICE_ATTR(use_lightnvm, bool);
  NULLB_DEVICE_ATTR(blocking, bool);
  NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
  NULLB_DEVICE_ATTR(memory_backed, bool);
@@@ -455,7 -456,6 +456,6 @@@ static struct configfs_attribute *nullb
        &nullb_device_attr_irqmode,
        &nullb_device_attr_hw_queue_depth,
        &nullb_device_attr_index,
-       &nullb_device_attr_use_lightnvm,
        &nullb_device_attr_blocking,
        &nullb_device_attr_use_per_node_hctx,
        &nullb_device_attr_power,
@@@ -573,7 -573,6 +573,6 @@@ static struct nullb_device *null_alloc_
        dev->blocksize = g_bs;
        dev->irqmode = g_irqmode;
        dev->hw_queue_depth = g_hw_queue_depth;
-       dev->use_lightnvm = g_use_lightnvm;
        dev->blocking = g_blocking;
        dev->use_per_node_hctx = g_use_per_node_hctx;
        return dev;
@@@ -1352,6 -1351,12 +1351,12 @@@ static blk_qc_t null_queue_bio(struct r
        return BLK_QC_T_NONE;
  }
  
+ static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
+ {
+       pr_info("null: rq %p timed out\n", rq);
+       return BLK_EH_HANDLED;
+ }
  static int null_rq_prep_fn(struct request_queue *q, struct request *req)
  {
        struct nullb *nullb = q->queuedata;
        return BLKPREP_DEFER;
  }
  
+ static bool should_timeout_request(struct request *rq)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (g_timeout_str[0])
+               return should_fail(&null_timeout_attr, 1);
+ #endif
+       return false;
+ }
  static void null_request_fn(struct request_queue *q)
  {
        struct request *rq;
        while ((rq = blk_fetch_request(q)) != NULL) {
                struct nullb_cmd *cmd = rq->special;
  
-               spin_unlock_irq(q->queue_lock);
-               null_handle_cmd(cmd);
-               spin_lock_irq(q->queue_lock);
+               if (!should_timeout_request(rq)) {
+                       spin_unlock_irq(q->queue_lock);
+                       null_handle_cmd(cmd);
+                       spin_lock_irq(q->queue_lock);
+               }
        }
  }
  
+ static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+ {
+       pr_info("null: rq %p timed out\n", rq);
+       return BLK_EH_HANDLED;
+ }
  static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
                         const struct blk_mq_queue_data *bd)
  {
  
        blk_mq_start_request(bd->rq);
  
-       return null_handle_cmd(cmd);
+       if (!should_timeout_request(bd->rq))
+               return null_handle_cmd(cmd);
+       return BLK_STS_OK;
  }
  
  static const struct blk_mq_ops null_mq_ops = {
        .queue_rq       = null_queue_rq,
        .complete       = null_softirq_done_fn,
+       .timeout        = null_timeout_rq,
  };
  
  static void cleanup_queue(struct nullb_queue *nq)
@@@ -1423,170 -1450,6 +1450,6 @@@ static void cleanup_queues(struct null
        kfree(nullb->queues);
  }
  
- #ifdef CONFIG_NVM
- static void null_lnvm_end_io(struct request *rq, blk_status_t status)
- {
-       struct nvm_rq *rqd = rq->end_io_data;
-       /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
-       rqd->error = status ? -EIO : 0;
-       nvm_end_io(rqd);
-       blk_put_request(rq);
- }
- static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
- {
-       struct request_queue *q = dev->q;
-       struct request *rq;
-       struct bio *bio = rqd->bio;
-       rq = blk_mq_alloc_request(q,
-               op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
-       if (IS_ERR(rq))
-               return -ENOMEM;
-       blk_init_request_from_bio(rq, bio);
-       rq->end_io_data = rqd;
-       blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
-       return 0;
- }
- static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
- {
-       struct nullb *nullb = dev->q->queuedata;
-       sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
-       sector_t blksize;
-       struct nvm_id_group *grp;
-       id->ver_id = 0x1;
-       id->vmnt = 0;
-       id->cap = 0x2;
-       id->dom = 0x1;
-       id->ppaf.blk_offset = 0;
-       id->ppaf.blk_len = 16;
-       id->ppaf.pg_offset = 16;
-       id->ppaf.pg_len = 16;
-       id->ppaf.sect_offset = 32;
-       id->ppaf.sect_len = 8;
-       id->ppaf.pln_offset = 40;
-       id->ppaf.pln_len = 8;
-       id->ppaf.lun_offset = 48;
-       id->ppaf.lun_len = 8;
-       id->ppaf.ch_offset = 56;
-       id->ppaf.ch_len = 8;
-       sector_div(size, nullb->dev->blocksize); /* convert size to pages */
-       size >>= 8; /* concert size to pgs pr blk */
-       grp = &id->grp;
-       grp->mtype = 0;
-       grp->fmtype = 0;
-       grp->num_ch = 1;
-       grp->num_pg = 256;
-       blksize = size;
-       size >>= 16;
-       grp->num_lun = size + 1;
-       sector_div(blksize, grp->num_lun);
-       grp->num_blk = blksize;
-       grp->num_pln = 1;
-       grp->fpg_sz = nullb->dev->blocksize;
-       grp->csecs = nullb->dev->blocksize;
-       grp->trdt = 25000;
-       grp->trdm = 25000;
-       grp->tprt = 500000;
-       grp->tprm = 500000;
-       grp->tbet = 1500000;
-       grp->tbem = 1500000;
-       grp->mpos = 0x010101; /* single plane rwe */
-       grp->cpar = nullb->dev->hw_queue_depth;
-       return 0;
- }
- static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
- {
-       mempool_t *virtmem_pool;
-       virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
-       if (!virtmem_pool) {
-               pr_err("null_blk: Unable to create virtual memory pool\n");
-               return NULL;
-       }
-       return virtmem_pool;
- }
- static void null_lnvm_destroy_dma_pool(void *pool)
- {
-       mempool_destroy(pool);
- }
- static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
-                               gfp_t mem_flags, dma_addr_t *dma_handler)
- {
-       return mempool_alloc(pool, mem_flags);
- }
- static void null_lnvm_dev_dma_free(void *pool, void *entry,
-                                                       dma_addr_t dma_handler)
- {
-       mempool_free(entry, pool);
- }
- static struct nvm_dev_ops null_lnvm_dev_ops = {
-       .identity               = null_lnvm_id,
-       .submit_io              = null_lnvm_submit_io,
-       .create_dma_pool        = null_lnvm_create_dma_pool,
-       .destroy_dma_pool       = null_lnvm_destroy_dma_pool,
-       .dev_dma_alloc          = null_lnvm_dev_dma_alloc,
-       .dev_dma_free           = null_lnvm_dev_dma_free,
-       /* Simulate nvme protocol restriction */
-       .max_phys_sect          = 64,
- };
- static int null_nvm_register(struct nullb *nullb)
- {
-       struct nvm_dev *dev;
-       int rv;
-       dev = nvm_alloc_dev(0);
-       if (!dev)
-               return -ENOMEM;
-       dev->q = nullb->q;
-       memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
-       dev->ops = &null_lnvm_dev_ops;
-       rv = nvm_register(dev);
-       if (rv) {
-               kfree(dev);
-               return rv;
-       }
-       nullb->ndev = dev;
-       return 0;
- }
- static void null_nvm_unregister(struct nullb *nullb)
- {
-       nvm_unregister(nullb->ndev);
- }
- #else
- static int null_nvm_register(struct nullb *nullb)
- {
-       pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
-       return -EINVAL;
- }
- static void null_nvm_unregister(struct nullb *nullb) {}
- #endif /* CONFIG_NVM */
  static void null_del_dev(struct nullb *nullb)
  {
        struct nullb_device *dev = nullb->dev;
  
        list_del_init(&nullb->list);
  
-       if (dev->use_lightnvm)
-               null_nvm_unregister(nullb);
-       else
-               del_gendisk(nullb->disk);
+       del_gendisk(nullb->disk);
  
        if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
                hrtimer_cancel(&nullb->bw_timer);
        if (dev->queue_mode == NULL_Q_MQ &&
            nullb->tag_set == &nullb->__tag_set)
                blk_mq_free_tag_set(nullb->tag_set);
-       if (!dev->use_lightnvm)
-               put_disk(nullb->disk);
+       put_disk(nullb->disk);
        cleanup_queues(nullb);
        if (null_cache_active(nullb))
                null_free_device_storage(nullb->dev, true);
@@@ -1775,11 -1634,6 +1634,6 @@@ static void null_validate_conf(struct n
  {
        dev->blocksize = round_down(dev->blocksize, 512);
        dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
-       if (dev->use_lightnvm && dev->blocksize != 4096)
-               dev->blocksize = 4096;
-       if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
-               dev->queue_mode = NULL_Q_MQ;
  
        if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
                if (dev->submit_queues != nr_online_nodes)
                dev->mbps = 0;
  }
  
+ static bool null_setup_fault(void)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+       if (!g_timeout_str[0])
+               return true;
+       if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+               return false;
+       null_timeout_attr.verbose = 0;
+ #endif
+       return true;
+ }
  static int null_add_dev(struct nullb_device *dev)
  {
        struct nullb *nullb;
                if (rv)
                        goto out_cleanup_queues;
  
+               if (!null_setup_fault())
+                       goto out_cleanup_queues;
+               nullb->tag_set->timeout = 5 * HZ;
                nullb->q = blk_mq_init_queue(nullb->tag_set);
                if (IS_ERR(nullb->q)) {
                        rv = -ENOMEM;
                        rv = -ENOMEM;
                        goto out_cleanup_queues;
                }
+               if (!null_setup_fault())
+                       goto out_cleanup_blk_queue;
                blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
                blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+               blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
+               nullb->q->rq_timeout = 5 * HZ;
                rv = init_driver_queues(nullb);
                if (rv)
                        goto out_cleanup_blk_queue;
  
        sprintf(nullb->disk_name, "nullb%d", nullb->index);
  
-       if (dev->use_lightnvm)
-               rv = null_nvm_register(nullb);
-       else
-               rv = null_gendisk_register(nullb);
+       rv = null_gendisk_register(nullb);
        if (rv)
                goto out_cleanup_blk_queue;
  
@@@ -1938,18 -1812,6 +1812,6 @@@ static int __init null_init(void
                g_bs = PAGE_SIZE;
        }
  
-       if (g_use_lightnvm && g_bs != 4096) {
-               pr_warn("null_blk: LightNVM only supports 4k block size\n");
-               pr_warn("null_blk: defaults block size to 4k\n");
-               g_bs = 4096;
-       }
-       if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
-               pr_warn("null_blk: LightNVM only supported for blk-mq\n");
-               pr_warn("null_blk: defaults queue mode to blk-mq\n");
-               g_queue_mode = NULL_Q_MQ;
-       }
        if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
                if (g_submit_queues != nr_online_nodes) {
                        pr_warn("null_blk: submit_queues param is set to %u.\n",
                goto err_conf;
        }
  
-       if (g_use_lightnvm) {
-               ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
-                                                               0, 0, NULL);
-               if (!ppa_cache) {
-                       pr_err("null_blk: unable to create ppa cache\n");
-                       ret = -ENOMEM;
-                       goto err_ppa;
-               }
-       }
        for (i = 0; i < nr_devices; i++) {
                dev = null_alloc_dev();
                if (!dev) {
@@@ -2015,8 -1867,6 +1867,6 @@@ err_dev
                null_del_dev(nullb);
                null_free_dev(dev);
        }
-       kmem_cache_destroy(ppa_cache);
- err_ppa:
        unregister_blkdev(null_major, "nullb");
  err_conf:
        configfs_unregister_subsystem(&nullb_subsys);
@@@ -2047,8 -1897,6 +1897,6 @@@ static void __exit null_exit(void
  
        if (g_queue_mode == NULL_Q_MQ && shared_tags)
                blk_mq_free_tag_set(&tag_set);
-       kmem_cache_destroy(ppa_cache);
  }
  
  module_init(null_init);
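
The null_blk changes above replace the LightNVM glue with a fault-injection hook for request timeouts: a timeout= module parameter is parsed by setup_fault_attr() and consulted per request via should_fail(). The parameter should take the usual fault-attr tuple (<interval>,<probability>,<space>,<times>), so loading the module with timeout="1,100,0,-1" would make every request time out. A generic sketch of that wiring, with illustrative names rather than the driver's own:

#include <linux/fault-inject.h>
#include <linux/module.h>

/* Sketch of the fault-injection hook pattern; names are illustrative. */
static DECLARE_FAULT_ATTR(example_fail_attr);

static char example_fail_str[80];
module_param_string(fail, example_fail_str, sizeof(example_fail_str), 0444);

static bool example_should_fail(void)
{
	/* Only consult the attribute once it has been configured. */
	if (!example_fail_str[0])
		return false;
	return should_fail(&example_fail_attr, 1);
}

static int __init example_init(void)
{
	if (example_fail_str[0] &&
	    !setup_fault_attr(&example_fail_attr, example_fail_str))
		return -EINVAL;

	pr_info("example: would fail next op: %d\n", example_should_fail());
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");
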
diff --combined drivers/md/dm-crypt.c
index 554d60394c0663980d89c3bb84a48007f365470b,48332666fc38494fe813429b4fe450a2cadb3f2e..2ad429100d25df57974f2981e8b6cdcd587b7af6
@@@ -1446,7 -1446,6 +1446,6 @@@ static void crypt_free_buffer_pages(str
        bio_for_each_segment_all(bv, clone, i) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, cc->page_pool);
-               bv->bv_page = NULL;
        }
  }
  
@@@ -1954,15 -1953,10 +1953,15 @@@ static int crypt_setkey(struct crypt_co
        /* Ignore extra keys (which are used for IV etc) */
        subkey_size = crypt_subkey_size(cc);
  
 -      if (crypt_integrity_hmac(cc))
 +      if (crypt_integrity_hmac(cc)) {
 +              if (subkey_size < cc->key_mac_size)
 +                      return -EINVAL;
 +
                crypt_copy_authenckey(cc->authenc_key, cc->key,
                                      subkey_size - cc->key_mac_size,
                                      cc->key_mac_size);
 +      }
 +
        for (i = 0; i < cc->tfms_count; i++) {
                if (crypt_integrity_hmac(cc))
                        r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
@@@ -2058,6 -2052,9 +2057,6 @@@ static int crypt_set_keyring_key(struc
  
        ret = crypt_setkey(cc);
  
 -      /* wipe the kernel key payload copy in each case */
 -      memset(cc->key, 0, cc->key_size * sizeof(u8));
 -
        if (!ret) {
                set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
                kzfree(cc->key_string);
@@@ -2525,10 -2522,6 +2524,10 @@@ static int crypt_ctr_cipher(struct dm_t
                }
        }
  
 +      /* wipe the kernel key payload copy */
 +      if (cc->key_string)
 +              memset(cc->key, 0, cc->key_size * sizeof(u8));
 +
        return ret;
  }
  
@@@ -2746,7 -2739,6 +2745,7 @@@ static int crypt_ctr(struct dm_target *
                        cc->tag_pool_max_sectors * cc->on_disk_tag_size);
                if (!cc->tag_pool) {
                        ti->error = "Cannot allocate integrity tags mempool";
 +                      ret = -ENOMEM;
                        goto bad;
                }
  
@@@ -2968,9 -2960,6 +2967,9 @@@ static int crypt_message(struct dm_targ
                                return ret;
                        if (cc->iv_gen_ops && cc->iv_gen_ops->init)
                                ret = cc->iv_gen_ops->init(cc);
 +                      /* wipe the kernel key payload copy */
 +                      if (cc->key_string)
 +                              memset(cc->key, 0, cc->key_size * sizeof(u8));
                        return ret;
                }
                if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
@@@ -3017,7 -3006,7 +3016,7 @@@ static void crypt_io_hints(struct dm_ta
  
  static struct target_type crypt_target = {
        .name   = "crypt",
 -      .version = {1, 18, 0},
 +      .version = {1, 18, 1},
        .module = THIS_MODULE,
        .ctr    = crypt_ctr,
        .dtr    = crypt_dtr,
diff --combined drivers/nvme/host/core.c
index 839650e0926af1aaaf21bafbdc1baa79cf907d76,b3af8e914570ab44fad02bcd8ba9bcbb065670d4..e8104871cbbf753a19f4d6915601b81e3350d686
@@@ -29,6 -29,9 +29,9 @@@
  #include <linux/pm_qos.h>
  #include <asm/unaligned.h>
  
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
  #include "nvme.h"
  #include "fabrics.h"
  
@@@ -65,9 -68,26 +68,26 @@@ static bool streams
  module_param(streams, bool, 0644);
  MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
  
+ /*
+  * nvme_wq - hosts nvme related works that are not reset or delete
+  * nvme_reset_wq - hosts nvme reset works
+  * nvme_delete_wq - hosts nvme delete works
+  *
+  * nvme_wq will host works such as scan, aen handling, fw activation,
+  * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+  * runs reset works which also flush works hosted on nvme_wq for
+  * serialization purposes. nvme_delete_wq hosts controller deletion
+  * works which flush reset works for serialization.
+  */
  struct workqueue_struct *nvme_wq;
  EXPORT_SYMBOL_GPL(nvme_wq);
  
+ struct workqueue_struct *nvme_reset_wq;
+ EXPORT_SYMBOL_GPL(nvme_reset_wq);
+ struct workqueue_struct *nvme_delete_wq;
+ EXPORT_SYMBOL_GPL(nvme_delete_wq);
  static DEFINE_IDA(nvme_subsystems_ida);
  static LIST_HEAD(nvme_subsystems);
  static DEFINE_MUTEX(nvme_subsystems_lock);
@@@ -89,13 -109,13 +109,13 @@@ int nvme_reset_ctrl(struct nvme_ctrl *c
  {
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
                return -EBUSY;
-       if (!queue_work(nvme_wq, &ctrl->reset_work))
+       if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
                return -EBUSY;
        return 0;
  }
  EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
  
- static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
  {
        int ret;
  
                flush_work(&ctrl->reset_work);
        return ret;
  }
+ EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
  
  static void nvme_delete_ctrl_work(struct work_struct *work)
  {
@@@ -122,7 -143,7 +143,7 @@@ int nvme_delete_ctrl(struct nvme_ctrl *
  {
        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
                return -EBUSY;
-       if (!queue_work(nvme_wq, &ctrl->delete_work))
+       if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
                return -EBUSY;
        return 0;
  }
@@@ -157,13 -178,20 +178,20 @@@ static blk_status_t nvme_error_status(s
                return BLK_STS_OK;
        case NVME_SC_CAP_EXCEEDED:
                return BLK_STS_NOSPC;
+       case NVME_SC_LBA_RANGE:
+               return BLK_STS_TARGET;
+       case NVME_SC_BAD_ATTRIBUTES:
        case NVME_SC_ONCS_NOT_SUPPORTED:
+       case NVME_SC_INVALID_OPCODE:
+       case NVME_SC_INVALID_FIELD:
+       case NVME_SC_INVALID_NS:
                return BLK_STS_NOTSUPP;
        case NVME_SC_WRITE_FAULT:
        case NVME_SC_READ_ERROR:
        case NVME_SC_UNWRITTEN_BLOCK:
        case NVME_SC_ACCESS_DENIED:
        case NVME_SC_READ_ONLY:
+       case NVME_SC_COMPARE_FAILED:
                return BLK_STS_MEDIUM;
        case NVME_SC_GUARD_CHECK:
        case NVME_SC_APPTAG_CHECK:
@@@ -190,8 -218,12 +218,12 @@@ static inline bool nvme_req_needs_retry
  
  void nvme_complete_rq(struct request *req)
  {
-       if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
-               if (nvme_req_needs_failover(req)) {
+       blk_status_t status = nvme_error_status(req);
+       trace_nvme_complete_rq(req);
+       if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
+               if (nvme_req_needs_failover(req, status)) {
                        nvme_failover_req(req);
                        return;
                }
                        return;
                }
        }
-       blk_mq_end_request(req, nvme_error_status(req));
+       blk_mq_end_request(req, status);
  }
  EXPORT_SYMBOL_GPL(nvme_complete_rq);
  
@@@ -232,6 -263,15 +263,15 @@@ bool nvme_change_ctrl_state(struct nvme
  
        old_state = ctrl->state;
        switch (new_state) {
+       case NVME_CTRL_ADMIN_ONLY:
+               switch (old_state) {
+               case NVME_CTRL_RECONNECTING:
+                       changed = true;
+                       /* FALLTHRU */
+               default:
+                       break;
+               }
+               break;
        case NVME_CTRL_LIVE:
                switch (old_state) {
                case NVME_CTRL_NEW:
                switch (old_state) {
                case NVME_CTRL_NEW:
                case NVME_CTRL_LIVE:
+               case NVME_CTRL_ADMIN_ONLY:
                        changed = true;
                        /* FALLTHRU */
                default:
        case NVME_CTRL_DELETING:
                switch (old_state) {
                case NVME_CTRL_LIVE:
+               case NVME_CTRL_ADMIN_ONLY:
                case NVME_CTRL_RESETTING:
                case NVME_CTRL_RECONNECTING:
                        changed = true;
@@@ -591,6 -633,10 +633,10 @@@ blk_status_t nvme_setup_cmd(struct nvme
        }
  
        cmd->common.command_id = req->tag;
+       if (ns)
+               trace_nvme_setup_nvm_cmd(req->q->id, cmd);
+       else
+               trace_nvme_setup_admin_cmd(cmd);
        return ret;
  }
  EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@@ -1217,16 -1263,27 +1263,27 @@@ static int nvme_open(struct block_devic
  #ifdef CONFIG_NVME_MULTIPATH
        /* should never be called due to GENHD_FL_HIDDEN */
        if (WARN_ON_ONCE(ns->head->disk))
-               return -ENXIO;
+               goto fail;
  #endif
        if (!kref_get_unless_zero(&ns->kref))
-               return -ENXIO;
+               goto fail;
+       if (!try_module_get(ns->ctrl->ops->module))
+               goto fail_put_ns;
        return 0;
+ fail_put_ns:
+       nvme_put_ns(ns);
+ fail:
+       return -ENXIO;
  }
  
  static void nvme_release(struct gendisk *disk, fmode_t mode)
  {
-       nvme_put_ns(disk->private_data);
+       struct nvme_ns *ns = disk->private_data;
+       module_put(ns->ctrl->ops->module);
+       nvme_put_ns(ns);
  }
  
  static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@@ -1287,7 -1344,7 +1344,7 @@@ static void nvme_config_discard(struct 
        BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
                        NVME_DSM_MAX_RANGES);
  
 -      queue->limits.discard_alignment = size;
 +      queue->limits.discard_alignment = 0;
        queue->limits.discard_granularity = size;
  
        blk_queue_max_discard_sectors(queue, UINT_MAX);
@@@ -1335,7 -1392,6 +1392,7 @@@ static void nvme_update_disk_info(struc
                struct nvme_ns *ns, struct nvme_id_ns *id)
  {
        sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
 +      unsigned short bs = 1 << ns->lba_shift;
        unsigned stream_alignment = 0;
  
        if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
        blk_mq_freeze_queue(disk->queue);
        blk_integrity_unregister(disk);
  
 -      blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
 +      blk_queue_logical_block_size(disk->queue, bs);
 +      blk_queue_physical_block_size(disk->queue, bs);
 +      blk_queue_io_min(disk->queue, bs);
 +
        if (ns->ms && !ns->ext &&
            (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
                nvme_init_integrity(disk, ns->ms, ns->pi_type);
@@@ -1709,8 -1762,7 +1766,8 @@@ static void nvme_set_queue_limits(struc
                blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
                blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
        }
 -      if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
 +      if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
 +          is_power_of_2(ctrl->max_hw_sectors))
                blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
        blk_queue_virt_boundary(q, ctrl->page_size - 1);
        if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
@@@ -2052,6 -2104,22 +2109,22 @@@ static const struct attribute_group *nv
        NULL,
  };
  
+ static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+ {
+       int count = 0;
+       struct nvme_ctrl *ctrl;
+       mutex_lock(&subsys->lock);
+       list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+               if (ctrl->state != NVME_CTRL_DELETING &&
+                   ctrl->state != NVME_CTRL_DEAD)
+                       count++;
+       }
+       mutex_unlock(&subsys->lock);
+       return count;
+ }
  static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
  {
        struct nvme_subsystem *subsys, *found;
                 * Verify that the subsystem actually supports multiple
                 * controllers, else bail out.
                 */
-               if (!(id->cmic & (1 << 1))) {
+               if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
                        dev_err(ctrl->device,
                                "ignoring ctrl due to duplicate subnqn (%s).\n",
                                found->subnqn);
@@@ -2257,7 -2325,7 +2330,7 @@@ int nvme_init_identify(struct nvme_ctr
                                                 shutdown_timeout, 60);
  
                if (ctrl->shutdown_timeout != shutdown_timeout)
-                       dev_warn(ctrl->device,
+                       dev_info(ctrl->device,
                                 "Shutdown timeout set to %u seconds\n",
                                 ctrl->shutdown_timeout);
        } else
@@@ -2341,8 -2409,14 +2414,14 @@@ static int nvme_dev_open(struct inode *
        struct nvme_ctrl *ctrl =
                container_of(inode->i_cdev, struct nvme_ctrl, cdev);
  
-       if (ctrl->state != NVME_CTRL_LIVE)
+       switch (ctrl->state) {
+       case NVME_CTRL_LIVE:
+       case NVME_CTRL_ADMIN_ONLY:
+               break;
+       default:
                return -EWOULDBLOCK;
+       }
        file->private_data = ctrl;
        return 0;
  }
@@@ -2606,6 -2680,7 +2685,7 @@@ static ssize_t nvme_sysfs_show_state(st
        static const char *const state_name[] = {
                [NVME_CTRL_NEW]         = "new",
                [NVME_CTRL_LIVE]        = "live",
+               [NVME_CTRL_ADMIN_ONLY]  = "only-admin",
                [NVME_CTRL_RESETTING]   = "resetting",
                [NVME_CTRL_RECONNECTING]= "reconnecting",
                [NVME_CTRL_DELETING]    = "deleting",
@@@ -2874,6 -2949,7 +2954,6 @@@ static void nvme_alloc_ns(struct nvme_c
  
        blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
        nvme_set_queue_limits(ctrl, ns->queue);
 -      nvme_setup_streams_ns(ctrl, ns);
  
        id = nvme_identify_ns(ctrl, nsid);
        if (!id)
  
        if (nvme_init_ns_head(ns, nsid, id, &new))
                goto out_free_id;
 +      nvme_setup_streams_ns(ctrl, ns);
        
  #ifdef CONFIG_NVME_MULTIPATH
        /*
@@@ -2970,6 -3045,8 +3050,6 @@@ static void nvme_ns_remove(struct nvme_
                return;
  
        if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
 -              if (blk_get_integrity(ns->disk))
 -                      blk_integrity_unregister(ns->disk);
                nvme_mpath_remove_disk_links(ns);
                sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
                                        &nvme_ns_id_attr_group);
                        nvme_nvm_unregister_sysfs(ns);
                del_gendisk(ns->disk);
                blk_cleanup_queue(ns->queue);
 +              if (blk_get_integrity(ns->disk))
 +                      blk_integrity_unregister(ns->disk);
        }
  
        mutex_lock(&ns->ctrl->subsys->lock);
        mutex_unlock(&ns->ctrl->namespaces_mutex);
  
        synchronize_srcu(&ns->head->srcu);
 +      nvme_mpath_check_last_path(ns);
        nvme_put_ns(ns);
  }
  
@@@ -3079,6 -3153,8 +3159,8 @@@ static void nvme_scan_work(struct work_
        if (ctrl->state != NVME_CTRL_LIVE)
                return;
  
+       WARN_ON_ONCE(!ctrl->tagset);
        if (nvme_identify_ctrl(ctrl, &id))
                return;
  
  void nvme_queue_scan(struct nvme_ctrl *ctrl)
  {
        /*
-        * Do not queue new scan work when a controller is reset during
-        * removal.
+        * Only queue new scan work when admin and IO queues are both alive
         */
        if (ctrl->state == NVME_CTRL_LIVE)
                queue_work(nvme_wq, &ctrl->scan_work);
@@@ -3477,16 -3552,26 +3558,26 @@@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset)
  
  int __init nvme_core_init(void)
  {
-       int result;
+       int result = -ENOMEM;
  
        nvme_wq = alloc_workqueue("nvme-wq",
                        WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
        if (!nvme_wq)
-               return -ENOMEM;
+               goto out;
+       nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
+                       WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+       if (!nvme_reset_wq)
+               goto destroy_wq;
+       nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
+                       WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+       if (!nvme_delete_wq)
+               goto destroy_reset_wq;
  
        result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
        if (result < 0)
-               goto destroy_wq;
+               goto destroy_delete_wq;
  
        nvme_class = class_create(THIS_MODULE, "nvme");
        if (IS_ERR(nvme_class)) {
@@@ -3505,8 -3590,13 +3596,13 @@@ destroy_class
        class_destroy(nvme_class);
  unregister_chrdev:
        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+ destroy_delete_wq:
+       destroy_workqueue(nvme_delete_wq);
+ destroy_reset_wq:
+       destroy_workqueue(nvme_reset_wq);
  destroy_wq:
        destroy_workqueue(nvme_wq);
+ out:
        return result;
  }
  
@@@ -3516,6 -3606,8 +3612,8 @@@ void nvme_core_exit(void
        class_destroy(nvme_subsys_class);
        class_destroy(nvme_class);
        unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+       destroy_workqueue(nvme_delete_wq);
+       destroy_workqueue(nvme_reset_wq);
        destroy_workqueue(nvme_wq);
  }
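
The nvme core now spreads its work over three dedicated workqueues (see the nvme_wq/nvme_reset_wq/nvme_delete_wq comment near the top of this file's diff): reset work may flush scan/AEN work, and delete work may flush reset work, but nothing ever flushes work queued on its own workqueue. A self-contained sketch of that layering, with illustrative names:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *inner_wq;	/* scan/AEN-style work */
static struct workqueue_struct *outer_wq;	/* reset-style work */

static void inner_fn(struct work_struct *work)
{
	pr_info("inner work ran\n");
}
static DECLARE_WORK(inner_work, inner_fn);

static void outer_fn(struct work_struct *work)
{
	/* Safe: outer work flushes inner work queued on a different wq. */
	flush_work(&inner_work);
	pr_info("outer work ran after inner completed\n");
}
static DECLARE_WORK(outer_work, outer_fn);

static int __init wq_split_init(void)
{
	inner_wq = alloc_workqueue("example-inner",
				   WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!inner_wq)
		return -ENOMEM;
	outer_wq = alloc_workqueue("example-outer",
				   WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!outer_wq) {
		destroy_workqueue(inner_wq);
		return -ENOMEM;
	}
	queue_work(inner_wq, &inner_work);
	queue_work(outer_wq, &outer_work);
	return 0;
}

static void __exit wq_split_exit(void)
{
	destroy_workqueue(outer_wq);
	destroy_workqueue(inner_wq);
}

module_init(wq_split_init);
module_exit(wq_split_exit);
MODULE_LICENSE("GPL");
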
  
diff --combined drivers/nvme/host/fabrics.c
index 894c2ccb3891e0b83e1c839f0b08f4cba5d179ae,9cee72a80472946d02899d92db08d51d021ef57a..5dd4ceefed8fe0d0897aa8dadb1d266174b2eb02
@@@ -74,7 -74,6 +74,7 @@@ static struct nvmf_host *nvmf_host_defa
                return NULL;
  
        kref_init(&host->ref);
 +      uuid_gen(&host->id);
        snprintf(host->nqn, NVMF_NQN_SIZE,
                "nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id);
  
@@@ -493,7 -492,7 +493,7 @@@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect
   */
  int nvmf_register_transport(struct nvmf_transport_ops *ops)
  {
-       if (!ops->create_ctrl)
+       if (!ops->create_ctrl || !ops->module)
                return -EINVAL;
  
        down_write(&nvmf_transports_rwsem);
@@@ -739,11 -738,14 +739,14 @@@ static int nvmf_parse_options(struct nv
                                ret = -ENOMEM;
                                goto out;
                        }
-                       if (uuid_parse(p, &hostid)) {
+                       ret = uuid_parse(p, &hostid);
+                       if (ret) {
                                pr_err("Invalid hostid %s\n", p);
                                ret = -EINVAL;
+                               kfree(p);
                                goto out;
                        }
+                       kfree(p);
                        break;
                case NVMF_OPT_DUP_CONNECT:
                        opts->duplicate_connect = true;
@@@ -869,32 -871,41 +872,41 @@@ nvmf_create_ctrl(struct device *dev, co
                goto out_unlock;
        }
  
+       if (!try_module_get(ops->module)) {
+               ret = -EBUSY;
+               goto out_unlock;
+       }
        ret = nvmf_check_required_opts(opts, ops->required_opts);
        if (ret)
-               goto out_unlock;
+               goto out_module_put;
        ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
                                ops->allowed_opts | ops->required_opts);
        if (ret)
-               goto out_unlock;
+               goto out_module_put;
  
        ctrl = ops->create_ctrl(dev, opts);
        if (IS_ERR(ctrl)) {
                ret = PTR_ERR(ctrl);
-               goto out_unlock;
+               goto out_module_put;
        }
  
        if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
                dev_warn(ctrl->device,
                        "controller returned incorrect NQN: \"%s\".\n",
                        ctrl->subsys->subnqn);
+               module_put(ops->module);
                up_read(&nvmf_transports_rwsem);
                nvme_delete_ctrl_sync(ctrl);
                return ERR_PTR(-EINVAL);
        }
  
+       module_put(ops->module);
        up_read(&nvmf_transports_rwsem);
        return ctrl;
  
+ out_module_put:
+       module_put(ops->module);
  out_unlock:
        up_read(&nvmf_transports_rwsem);
  out_free_opts:
diff --combined drivers/nvme/host/fc.c
index 794e66e4aa20115f4dc3a6b5fc12f706b2040bf4,b76ba4629e02a41b811fe9344d3596a7084427f1..99bf51c7e51325e25ead4f4bb0dfbeff7972e526
@@@ -2921,6 -2921,9 +2921,9 @@@ nvme_fc_delete_association(struct nvme_
        __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
        nvme_fc_free_queue(&ctrl->queues[0]);
  
+       /* re-enable the admin_q so anything new can fast fail */
+       blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_fc_ctlr_inactive_on_rport(ctrl);
  }
  
@@@ -2935,6 -2938,9 +2938,9 @@@ nvme_fc_delete_ctrl(struct nvme_ctrl *n
         * waiting for io to terminate
         */
        nvme_fc_delete_association(ctrl);
+       /* resume the io queues so that things will fast fail */
+       nvme_start_queues(nctrl);
  }
  
  static void
@@@ -3221,6 -3227,7 +3227,6 @@@ nvme_fc_init_ctrl(struct device *dev, s
  
                /* initiate nvme ctrl ref counting teardown */
                nvme_uninit_ctrl(&ctrl->ctrl);
 -              nvme_put_ctrl(&ctrl->ctrl);
  
                /* Remove core ctrl ref. */
                nvme_put_ctrl(&ctrl->ctrl);
@@@ -3380,6 -3387,7 +3386,7 @@@ nvme_fc_create_ctrl(struct device *dev
  
  static struct nvmf_transport_ops nvme_fc_transport = {
        .name           = "fc",
+       .module         = THIS_MODULE,
        .required_opts  = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
        .allowed_opts   = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
        .create_ctrl    = nvme_fc_create_ctrl,
diff --combined drivers/nvme/host/nvme.h
index a00eabd0642738bbdbaf0b11d7f8e1747c996e3c,8e7fc1b041b7b1c9db38b3ae089f9db50fa9be2a..8e4550fa08f8bd775e7e5e8e0c169e287845509a
@@@ -32,6 -32,8 +32,8 @@@ extern unsigned int admin_timeout
  #define NVME_KATO_GRACE               10
  
  extern struct workqueue_struct *nvme_wq;
+ extern struct workqueue_struct *nvme_reset_wq;
+ extern struct workqueue_struct *nvme_delete_wq;
  
  enum {
        NVME_NS_LBA             = 0,
@@@ -119,6 -121,7 +121,7 @@@ static inline struct nvme_request *nvme
  enum nvme_ctrl_state {
        NVME_CTRL_NEW,
        NVME_CTRL_LIVE,
+       NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
        NVME_CTRL_RESETTING,
        NVME_CTRL_RECONNECTING,
        NVME_CTRL_DELETING,
@@@ -393,6 -396,7 +396,7 @@@ int nvme_set_queue_count(struct nvme_ct
  void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
  void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
  int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
  int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
  int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
  
@@@ -401,7 -405,7 +405,7 @@@ extern const struct block_device_operat
  
  #ifdef CONFIG_NVME_MULTIPATH
  void nvme_failover_req(struct request *req);
- bool nvme_req_needs_failover(struct request *req);
+ bool nvme_req_needs_failover(struct request *req, blk_status_t error);
  void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
  int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
  void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@@ -417,20 -421,12 +421,21 @@@ static inline void nvme_mpath_clear_cur
                rcu_assign_pointer(head->current_path, NULL);
  }
  struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 +
 +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 +{
 +      struct nvme_ns_head *head = ns->head;
 +
 +      if (head->disk && list_empty(&head->list))
 +              kblockd_schedule_work(&head->requeue_work);
 +}
 +
  #else
  static inline void nvme_failover_req(struct request *req)
  {
  }
- static inline bool nvme_req_needs_failover(struct request *req)
+ static inline bool nvme_req_needs_failover(struct request *req,
+                                          blk_status_t error)
  {
        return false;
  }
@@@ -455,9 -451,6 +460,9 @@@ static inline void nvme_mpath_remove_di
  {
  }
  static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 +{
 +}
 +static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
  {
  }
  #endif /* CONFIG_NVME_MULTIPATH */
diff --combined drivers/nvme/host/pci.c
index 4276ebfff22ba00fd90e8c241cc7edd56deca353,0bc6a9e48c8ea933e4b004787ecf9fafb3639688..6fe7af00a1f42a7dcb3354ac49db499cef6f9c88
@@@ -75,7 -75,7 +75,7 @@@ static void nvme_dev_disable(struct nvm
   * Represents an NVM Express device.  Each nvme_dev is a PCI function.
   */
  struct nvme_dev {
-       struct nvme_queue **queues;
+       struct nvme_queue *queues;
        struct blk_mq_tag_set tagset;
        struct blk_mq_tag_set admin_tagset;
        u32 __iomem *dbs;
@@@ -365,7 -365,7 +365,7 @@@ static int nvme_admin_init_hctx(struct 
                                unsigned int hctx_idx)
  {
        struct nvme_dev *dev = data;
-       struct nvme_queue *nvmeq = dev->queues[0];
+       struct nvme_queue *nvmeq = &dev->queues[0];
  
        WARN_ON(hctx_idx != 0);
        WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@@ -387,7 -387,7 +387,7 @@@ static int nvme_init_hctx(struct blk_mq
                          unsigned int hctx_idx)
  {
        struct nvme_dev *dev = data;
-       struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+       struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
  
        if (!nvmeq->tags)
                nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@@ -403,7 -403,7 +403,7 @@@ static int nvme_init_request(struct blk
        struct nvme_dev *dev = set->driver_data;
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
        int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-       struct nvme_queue *nvmeq = dev->queues[queue_idx];
+       struct nvme_queue *nvmeq = &dev->queues[queue_idx];
  
        BUG_ON(!nvmeq);
        iod->nvmeq = nvmeq;
@@@ -448,34 -448,12 +448,34 @@@ static void **nvme_pci_iod_list(struct 
        return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
  }
  
 +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 +{
 +      struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 +      int nseg = blk_rq_nr_phys_segments(req);
 +      unsigned int avg_seg_size;
 +
 +      if (nseg == 0)
 +              return false;
 +
 +      avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
 +
 +      if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
 +              return false;
 +      if (!iod->nvmeq->qid)
 +              return false;
 +      if (!sgl_threshold || avg_seg_size < sgl_threshold)
 +              return false;
 +      return true;
 +}
 +
  static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
        int nseg = blk_rq_nr_phys_segments(rq);
        unsigned int size = blk_rq_payload_bytes(rq);
  
 +      iod->use_sgl = nvme_pci_use_sgls(dev, rq);
 +
        if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
                size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
                                iod->use_sgl);
@@@ -626,6 -604,8 +626,6 @@@ static blk_status_t nvme_pci_setup_prps
        dma_addr_t prp_dma;
        int nprps, i;
  
 -      iod->use_sgl = false;
 -
        length -= (page_size - offset);
        if (length <= 0) {
                iod->first_dma = 0;
@@@ -725,19 -705,22 +725,19 @@@ static void nvme_pci_sgl_set_seg(struc
  }
  
  static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
 -              struct request *req, struct nvme_rw_command *cmd)
 +              struct request *req, struct nvme_rw_command *cmd, int entries)
  {
        struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 -      int length = blk_rq_payload_bytes(req);
        struct dma_pool *pool;
        struct nvme_sgl_desc *sg_list;
        struct scatterlist *sg = iod->sg;
 -      int entries = iod->nents, i = 0;
        dma_addr_t sgl_dma;
 -
 -      iod->use_sgl = true;
 +      int i = 0;
  
        /* setting the transfer type as SGL */
        cmd->flags = NVME_CMD_SGL_METABUF;
  
 -      if (length == sg_dma_len(sg)) {
 +      if (entries == 1) {
                nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
                return BLK_STS_OK;
        }
                }
  
                nvme_pci_sgl_set_data(&sg_list[i++], sg);
 -
 -              length -= sg_dma_len(sg);
                sg = sg_next(sg);
 -              entries--;
 -      } while (length > 0);
 +      } while (--entries > 0);
  
 -      WARN_ON(entries > 0);
        return BLK_STS_OK;
  }
  
 -static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 -{
 -      struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 -      unsigned int avg_seg_size;
 -
 -      avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
 -                      blk_rq_nr_phys_segments(req));
 -
 -      if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
 -              return false;
 -      if (!iod->nvmeq->qid)
 -              return false;
 -      if (!sgl_threshold || avg_seg_size < sgl_threshold)
 -              return false;
 -      return true;
 -}
 -
  static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
                struct nvme_command *cmnd)
  {
        enum dma_data_direction dma_dir = rq_data_dir(req) ?
                        DMA_TO_DEVICE : DMA_FROM_DEVICE;
        blk_status_t ret = BLK_STS_IOERR;
 +      int nr_mapped;
  
        sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
        iod->nents = blk_rq_map_sg(q, req, iod->sg);
                goto out;
  
        ret = BLK_STS_RESOURCE;
 -      if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
 -                              DMA_ATTR_NO_WARN))
 +      nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
 +                      DMA_ATTR_NO_WARN);
 +      if (!nr_mapped)
                goto out;
  
 -      if (nvme_pci_use_sgls(dev, req))
 -              ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
 +      if (iod->use_sgl)
 +              ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
        else
                ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
  
@@@ -1044,7 -1046,7 +1044,7 @@@ static int nvme_poll(struct blk_mq_hw_c
  static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
  {
        struct nvme_dev *dev = to_nvme_dev(ctrl);
-       struct nvme_queue *nvmeq = dev->queues[0];
+       struct nvme_queue *nvmeq = &dev->queues[0];
        struct nvme_command c;
  
        memset(&c, 0, sizeof(c));
@@@ -1138,9 -1140,14 +1138,14 @@@ static bool nvme_should_reset(struct nv
         */
        bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
  
-       /* If there is a reset ongoing, we shouldn't reset again. */
-       if (dev->ctrl.state == NVME_CTRL_RESETTING)
+       /* If there is a reset/reinit ongoing, we shouldn't reset again. */
+       switch (dev->ctrl.state) {
+       case NVME_CTRL_RESETTING:
+       case NVME_CTRL_RECONNECTING:
                return false;
+       default:
+               break;
+       }
  
        /* We shouldn't reset unless the controller is on fatal error state
         * _or_ if we lost the communication with it.
@@@ -1280,7 -1287,6 +1285,6 @@@ static void nvme_free_queue(struct nvme
        if (nvmeq->sq_cmds)
                dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
                                        nvmeq->sq_cmds, nvmeq->sq_dma_addr);
-       kfree(nvmeq);
  }
  
  static void nvme_free_queues(struct nvme_dev *dev, int lowest)
        int i;
  
        for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
-               struct nvme_queue *nvmeq = dev->queues[i];
                dev->ctrl.queue_count--;
-               dev->queues[i] = NULL;
-               nvme_free_queue(nvmeq);
+               nvme_free_queue(&dev->queues[i]);
        }
  }
  
@@@ -1323,12 -1327,7 +1325,7 @@@ static int nvme_suspend_queue(struct nv
  
  static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
  {
-       struct nvme_queue *nvmeq = dev->queues[0];
-       if (!nvmeq)
-               return;
-       if (nvme_suspend_queue(nvmeq))
-               return;
+       struct nvme_queue *nvmeq = &dev->queues[0];
  
        if (shutdown)
                nvme_shutdown_ctrl(&dev->ctrl);
@@@ -1367,7 -1366,7 +1364,7 @@@ static int nvme_cmb_qdepth(struct nvme_
  static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
                                int qid, int depth)
  {
-       if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+       if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
                unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
                                                      dev->ctrl.page_size);
                nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
        return 0;
  }
  
- static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
-                                                       int depth, int node)
+ static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
+               int depth, int node)
  {
-       struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
-                                                       node);
-       if (!nvmeq)
-               return NULL;
+       struct nvme_queue *nvmeq = &dev->queues[qid];
+       if (dev->ctrl.queue_count > qid)
+               return 0;
  
        nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
                                          &nvmeq->cq_dma_addr, GFP_KERNEL);
        nvmeq->q_depth = depth;
        nvmeq->qid = qid;
        nvmeq->cq_vector = -1;
-       dev->queues[qid] = nvmeq;
        dev->ctrl.queue_count++;
  
-       return nvmeq;
+       return 0;
  
   free_cqdma:
        dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
                                                        nvmeq->cq_dma_addr);
   free_nvmeq:
-       kfree(nvmeq);
-       return NULL;
+       return -ENOMEM;
  }
  
  static int queue_request_irq(struct nvme_queue *nvmeq)
@@@ -1590,14 -1587,12 +1585,12 @@@ static int nvme_pci_configure_admin_que
        if (result < 0)
                return result;
  
-       nvmeq = dev->queues[0];
-       if (!nvmeq) {
-               nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
-                                       dev_to_node(dev->dev));
-               if (!nvmeq)
-                       return -ENOMEM;
-       }
+       result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+                       dev_to_node(dev->dev));
+       if (result)
+               return result;
  
+       nvmeq = &dev->queues[0];
        aqa = nvmeq->q_depth - 1;
        aqa |= aqa << 16;
  
@@@ -1627,7 -1622,7 +1620,7 @@@ static int nvme_create_io_queues(struc
  
        for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
                /* vector == qid - 1, match nvme_create_queue */
-               if (!nvme_alloc_queue(dev, i, dev->q_depth,
+               if (nvme_alloc_queue(dev, i, dev->q_depth,
                     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
                        ret = -ENOMEM;
                        break;
  
        max = min(dev->max_qid, dev->ctrl.queue_count - 1);
        for (i = dev->online_queues; i <= max; i++) {
-               ret = nvme_create_queue(dev->queues[i], i);
+               ret = nvme_create_queue(&dev->queues[i], i);
                if (ret)
                        break;
        }
  
        /*
         * Ignore failing Create SQ/CQ commands, we can continue with less
-        * than the desired aount of queues, and even a controller without
-        * I/O queues an still be used to issue admin commands.  This might
+        * than the desired amount of queues, and even a controller without
+        * I/O queues can still be used to issue admin commands.  This might
         * be useful to upgrade a buggy firmware for example.
         */
        return ret >= 0 ? 0 : ret;
@@@ -1661,30 -1656,40 +1654,40 @@@ static ssize_t nvme_cmb_show(struct dev
  }
  static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
  
- static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+ static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
  {
-       u64 szu, size, offset;
+       u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
+       return 1ULL << (12 + 4 * szu);
+ }
+ static u32 nvme_cmb_size(struct nvme_dev *dev)
+ {
+       return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
+ }
+ static void nvme_map_cmb(struct nvme_dev *dev)
+ {
+       u64 size, offset;
        resource_size_t bar_size;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
-       void __iomem *cmb;
        int bar;
  
        dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
-       if (!(NVME_CMB_SZ(dev->cmbsz)))
-               return NULL;
+       if (!dev->cmbsz)
+               return;
        dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
  
        if (!use_cmb_sqes)
-               return NULL;
+               return;
  
-       szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
-       size = szu * NVME_CMB_SZ(dev->cmbsz);
-       offset = szu * NVME_CMB_OFST(dev->cmbloc);
+       size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
+       offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
        bar = NVME_CMB_BIR(dev->cmbloc);
        bar_size = pci_resource_len(pdev, bar);
  
        if (offset > bar_size)
-               return NULL;
+               return;
  
        /*
         * Controllers may support a CMB size larger than their BAR,
        if (size > bar_size - offset)
                size = bar_size - offset;
  
-       cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
-       if (!cmb)
-               return NULL;
+       dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
+       if (!dev->cmb)
+               return;
        dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
        dev->cmb_size = size;
-       return cmb;
+       if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
+                                   &dev_attr_cmb.attr, NULL))
+               dev_warn(dev->ctrl.device,
+                        "failed to add sysfs attribute for CMB\n");
  }
  
  static inline void nvme_release_cmb(struct nvme_dev *dev)
@@@ -1768,7 -1776,7 +1774,7 @@@ static int __nvme_alloc_host_mem(struc
        dma_addr_t descs_dma;
        int i = 0;
        void **bufs;
-       u64 size = 0, tmp;
+       u64 size, tmp;
  
        tmp = (preferred + chunk_size - 1);
        do_div(tmp, chunk_size);
@@@ -1851,7 -1859,7 +1857,7 @@@ static int nvme_setup_host_mem(struct n
        u64 preferred = (u64)dev->ctrl.hmpre * 4096;
        u64 min = (u64)dev->ctrl.hmmin * 4096;
        u32 enable_bits = NVME_HOST_MEM_ENABLE;
-       int ret = 0;
+       int ret;
  
        preferred = min(preferred, max);
        if (min > max) {
  
  static int nvme_setup_io_queues(struct nvme_dev *dev)
  {
-       struct nvme_queue *adminq = dev->queues[0];
+       struct nvme_queue *adminq = &dev->queues[0];
        struct pci_dev *pdev = to_pci_dev(dev->dev);
        int result, nr_io_queues;
        unsigned long size;
        if (nr_io_queues == 0)
                return 0;
  
-       if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+       if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
                result = nvme_cmb_qdepth(dev, nr_io_queues,
                                sizeof(struct nvme_command));
                if (result > 0)
@@@ -2005,9 -2013,9 +2011,9 @@@ static int nvme_delete_queue(struct nvm
        return 0;
  }
  
- static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
+ static void nvme_disable_io_queues(struct nvme_dev *dev)
  {
-       int pass;
+       int pass, queues = dev->online_queues - 1;
        unsigned long timeout;
        u8 opcode = nvme_admin_delete_sq;
  
   retry:
                timeout = ADMIN_TIMEOUT;
                for (; i > 0; i--, sent++)
-                       if (nvme_delete_queue(dev->queues[i], opcode))
+                       if (nvme_delete_queue(&dev->queues[i], opcode))
                                break;
  
                while (sent--) {
  }
  
  /*
-  * Return: error value if an error occurred setting up the queues or calling
-  * Identify Device.  0 if these succeeded, even if adding some of the
-  * namespaces failed.  At the moment, these failures are silent.  TBD which
-  * failures should be reported.
+  * return error value only when tagset allocation failed
   */
  static int nvme_dev_add(struct nvme_dev *dev)
  {
+       int ret;
        if (!dev->ctrl.tagset) {
                dev->tagset.ops = &nvme_mq_ops;
                dev->tagset.nr_hw_queues = dev->online_queues - 1;
                dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
                dev->tagset.driver_data = dev;
  
-               if (blk_mq_alloc_tag_set(&dev->tagset))
-                       return 0;
+               ret = blk_mq_alloc_tag_set(&dev->tagset);
+               if (ret) {
+                       dev_warn(dev->ctrl.device,
+                               "IO queues tagset allocation failed %d\n", ret);
+                       return ret;
+               }
                dev->ctrl.tagset = &dev->tagset;
  
                nvme_dbbuf_set(dev);
@@@ -2122,22 -2133,7 +2131,7 @@@ static int nvme_pci_enable(struct nvme_
                          "set queue depth=%u\n", dev->q_depth);
        }
  
-       /*
-        * CMBs can currently only exist on >=1.2 PCIe devices. We only
-        * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
-        * has no name we can pass NULL as final argument to
-        * sysfs_add_file_to_group.
-        */
-       if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
-               dev->cmb = nvme_map_cmb(dev);
-               if (dev->cmb) {
-                       if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
-                                                   &dev_attr_cmb.attr, NULL))
-                               dev_warn(dev->ctrl.device,
-                                        "failed to add sysfs attribute for CMB\n");
-               }
-       }
+       nvme_map_cmb(dev);
  
        pci_enable_pcie_error_reporting(pdev);
        pci_save_state(pdev);
@@@ -2170,7 -2166,7 +2164,7 @@@ static void nvme_pci_disable(struct nvm
  
  static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
  {
-       int i, queues;
+       int i;
        bool dead = true;
        struct pci_dev *pdev = to_pci_dev(dev->dev);
  
        }
        nvme_stop_queues(&dev->ctrl);
  
-       queues = dev->online_queues - 1;
-       for (i = dev->ctrl.queue_count - 1; i > 0; i--)
-               nvme_suspend_queue(dev->queues[i]);
-       if (dead) {
-               /* A device might become IO incapable very soon during
-                * probe, before the admin queue is configured. Thus,
-                * queue_count can be 0 here.
-                */
-               if (dev->ctrl.queue_count)
-                       nvme_suspend_queue(dev->queues[0]);
-       } else {
-               nvme_disable_io_queues(dev, queues);
+       if (!dead) {
+               nvme_disable_io_queues(dev);
                nvme_disable_admin_queue(dev, shutdown);
        }
+       for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
+               nvme_suspend_queue(&dev->queues[i]);
        nvme_pci_disable(dev);
  
        blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@@@ -2289,6 -2277,7 +2275,7 @@@ static void nvme_reset_work(struct work
                container_of(work, struct nvme_dev, ctrl.reset_work);
        bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
        int result = -ENODEV;
+       enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
  
        if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
                goto out;
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
  
+       /*
+        * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
+        * initializing procedure here.
+        */
+       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
+               dev_warn(dev->ctrl.device,
+                       "failed to mark controller RECONNECTING\n");
+               goto out;
+       }
        result = nvme_pci_enable(dev);
        if (result)
                goto out;
                dev_warn(dev->ctrl.device, "IO queues not created\n");
                nvme_kill_queues(&dev->ctrl);
                nvme_remove_namespaces(&dev->ctrl);
+               new_state = NVME_CTRL_ADMIN_ONLY;
        } else {
                nvme_start_queues(&dev->ctrl);
                nvme_wait_freeze(&dev->ctrl);
-               nvme_dev_add(dev);
+               /* hit this only when allocate tagset fails */
+               if (nvme_dev_add(dev))
+                       new_state = NVME_CTRL_ADMIN_ONLY;
                nvme_unfreeze(&dev->ctrl);
        }
  
-       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
-               dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+       /*
+        * If only admin queue live, keep it to do further investigation or
+        * recovery.
+        */
+       if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+               dev_warn(dev->ctrl.device,
+                       "failed to mark controller state %d\n", new_state);
                goto out;
        }
  
@@@ -2468,8 -2475,9 +2473,9 @@@ static int nvme_probe(struct pci_dev *p
        dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
        if (!dev)
                return -ENOMEM;
-       dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
-                                                       GFP_KERNEL, node);
+       dev->queues = kcalloc_node(num_possible_cpus() + 1,
+                       sizeof(struct nvme_queue), GFP_KERNEL, node);
        if (!dev->queues)
                goto free;
  
        if (result)
                goto release_pools;
  
-       nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
        dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
  
-       queue_work(nvme_wq, &dev->ctrl.reset_work);
+       nvme_reset_ctrl(&dev->ctrl);
        return 0;
  
   release_pools:
@@@ -2523,7 -2531,7 +2529,7 @@@ static void nvme_reset_prepare(struct p
  static void nvme_reset_done(struct pci_dev *pdev)
  {
        struct nvme_dev *dev = pci_get_drvdata(pdev);
-       nvme_reset_ctrl(&dev->ctrl);
+       nvme_reset_ctrl_sync(&dev->ctrl);
  }
  
  static void nvme_shutdown(struct pci_dev *pdev)
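
Most of the pci.c churn above comes from one data-structure change: dev->queues goes from an array of separately kzalloc'd pointers to a single flat array of struct nvme_queue allocated once at probe, so lookups become &dev->queues[i] and nvme_alloc_queue() no longer allocates or frees the structure itself. A condensed illustration of the new layout; the _sketch names and the stub queue structure are stand-ins, not the driver's definitions:

/* Stand-in structures; the real struct nvme_queue has many more fields. */
struct nvme_queue_sketch {
        int qid;
};

struct nvme_dev_sketch {
        struct nvme_queue_sketch *queues;       /* flat array, not array of pointers */
};

static int probe_alloc_sketch(struct nvme_dev_sketch *dev, int node)
{
        dev->queues = kcalloc_node(num_possible_cpus() + 1,
                                   sizeof(struct nvme_queue_sketch),
                                   GFP_KERNEL, node);
        return dev->queues ? 0 : -ENOMEM;
}

static struct nvme_queue_sketch *nth_queue_sketch(struct nvme_dev_sketch *dev, int qid)
{
        return &dev->queues[qid];               /* previously dev->queues[qid] */
}
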
diff --combined drivers/nvme/host/rdma.c
index 2a0bba7f50cf43bb76e9d1f3073e24ba1edd9e1c,6c2fdfa4c86a1d51ede0df382ef58d6e1d7364a1..2bc059f7d73c7da7ea13273aa9a0b92d1cbf2b63
@@@ -66,7 -66,6 +66,6 @@@ struct nvme_rdma_request 
        struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
        u32                     num_sge;
        int                     nents;
-       bool                    inline_data;
        struct ib_reg_wr        reg_wr;
        struct ib_cqe           reg_cqe;
        struct nvme_rdma_queue  *queue;
@@@ -974,18 -973,12 +973,18 @@@ static void nvme_rdma_error_recovery_wo
        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
        nvme_start_queues(&ctrl->ctrl);
  
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
 +              /* state change failure should never happen */
 +              WARN_ON_ONCE(1);
 +              return;
 +      }
 +
        nvme_rdma_reconnect_or_remove(ctrl);
  }
  
  static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
  {
 -      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                return;
  
        queue_work(nvme_wq, &ctrl->err_work);
@@@ -1092,7 -1085,6 +1091,6 @@@ static int nvme_rdma_map_sg_inline(stru
        sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
  
-       req->inline_data = true;
        req->num_sge++;
        return 0;
  }
@@@ -1164,7 -1156,6 +1162,6 @@@ static int nvme_rdma_map_data(struct nv
        int count, ret;
  
        req->num_sge = 1;
-       req->inline_data = false;
        refcount_set(&req->ref, 2); /* send and recv completions */
  
        c->common.flags |= NVME_CMD_SGL_METABUF;
@@@ -1759,12 -1750,6 +1756,12 @@@ static void nvme_rdma_reset_ctrl_work(s
        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_rdma_shutdown_ctrl(ctrl, false);
  
 +      if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
 +              /* state change failure should never happen */
 +              WARN_ON_ONCE(1);
 +              return;
 +      }
 +
        ret = nvme_rdma_configure_admin_queue(ctrl, false);
        if (ret)
                goto out_fail;
@@@ -2018,6 -2003,7 +2015,7 @@@ out_free_ctrl
  
  static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
+       .module         = THIS_MODULE,
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@@@ -2040,7 -2026,7 +2038,7 @@@ static void nvme_rdma_remove_one(struc
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);
  
-       flush_workqueue(nvme_wq);
+       flush_workqueue(nvme_delete_wq);
  }
  
  static struct ib_client nvme_rdma_ib_client = {
diff --combined drivers/nvme/target/fcloop.c
index 6a018a0bd6ce851306dd82e5c21e680c626f99d5,9f8a6726df91502b551c02db7018cb34469cc060..34712def81b15a566bb16a7e320ec7012796cf27
@@@ -204,6 -204,10 +204,10 @@@ struct fcloop_lport 
        struct completion unreg_done;
  };
  
+ struct fcloop_lport_priv {
+       struct fcloop_lport *lport;
+ };
  struct fcloop_rport {
        struct nvme_fc_remote_port *remoteport;
        struct nvmet_fc_target_port *targetport;
@@@ -238,21 -242,32 +242,32 @@@ struct fcloop_lsreq 
        int                             status;
  };
  
+ enum {
+       INI_IO_START            = 0,
+       INI_IO_ACTIVE           = 1,
+       INI_IO_ABORTED          = 2,
+       INI_IO_COMPLETED        = 3,
+ };
  struct fcloop_fcpreq {
        struct fcloop_tport             *tport;
        struct nvmefc_fcp_req           *fcpreq;
        spinlock_t                      reqlock;
        u16                             status;
+       u32                             inistate;
        bool                            active;
        bool                            aborted;
-       struct work_struct              work;
+       struct kref                     ref;
+       struct work_struct              fcp_rcv_work;
+       struct work_struct              abort_rcv_work;
+       struct work_struct              tio_done_work;
        struct nvmefc_tgt_fcp_req       tgt_fcp_req;
  };
  
  struct fcloop_ini_fcpreq {
        struct nvmefc_fcp_req           *fcpreq;
        struct fcloop_fcpreq            *tfcp_req;
-       struct work_struct              iniwork;
+       spinlock_t                      inilock;
  };
  
  static inline struct fcloop_lsreq *
@@@ -343,17 -358,122 +358,122 @@@ fcloop_xmt_ls_rsp(struct nvmet_fc_targe
        return 0;
  }
  
- /*
-  * FCP IO operation done by initiator abort.
-  * call back up initiator "done" flows.
-  */
  static void
- fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+ fcloop_tfcp_req_free(struct kref *ref)
  {
-       struct fcloop_ini_fcpreq *inireq =
-               container_of(work, struct fcloop_ini_fcpreq, iniwork);
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(ref, struct fcloop_fcpreq, ref);
  
-       inireq->fcpreq->done(inireq->fcpreq);
+       kfree(tfcp_req);
+ }
+ static void
+ fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
+ {
+       kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
+ }
+ static int
+ fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
+ {
+       return kref_get_unless_zero(&tfcp_req->ref);
+ }
+ static void
+ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
+                       struct fcloop_fcpreq *tfcp_req, int status)
+ {
+       struct fcloop_ini_fcpreq *inireq = NULL;
+       if (fcpreq) {
+               inireq = fcpreq->private;
+               spin_lock(&inireq->inilock);
+               inireq->tfcp_req = NULL;
+               spin_unlock(&inireq->inilock);
+               fcpreq->status = status;
+               fcpreq->done(fcpreq);
+       }
+       /* release original io reference on tgt struct */
+       fcloop_tfcp_req_put(tfcp_req);
+ }
+ static void
+ fcloop_fcp_recv_work(struct work_struct *work)
+ {
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+       struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+       int ret = 0;
+       bool aborted = false;
+       spin_lock(&tfcp_req->reqlock);
+       switch (tfcp_req->inistate) {
+       case INI_IO_START:
+               tfcp_req->inistate = INI_IO_ACTIVE;
+               break;
+       case INI_IO_ABORTED:
+               aborted = true;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
+       spin_unlock(&tfcp_req->reqlock);
+       if (unlikely(aborted))
+               ret = -ECANCELED;
+       else
+               ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+                               &tfcp_req->tgt_fcp_req,
+                               fcpreq->cmdaddr, fcpreq->cmdlen);
+       if (ret)
+               fcloop_call_host_done(fcpreq, tfcp_req, ret);
+       return;
+ }
+ static void
+ fcloop_fcp_abort_recv_work(struct work_struct *work)
+ {
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(work, struct fcloop_fcpreq, abort_rcv_work);
+       struct nvmefc_fcp_req *fcpreq;
+       bool completed = false;
+       spin_lock(&tfcp_req->reqlock);
+       fcpreq = tfcp_req->fcpreq;
+       switch (tfcp_req->inistate) {
+       case INI_IO_ABORTED:
+               break;
+       case INI_IO_COMPLETED:
+               completed = true;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
+       spin_unlock(&tfcp_req->reqlock);
+       if (unlikely(completed)) {
+               /* remove reference taken in original abort downcall */
+               fcloop_tfcp_req_put(tfcp_req);
+               return;
+       }
+       if (tfcp_req->tport->targetport)
+               nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+                                       &tfcp_req->tgt_fcp_req);
+       spin_lock(&tfcp_req->reqlock);
+       tfcp_req->fcpreq = NULL;
+       spin_unlock(&tfcp_req->reqlock);
+       fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+       /* call_host_done releases reference for abort downcall */
  }
  
  /*
@@@ -364,20 -484,15 +484,15 @@@ static voi
  fcloop_tgt_fcprqst_done_work(struct work_struct *work)
  {
        struct fcloop_fcpreq *tfcp_req =
-               container_of(work, struct fcloop_fcpreq, work);
-       struct fcloop_tport *tport = tfcp_req->tport;
+               container_of(work, struct fcloop_fcpreq, tio_done_work);
        struct nvmefc_fcp_req *fcpreq;
  
        spin_lock(&tfcp_req->reqlock);
        fcpreq = tfcp_req->fcpreq;
+       tfcp_req->inistate = INI_IO_COMPLETED;
        spin_unlock(&tfcp_req->reqlock);
  
-       if (tport->remoteport && fcpreq) {
-               fcpreq->status = tfcp_req->status;
-               fcpreq->done(fcpreq);
-       }
-       kfree(tfcp_req);
+       fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
  }
  
  
@@@ -390,7 -505,6 +505,6 @@@ fcloop_fcp_req(struct nvme_fc_local_por
        struct fcloop_rport *rport = remoteport->private;
        struct fcloop_ini_fcpreq *inireq = fcpreq->private;
        struct fcloop_fcpreq *tfcp_req;
-       int ret = 0;
  
        if (!rport->targetport)
                return -ECONNREFUSED;
  
        inireq->fcpreq = fcpreq;
        inireq->tfcp_req = tfcp_req;
-       INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
+       spin_lock_init(&inireq->inilock);
        tfcp_req->fcpreq = fcpreq;
        tfcp_req->tport = rport->targetport->private;
+       tfcp_req->inistate = INI_IO_START;
        spin_lock_init(&tfcp_req->reqlock);
-       INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+       INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
+       INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
+       INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
+       kref_init(&tfcp_req->ref);
  
-       ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
-                                fcpreq->cmdaddr, fcpreq->cmdlen);
+       schedule_work(&tfcp_req->fcp_rcv_work);
  
-       return ret;
+       return 0;
  }
  
  static void
@@@ -589,7 -707,7 +707,7 @@@ fcloop_fcp_req_release(struct nvmet_fc_
  {
        struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
  
-       schedule_work(&tfcp_req->work);
+       schedule_work(&tfcp_req->tio_done_work);
  }
  
  static void
@@@ -605,27 -723,47 +723,47 @@@ fcloop_fcp_abort(struct nvme_fc_local_p
                        void *hw_queue_handle,
                        struct nvmefc_fcp_req *fcpreq)
  {
-       struct fcloop_rport *rport = remoteport->private;
        struct fcloop_ini_fcpreq *inireq = fcpreq->private;
-       struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+       struct fcloop_fcpreq *tfcp_req;
+       bool abortio = true;
+       spin_lock(&inireq->inilock);
+       tfcp_req = inireq->tfcp_req;
+       if (tfcp_req)
+               fcloop_tfcp_req_get(tfcp_req);
+       spin_unlock(&inireq->inilock);
  
        if (!tfcp_req)
                /* abort has already been called */
                return;
  
-       if (rport->targetport)
-               nvmet_fc_rcv_fcp_abort(rport->targetport,
-                                       &tfcp_req->tgt_fcp_req);
        /* break initiator/target relationship for io */
        spin_lock(&tfcp_req->reqlock);
-       inireq->tfcp_req = NULL;
-       tfcp_req->fcpreq = NULL;
+       switch (tfcp_req->inistate) {
+       case INI_IO_START:
+       case INI_IO_ACTIVE:
+               tfcp_req->inistate = INI_IO_ABORTED;
+               break;
+       case INI_IO_COMPLETED:
+               abortio = false;
+               break;
+       default:
+               spin_unlock(&tfcp_req->reqlock);
+               WARN_ON(1);
+               return;
+       }
        spin_unlock(&tfcp_req->reqlock);
  
-       /* post the aborted io completion */
-       fcpreq->status = -ECANCELED;
-       schedule_work(&inireq->iniwork);
+       if (abortio)
+               /* leave the reference while the work item is scheduled */
+               WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+       else  {
+               /*
+                * as the io has already had the done callback made,
+                * nothing more to do. So release the reference taken above
+                */
+               fcloop_tfcp_req_put(tfcp_req);
+       }
  }
  
  static void
@@@ -657,7 -795,8 +795,8 @@@ fcloop_nport_get(struct fcloop_nport *n
  static void
  fcloop_localport_delete(struct nvme_fc_local_port *localport)
  {
-       struct fcloop_lport *lport = localport->private;
+       struct fcloop_lport_priv *lport_priv = localport->private;
+       struct fcloop_lport *lport = lport_priv->lport;
  
        /* release any threads waiting for the unreg to complete */
        complete(&lport->unreg_done);
@@@ -697,7 -836,7 +836,7 @@@ static struct nvme_fc_port_template fct
        .max_dif_sgl_segments   = FCLOOP_SGL_SEGS,
        .dma_boundary           = FCLOOP_DMABOUND_4G,
        /* sizes of additional private data for data structures */
-       .local_priv_sz          = sizeof(struct fcloop_lport),
+       .local_priv_sz          = sizeof(struct fcloop_lport_priv),
        .remote_priv_sz         = sizeof(struct fcloop_rport),
        .lsrqst_priv_sz         = sizeof(struct fcloop_lsreq),
        .fcprqst_priv_sz        = sizeof(struct fcloop_ini_fcpreq),
@@@ -714,8 -853,7 +853,7 @@@ static struct nvmet_fc_target_template 
        .max_dif_sgl_segments   = FCLOOP_SGL_SEGS,
        .dma_boundary           = FCLOOP_DMABOUND_4G,
        /* optional features */
-       .target_features        = NVMET_FCTGTFEAT_CMD_IN_ISR |
-                                 NVMET_FCTGTFEAT_OPDONE_IN_ISR,
+       .target_features        = 0,
        /* sizes of additional private data for data structures */
        .target_priv_sz         = sizeof(struct fcloop_tport),
  };
@@@ -728,11 -866,17 +866,17 @@@ fcloop_create_local_port(struct device 
        struct fcloop_ctrl_options *opts;
        struct nvme_fc_local_port *localport;
        struct fcloop_lport *lport;
-       int ret;
+       struct fcloop_lport_priv *lport_priv;
+       unsigned long flags;
+       int ret = -ENOMEM;
+       lport = kzalloc(sizeof(*lport), GFP_KERNEL);
+       if (!lport)
+               return -ENOMEM;
  
        opts = kzalloc(sizeof(*opts), GFP_KERNEL);
        if (!opts)
-               return -ENOMEM;
+               goto out_free_lport;
  
        ret = fcloop_parse_options(opts, buf);
        if (ret)
  
        ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
        if (!ret) {
-               unsigned long flags;
                /* success */
-               lport = localport->private;
+               lport_priv = localport->private;
+               lport_priv->lport = lport;
                lport->localport = localport;
                INIT_LIST_HEAD(&lport->lport_list);
  
                spin_lock_irqsave(&fcloop_lock, flags);
                list_add_tail(&lport->lport_list, &fcloop_lports);
                spin_unlock_irqrestore(&fcloop_lock, flags);
-               /* mark all of the input buffer consumed */
-               ret = count;
        }
  
  out_free_opts:
        kfree(opts);
+ out_free_lport:
+       /* free only if we're going to fail */
+       if (ret)
+               kfree(lport);
        return ret ? ret : count;
  }
  
@@@ -790,6 -936,8 +936,8 @@@ __wait_localport_unreg(struct fcloop_lp
  
        wait_for_completion(&lport->unreg_done);
  
+       kfree(lport);
        return ret;
  }
  
@@@ -1085,7 -1233,7 +1233,7 @@@ fcloop_delete_target_port(struct devic
                const char *buf, size_t count)
  {
        struct fcloop_nport *nport = NULL, *tmpport;
 -      struct fcloop_tport *tport;
 +      struct fcloop_tport *tport = NULL;
        u64 nodename, portname;
        unsigned long flags;
        int ret;
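
The fcloop rework above gives each target-side request a kref and an explicit initiator state (INI_IO_START/ACTIVE/ABORTED/COMPLETED), so the abort path can take an extra reference before scheduling its work item and drop it when that work finishes. A generic sketch of that kref idiom, separate from the fcloop specifics; all io_req_sketch names below are illustrative:

#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Generic request with a reference count and an async abort handler. */
struct io_req_sketch {
        struct kref ref;
        struct work_struct abort_work;
};

static void io_req_free(struct kref *ref)
{
        kfree(container_of(ref, struct io_req_sketch, ref));
}

static void abort_work_fn(struct work_struct *work)
{
        struct io_req_sketch *req =
                container_of(work, struct io_req_sketch, abort_work);

        /* ... deliver the abort ... */
        kref_put(&req->ref, io_req_free);       /* drop the work's reference */
}

static struct io_req_sketch *io_req_alloc(void)
{
        struct io_req_sketch *req = kzalloc(sizeof(*req), GFP_KERNEL);

        if (!req)
                return NULL;
        kref_init(&req->ref);                   /* original I/O reference */
        INIT_WORK(&req->abort_work, abort_work_fn);
        return req;
}

static void start_abort(struct io_req_sketch *req)
{
        if (!kref_get_unless_zero(&req->ref))   /* request already freed */
                return;
        schedule_work(&req->abort_work);        /* reference travels with the work */
}
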
diff --combined include/linux/bio.h
index 23d29b39f71e83e8a6a25540adc2e3f28702aec7,367a979fd4a6f250c6778b0dfe29b6b6450ddc6d..d0eb659fa733eb91b57a135932f45b1eea8d9975
@@@ -300,6 -300,29 +300,29 @@@ static inline void bio_get_last_bvec(st
                bv->bv_len = iter.bi_bvec_done;
  }
  
+ static inline unsigned bio_pages_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return bio->bi_vcnt;
+ }
+ static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return bio->bi_io_vec;
+ }
+ static inline struct page *bio_first_page_all(struct bio *bio)
+ {
+       return bio_first_bvec_all(bio)->bv_page;
+ }
+ static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
+ {
+       WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+       return &bio->bi_io_vec[bio->bi_vcnt - 1];
+ }
  enum bip_flags {
        BIP_BLOCK_INTEGRITY     = 1 << 0, /* block layer owns integrity data */
        BIP_MAPPED_INTEGRITY    = 1 << 1, /* ref tag has been remapped */
@@@ -477,7 -500,6 +500,6 @@@ static inline void bio_flush_dcache_pag
  #endif
  
  extern void bio_copy_data(struct bio *dst, struct bio *src);
- extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
  extern void bio_free_pages(struct bio *bio);
  
  extern struct bio *bio_copy_user_iov(struct request_queue *,
@@@ -492,8 -514,6 +514,8 @@@ extern unsigned int bvec_nr_vecs(unsign
  
  #define bio_set_dev(bio, bdev)                        \
  do {                                          \
 +      if ((bio)->bi_disk != (bdev)->bd_disk)  \
 +              bio_clear_flag(bio, BIO_THROTTLED);\
        (bio)->bi_disk = (bdev)->bd_disk;       \
        (bio)->bi_partno = (bdev)->bd_partno;   \
  } while (0)
diff --combined include/linux/blk_types.h
index 9e7d8bd776d227d2ba92b137af7230300f5b1d4a,2d973ac54b09f084c35e499c15302fd3066f4823..c5d3db0d83f8ac1adf177f0f92c7bf3ed0e2c261
@@@ -39,6 -39,34 +39,34 @@@ typedef u8 __bitwise blk_status_t
  
  #define BLK_STS_AGAIN         ((__force blk_status_t)12)
  
+ /**
+  * blk_path_error - returns true if error may be path related
+  * @error: status the request was completed with
+  *
+  * Description:
+  *     This classifies block error status into non-retryable errors and ones
+  *     that may be successful if retried on a failover path.
+  *
+  * Return:
+  *     %false - retrying failover path will not help
+  *     %true  - may succeed if retried
+  */
+ static inline bool blk_path_error(blk_status_t error)
+ {
+       switch (error) {
+       case BLK_STS_NOTSUPP:
+       case BLK_STS_NOSPC:
+       case BLK_STS_TARGET:
+       case BLK_STS_NEXUS:
+       case BLK_STS_MEDIUM:
+       case BLK_STS_PROTECTION:
+               return false;
+       }
+       /* Anything else could be a path failure, so should be retried */
+       return true;
+ }
  struct blk_issue_stat {
        u64 stat;
  };
@@@ -50,6 -78,8 +78,6 @@@
  struct bio {
        struct bio              *bi_next;       /* request queue link */
        struct gendisk          *bi_disk;
 -      u8                      bi_partno;
 -      blk_status_t            bi_status;
        unsigned int            bi_opf;         /* bottom bits req flags,
                                                 * top bits REQ_OP. Use
                                                 * accessors.
@@@ -57,8 -87,8 +85,8 @@@
        unsigned short          bi_flags;       /* status, etc and bvec pool number */
        unsigned short          bi_ioprio;
        unsigned short          bi_write_hint;
 -
 -      struct bvec_iter        bi_iter;
 +      blk_status_t            bi_status;
 +      u8                      bi_partno;
  
        /* Number of segments in this BIO after
         * physical address coalescing is performed.
        unsigned int            bi_seg_front_size;
        unsigned int            bi_seg_back_size;
  
 -      atomic_t                __bi_remaining;
 +      struct bvec_iter        bi_iter;
  
 +      atomic_t                __bi_remaining;
        bio_end_io_t            *bi_end_io;
  
        void                    *bi_private;
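
blk_path_error() above gives multipath consumers one predicate for "could this status succeed on another path". A hedged sketch of how a failover check might use it together with the blk_status_t now passed to nvme_req_needs_failover() in the nvme.h hunk; the REQ_NVME_MPATH flag follows the nvme multipath code but is not shown in this diff, and the _sketch body is illustrative rather than the core.c implementation:

/* Illustrative failover predicate, not the actual nvme core body. */
static bool needs_failover_sketch(struct request *req, blk_status_t error)
{
        if (!(req->cmd_flags & REQ_NVME_MPATH))         /* only multipath requests */
                return false;
        return blk_path_error(error);                   /* retryable on another path? */
}
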
diff --combined include/linux/blkdev.h
index 0ce8a372d5069a7aca7810429a968d20e923d3d1,afc43fb63c1604009f9539fd81ae55502a7a4a16..4f3df807cf8f73076ca6e735b901b14360528aa6
@@@ -27,6 -27,8 +27,8 @@@
  #include <linux/percpu-refcount.h>
  #include <linux/scatterlist.h>
  #include <linux/blkzoned.h>
+ #include <linux/seqlock.h>
+ #include <linux/u64_stats_sync.h>
  
  struct module;
  struct scsi_ioctl_command;
@@@ -121,6 -123,12 +123,12 @@@ typedef __u32 __bitwise req_flags_t
  /* Look at ->special_vec for the actual data payload instead of the
     bio chain. */
  #define RQF_SPECIAL_PAYLOAD   ((__force req_flags_t)(1 << 18))
+ /* The per-zone write lock is held for this request */
+ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+ /* timeout is expired */
+ #define RQF_MQ_TIMEOUT_EXPIRED        ((__force req_flags_t)(1 << 20))
+ /* already slept for hybrid poll */
+ #define RQF_MQ_POLL_SLEPT     ((__force req_flags_t)(1 << 21))
  
  /* flags that prevent us from merging requests: */
  #define RQF_NOMERGE_FLAGS \
   * especially blk_mq_rq_ctx_init() to take care of the added fields.
   */
  struct request {
-       struct list_head queuelist;
-       union {
-               struct __call_single_data csd;
-               u64 fifo_time;
-       };
        struct request_queue *q;
        struct blk_mq_ctx *mq_ctx;
  
  
        int internal_tag;
  
-       unsigned long atomic_flags;
        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
        int tag;
        struct bio *bio;
        struct bio *biotail;
  
+       struct list_head queuelist;
        /*
         * The hash is used inside the scheduler, and killed once the
         * request reaches the dispatch list. The ipi_list is only used
        struct hd_struct *part;
        unsigned long start_time;
        struct blk_issue_stat issue_stat;
- #ifdef CONFIG_BLK_CGROUP
-       struct request_list *rl;                /* rl this rq is alloced from */
-       unsigned long long start_time_ns;
-       unsigned long long io_start_time_ns;    /* when passed to hardware */
- #endif
        /* Number of scatter-gather DMA addr+len pairs after
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
  #if defined(CONFIG_BLK_DEV_INTEGRITY)
        unsigned short nr_integrity_segments;
  #endif
  
+       unsigned short write_hint;
        unsigned short ioprio;
  
        unsigned int timeout;
  
        unsigned int extra_len; /* length of alignment and padding */
  
-       unsigned short write_hint;
+       /*
+        * On blk-mq, the lower bits of ->gstate (generation number and
+        * state) carry the MQ_RQ_* state value and the upper bits the
+        * generation number which is monotonically incremented and used to
+        * distinguish the reuse instances.
+        *
+        * ->gstate_seq allows updates to ->gstate and other fields
+        * (currently ->deadline) during request start to be read
+        * atomically from the timeout path, so that it can operate on a
+        * coherent set of information.
+        */
+       seqcount_t gstate_seq;
+       u64 gstate;
+       /*
+        * ->aborted_gstate is used by the timeout to claim a specific
+        * recycle instance of this request.  See blk_mq_timeout_work().
+        */
+       struct u64_stats_sync aborted_gstate_sync;
+       u64 aborted_gstate;
+       /* access through blk_rq_set_deadline, blk_rq_deadline */
+       unsigned long __deadline;
  
-       unsigned long deadline;
        struct list_head timeout_list;
  
 -              call_single_data_t csd;
+       union {
++              struct __call_single_data csd;
+               u64 fifo_time;
+       };
        /*
         * completion callback.
         */
  
        /* for bidi */
        struct request *next_rq;
+ #ifdef CONFIG_BLK_CGROUP
+       struct request_list *rl;                /* rl this rq is alloced from */
+       unsigned long long start_time_ns;
+       unsigned long long io_start_time_ns;    /* when passed to hardware */
+ #endif
  };
  
 +static inline bool blk_op_is_scsi(unsigned int op)
 +{
 +      return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
 +}
 +
 +static inline bool blk_op_is_private(unsigned int op)
 +{
 +      return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
 +}
 +
  static inline bool blk_rq_is_scsi(struct request *rq)
  {
 -      return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
 +      return blk_op_is_scsi(req_op(rq));
  }
  
  static inline bool blk_rq_is_private(struct request *rq)
  {
 -      return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
 +      return blk_op_is_private(req_op(rq));
  }
  
  static inline bool blk_rq_is_passthrough(struct request *rq)
        return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
  }
  
 +static inline bool bio_is_passthrough(struct bio *bio)
 +{
 +      unsigned op = bio_op(bio);
 +
 +      return blk_op_is_scsi(op) || blk_op_is_private(op);
 +}
 +
  static inline unsigned short req_get_ioprio(struct request *req)
  {
        return req->ioprio;
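
The new ->gstate, ->gstate_seq and ->aborted_gstate fields above carry the reworked blk-mq timeout state: the start path updates generation/state and deadline under ->gstate_seq, the timeout path reads them as one coherent snapshot, and the completion path compares against ->aborted_gstate (protected by ->aborted_gstate_sync) to decide whether the timeout handler has claimed this recycle instance. A rough sketch of the two read-side idioms, with illustrative _sketch names; the deadline read is simplified here, the struct comment says to go through blk_rq_deadline():

/* Read-side idioms for the seqcount- and u64_stats-protected fields above. */
static void timeout_snapshot_sketch(struct request *rq, u64 *gstate,
                                    unsigned long *deadline)
{
        unsigned int start;

        do {
                start = read_seqcount_begin(&rq->gstate_seq);
                *gstate = rq->gstate;
                *deadline = rq->__deadline;     /* simplified; see blk_rq_deadline() */
        } while (read_seqcount_retry(&rq->gstate_seq, start));
}

static u64 aborted_gstate_sketch(struct request *rq)
{
        unsigned int start;
        u64 aborted;

        do {
                start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
                aborted = rq->aborted_gstate;
        } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));

        return aborted;
}
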
@@@ -563,6 -577,22 +594,22 @@@ struct request_queue 
  
        struct queue_limits     limits;
  
+       /*
+        * Zoned block device information for request dispatch control.
+        * nr_zones is the total number of zones of the device. This is always
+        * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+        * bits which indicates if a zone is conventional (bit clear) or
+        * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+        * bits which indicates if a zone is write locked, that is, if a write
+        * request targeting the zone was dispatched. All three fields are
+        * initialized by the low level device driver (e.g. scsi/sd.c).
+        * Stacking drivers (device mappers) may or may not initialize
+        * these fields.
+        */
+       unsigned int            nr_zones;
+       unsigned long           *seq_zones_bitmap;
+       unsigned long           *seq_zones_wlock;
        /*
         * sg stuff
         */
@@@ -807,6 -837,27 +854,27 @@@ static inline unsigned int blk_queue_zo
        return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
  }
  
+ static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+ {
+       return q->nr_zones;
+ }
+ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+                                            sector_t sector)
+ {
+       if (!blk_queue_is_zoned(q))
+               return 0;
+       return sector >> ilog2(q->limits.chunk_sectors);
+ }
+ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+                                        sector_t sector)
+ {
+       if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+               return false;
+       return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+ }
  static inline bool rq_is_sync(struct request *rq)
  {
        return op_is_sync(rq->cmd_flags);
@@@ -965,7 -1016,7 +1033,7 @@@ extern int blk_rq_prep_clone(struct req
  extern void blk_rq_unprep_clone(struct request *rq);
  extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
                                     struct request *rq);
 -extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
 +extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
  extern void blk_delay_queue(struct request_queue *, unsigned long);
  extern void blk_queue_split(struct request_queue *, struct bio **);
  extern void blk_recount_segments(struct request_queue *, struct bio *);
@@@ -1046,6 -1097,16 +1114,16 @@@ static inline unsigned int blk_rq_cur_s
        return blk_rq_cur_bytes(rq) >> 9;
  }
  
+ static inline unsigned int blk_rq_zone_no(struct request *rq)
+ {
+       return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+ }
+ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+ {
+       return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+ }
  /*
   * Some commands like WRITE SAME have a payload or data transfer size which
   * is different from the size of the request.  Any driver that supports such
@@@ -1595,7 -1656,15 +1673,15 @@@ static inline unsigned int bdev_zone_se
  
        if (q)
                return blk_queue_zone_sectors(q);
+       return 0;
+ }
+ static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+ {
+       struct request_queue *q = bdev_get_queue(bdev);
  
+       if (q)
+               return blk_queue_nr_zones(q);
        return 0;
  }
  
@@@ -1731,8 -1800,6 +1817,6 @@@ static inline bool req_gap_front_merge(
  
  int kblockd_schedule_work(struct work_struct *work);
  int kblockd_schedule_work_on(int cpu, struct work_struct *work);
- int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
  int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
  
  #ifdef CONFIG_BLK_CGROUP
@@@ -1971,6 -2038,60 +2055,60 @@@ extern int __blkdev_driver_ioctl(struc
  extern int bdev_read_page(struct block_device *, sector_t, struct page *);
  extern int bdev_write_page(struct block_device *, sector_t, struct page *,
                                                struct writeback_control *);
+ #ifdef CONFIG_BLK_DEV_ZONED
+ bool blk_req_needs_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_unlock(struct request *rq);
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+       if (blk_req_needs_zone_write_lock(rq))
+               __blk_req_zone_write_lock(rq);
+ }
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+       if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+               __blk_req_zone_write_unlock(rq);
+ }
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+       return rq->q->seq_zones_wlock &&
+               test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+ }
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+       if (!blk_req_needs_zone_write_lock(rq))
+               return true;
+       return !blk_req_zone_is_write_locked(rq);
+ }
+ #else
+ static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+ {
+       return false;
+ }
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+ }
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+ }
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+       return false;
+ }
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+       return true;
+ }
+ #endif /* CONFIG_BLK_DEV_ZONED */
  #else /* CONFIG_BLOCK */
  
  struct block_device;
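
The CONFIG_BLK_DEV_ZONED helpers above let an I/O scheduler serialize writes per sequential zone: a request whose zone is already write-locked is held back, and the lock is taken when the request is actually dispatched and released on completion or requeue. A hedged sketch of how a dispatch path might use them; the _sketch functions are illustrative, not the scheduler changes themselves:

/* Illustrative zone write-lock usage around dispatch and completion. */
static struct request *dispatch_sketch(struct request *rq)
{
        if (!blk_req_can_dispatch_to_zone(rq))  /* zone write-locked: hold back */
                return NULL;
        blk_req_zone_write_lock(rq);            /* no-op unless a zoned write */
        return rq;
}

static void finish_sketch(struct request *rq)
{
        blk_req_zone_write_unlock(rq);          /* release at completion/requeue */
}
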
diff --combined kernel/power/swap.c
index a46be1261c095e74fa49f363b917ae7996a29cbd,96c736313faab8e536257fc191a800753166a7c2..11b4282c2d2031d8208d53917f78d3aec5323bf0
@@@ -240,7 -240,7 +240,7 @@@ static void hib_init_batch(struct hib_b
  static void hib_end_io(struct bio *bio)
  {
        struct hib_bio_batch *hb = bio->bi_private;
-       struct page *page = bio->bi_io_vec[0].bv_page;
+       struct page *page = bio_first_page_all(bio);
  
        if (bio->bi_status) {
                pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
@@@ -879,7 -879,7 +879,7 @@@ out_clean
   *    space avaiable from the resume partition.
   */
  
 -static int enough_swap(unsigned int nr_pages, unsigned int flags)
 +static int enough_swap(unsigned int nr_pages)
  {
        unsigned int free_swap = count_swap_pages(root_swap, 1);
        unsigned int required;
@@@ -915,7 -915,7 +915,7 @@@ int swsusp_write(unsigned int flags
                return error;
        }
        if (flags & SF_NOCOMPRESS_MODE) {
 -              if (!enough_swap(pages, flags)) {
 +              if (!enough_swap(pages)) {
                        pr_err("Not enough free swap\n");
                        error = -ENOSPC;
                        goto out_finish;