bio->bi_disk = bio_src->bi_disk;
bio->bi_partno = bio_src->bi_partno;
bio_set_flag(bio, BIO_CLONED);
+ if (bio_flagged(bio_src, BIO_THROTTLED))
+ bio_set_flag(bio, BIO_THROTTLED);
bio->bi_opf = bio_src->bi_opf;
bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
}
EXPORT_SYMBOL(bio_advance);
- /**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
- int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
- {
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
- return -ENOMEM;
- }
- }
-
- return 0;
- }
- EXPORT_SYMBOL(bio_alloc_pages);
-
/**
* bio_copy_data - copy contents of data buffers from one chain of bios to
* another
bio_advance(bio, split->bi_iter.bi_size);
if (bio_flagged(bio, BIO_TRACE_COMPLETION))
- bio_set_flag(bio, BIO_TRACE_COMPLETION);
+ bio_set_flag(split, BIO_TRACE_COMPLETION);
return split;
}
rq->start_time = jiffies;
set_start_time_ns(rq);
rq->part = NULL;
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
}
EXPORT_SYMBOL(blk_rq_init);
}
}
+void blk_drain_queue(struct request_queue *q)
+{
+ spin_lock_irq(q->queue_lock);
+ __blk_drain_queue(q, true);
+ spin_unlock_irq(q->queue_lock);
+}
+
/**
* blk_queue_bypass_start - enter queue bypass mode
* @q: queue of interest
*/
blk_freeze_queue(q);
spin_lock_irq(lock);
- if (!q->mq_ops)
- __blk_drain_queue(q, true);
queue_flag_set(QUEUE_FLAG_DEAD, q);
spin_unlock_irq(lock);
+ /*
+ * make sure all in-progress dispatch are completed because
+ * blk_freeze_queue() can only complete all requests, and
+ * dispatch may still be in-progress since we dispatch requests
+ * from more than one contexts
+ */
+ if (q->mq_ops)
+ blk_mq_quiesce_queue(q);
+
/* for synchronous bio-based driver finish in-flight integrity i/o */
blk_flush_integrity();
lockdep_assert_held(q->queue_lock);
+ blk_req_zone_write_unlock(req);
blk_pm_put_request(req);
elv_completed_request(q, req);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
+ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+ {
+ if (part->policy && op_is_write(bio_op(bio))) {
+ char b[BDEVNAME_SIZE];
+
+ printk(KERN_ERR
+ "generic_make_request: Trying to write "
+ "to read-only block-device %s (partno %d)\n",
+ bio_devname(bio, b), part->partno);
+ return true;
+ }
+
+ return false;
+ }
+
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
struct hd_struct *p;
int ret = 0;
+ rcu_read_lock();
+ p = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (unlikely(!p || should_fail_request(p, bio->bi_iter.bi_size) ||
+ bio_check_ro(bio, p))) {
+ ret = -EIO;
+ goto out;
+ }
+
/*
* Zone reset does not include bi_size so bio_sectors() is always 0.
* Include a test for the reset op code and perform the remap if needed.
*/
- if (!bio->bi_partno ||
- (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET))
- return 0;
+ if (!bio_sectors(bio) && bio_op(bio) != REQ_OP_ZONE_RESET)
+ goto out;
- rcu_read_lock();
- p = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (likely(p && !should_fail_request(p, bio->bi_iter.bi_size))) {
- bio->bi_iter.bi_sector += p->start_sect;
- bio->bi_partno = 0;
- trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
- bio->bi_iter.bi_sector - p->start_sect);
- } else {
- printk("%s: fail for partition %d\n", __func__, bio->bi_partno);
- ret = -EIO;
- }
- rcu_read_unlock();
+ bio->bi_iter.bi_sector += p->start_sect;
+ bio->bi_partno = 0;
+ trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
+ bio->bi_iter.bi_sector - p->start_sect);
+ out:
+ rcu_read_unlock();
return ret;
}
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue is not a request based queue.
*/
-
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
goto end_io;
- if (blk_partition_remap(bio))
- goto end_io;
+ if (!bio->bi_partno) {
+ if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ goto end_io;
+ } else {
+ if (blk_partition_remap(bio))
+ goto end_io;
+ }
if (bio_check_eod(bio, nr_sectors))
goto end_io;
* bypass a potential scheduler on the bottom device for
* insert.
*/
- blk_mq_request_bypass_insert(rq, true);
- return BLK_STS_OK;
+ return blk_mq_request_issue_directly(rq);
}
spin_lock_irqsave(q->queue_lock, flags);
wbt_issue(req->q->rq_wb, &req->issue_stat);
}
- BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
+ BUG_ON(blk_rq_is_complete(req));
blk_add_timer(req);
}
EXPORT_SYMBOL(blk_start_request);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
- int kblockd_schedule_delayed_work(struct delayed_work *dwork,
- unsigned long delay)
- {
- return queue_delayed_work(kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work);
-
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
- unsigned long delay)
- {
- return queue_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
- }
- EXPORT_SYMBOL(kblockd_schedule_delayed_work_on);
-
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
#include "blk.h"
/*
- * Append a bio to a passthrough request. Only works can be merged into
- * the request based on the driver constraints.
+ * Append a bio to a passthrough request. Only works if the bio can be merged
+ * into the request based on the driver constraints.
*/
-int blk_rq_append_bio(struct request *rq, struct bio *bio)
+int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
- blk_queue_bounce(rq->q, &bio);
+ struct bio *orig_bio = *bio;
+
+ blk_queue_bounce(rq->q, bio);
if (!rq->bio) {
- blk_rq_bio_prep(rq->q, rq, bio);
+ blk_rq_bio_prep(rq->q, rq, *bio);
} else {
- if (!ll_back_merge_fn(rq->q, rq, bio))
+ if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+ if (orig_bio != *bio) {
+ bio_put(*bio);
+ *bio = orig_bio;
+ }
return -EINVAL;
+ }
- rq->biotail->bi_next = bio;
- rq->biotail = bio;
- rq->__data_len += bio->bi_iter.bi_size;
+ rq->biotail->bi_next = *bio;
+ rq->biotail = *bio;
+ rq->__data_len += (*bio)->bi_iter.bi_size;
}
return 0;
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
- ret = blk_rq_append_bio(rq, bio);
- bio_get(bio);
+ ret = blk_rq_append_bio(rq, &bio);
if (ret) {
- bio_endio(bio);
__blk_rq_unmap_user(orig_bio);
- bio_put(bio);
return ret;
}
+ bio_get(bio);
return 0;
}
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
- int ret;
+ int ret = -EINVAL;
if (!iter_is_iovec(iter))
goto fail;
__blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
- return -EINVAL;
+ return ret;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
int do_copy = 0;
- struct bio *bio;
+ struct bio *bio, *orig_bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
if (do_copy)
rq->rq_flags |= RQF_COPY_USER;
- ret = blk_rq_append_bio(rq, bio);
+ orig_bio = bio;
+ ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
/* request is too big */
- bio_put(bio);
+ bio_put(orig_bio);
return ret;
}
{
struct mq_inflight *mi = priv;
- if (test_bit(REQ_ATOM_STARTED, &rq->atomic_flags) &&
- !test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
+ if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) {
/*
* index[0] counts the specific partition that was asked
* for. index[1] counts the ones that are active on the
* exported to drivers as the only user for unfreeze is blk_mq.
*/
blk_freeze_queue_start(q);
+ if (!q->mq_ops)
+ blk_drain_queue(q);
blk_mq_freeze_queue_wait(q);
}
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->queue_rq_srcu);
+ synchronize_srcu(hctx->srcu);
else
rcu = true;
}
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
-
- rq->rq_flags = 0;
+ req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->tag = -1;
rq->internal_tag = tag;
} else {
if (blk_mq_tag_busy(data->hctx)) {
- rq->rq_flags = RQF_MQ_INFLIGHT;
+ rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
data->hctx->tags->rqs[rq->tag] = rq;
}
- INIT_LIST_HEAD(&rq->queuelist);
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
+ rq->rq_flags = rq_flags;
+ rq->cpu = -1;
rq->cmd_flags = op;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
rq->rq_flags |= RQF_IO_STAT;
- /* do not touch atomic flags, it needs atomic ops against the timer */
- rq->cpu = -1;
+ INIT_LIST_HEAD(&rq->queuelist);
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
rq->start_time = jiffies;
- #ifdef CONFIG_BLK_CGROUP
- rq->rl = NULL;
- set_start_time_ns(rq);
- rq->io_start_time_ns = 0;
- #endif
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
rq->special = NULL;
/* tag was already set */
rq->extra_len = 0;
+ rq->__deadline = 0;
INIT_LIST_HEAD(&rq->timeout_list);
rq->timeout = 0;
rq->end_io_data = NULL;
rq->next_rq = NULL;
+ #ifdef CONFIG_BLK_CGROUP
+ rq->rl = NULL;
+ set_start_time_ns(rq);
+ rq->io_start_time_ns = 0;
+ #endif
+
data->ctx->rq_dispatched[op_is_sync(op)]++;
return rq;
}
blk_queue_exit(q);
return ERR_PTR(-EXDEV);
}
- cpu = cpumask_first(alloc_data.hctx->cpumask);
+ cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
rq = blk_mq_get_request(q, NULL, op, &alloc_data);
if (blk_rq_rl(rq))
blk_put_rl(blk_rq_rl(rq));
- clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (rq->tag != -1)
blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
if (sched_tag != -1)
bool shared = false;
int cpu;
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
+ blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+
if (rq->internal_tag != -1)
blk_mq_sched_completed_request(rq);
if (rq->rq_flags & RQF_STATS) {
put_cpu();
}
+ static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+ __releases(hctx->srcu)
+ {
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ rcu_read_unlock();
+ else
+ srcu_read_unlock(hctx->srcu, srcu_idx);
+ }
+
+ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+ __acquires(hctx->srcu)
+ {
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+ /* shut up gcc false positive */
+ *srcu_idx = 0;
+ rcu_read_lock();
+ } else
+ *srcu_idx = srcu_read_lock(hctx->srcu);
+ }
+
+ static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
+ {
+ unsigned long flags;
+
+ /*
+ * blk_mq_rq_aborted_gstate() is used from the completion path and
+ * can thus be called from irq context. u64_stats_fetch in the
+ * middle of update on the same CPU leads to lockup. Disable irq
+ * while updating.
+ */
+ local_irq_save(flags);
+ u64_stats_update_begin(&rq->aborted_gstate_sync);
+ rq->aborted_gstate = gstate;
+ u64_stats_update_end(&rq->aborted_gstate_sync);
+ local_irq_restore(flags);
+ }
+
+ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
+ {
+ unsigned int start;
+ u64 aborted_gstate;
+
+ do {
+ start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
+ aborted_gstate = rq->aborted_gstate;
+ } while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
+
+ return aborted_gstate;
+ }
+
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
void blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+ int srcu_idx;
if (unlikely(blk_should_fake_timeout(q)))
return;
- if (!blk_mark_rq_complete(rq))
+
+ /*
+ * If @rq->aborted_gstate equals the current instance, timeout is
+ * claiming @rq and we lost. This is synchronized through
+ * hctx_lock(). See blk_mq_timeout_work() for details.
+ *
+ * Completion path never blocks and we can directly use RCU here
+ * instead of hctx_lock() which can be either RCU or SRCU.
+ * However, that would complicate paths which want to synchronize
+ * against us. Let stay in sync with the issue path so that
+ * hctx_lock() covers both issue and completion paths.
+ */
+ hctx_lock(hctx, &srcu_idx);
+ if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
__blk_mq_complete_request(rq);
+ hctx_unlock(hctx, srcu_idx);
}
EXPORT_SYMBOL(blk_mq_complete_request);
int blk_mq_request_started(struct request *rq)
{
- return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
+ return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);
wbt_issue(q->rq_wb, &rq->issue_stat);
}
- blk_add_timer(rq);
-
- WARN_ON_ONCE(test_bit(REQ_ATOM_STARTED, &rq->atomic_flags));
+ WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
/*
- * Mark us as started and clear complete. Complete might have been
- * set if requeue raced with timeout, which then marked it as
- * complete. So be sure to clear complete again when we start
- * the request, otherwise we'll ignore the completion event.
+ * Mark @rq in-flight which also advances the generation number,
+ * and register for timeout. Protect with a seqcount to allow the
+ * timeout path to read both @rq->gstate and @rq->deadline
+ * coherently.
*
- * Ensure that ->deadline is visible before we set STARTED, such that
- * blk_mq_check_expired() is guaranteed to observe our ->deadline when
- * it observes STARTED.
+ * This is the only place where a request is marked in-flight. If
+ * the timeout path reads an in-flight @rq->gstate, the
+ * @rq->deadline it reads together under @rq->gstate_seq is
+ * guaranteed to be the matching one.
*/
- smp_wmb();
- set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) {
- /*
- * Coherence order guarantees these consecutive stores to a
- * single variable propagate in the specified order. Thus the
- * clear_bit() is ordered _after_ the set bit. See
- * blk_mq_check_expired().
- *
- * (the bits must be part of the same byte for this to be
- * true).
- */
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
- }
+ preempt_disable();
+ write_seqcount_begin(&rq->gstate_seq);
+
+ blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
+ blk_add_timer(rq);
+
+ write_seqcount_end(&rq->gstate_seq);
+ preempt_enable();
if (q->dma_drain_size && blk_rq_bytes(rq)) {
/*
EXPORT_SYMBOL(blk_mq_start_request);
/*
- * When we reach here because queue is busy, REQ_ATOM_COMPLETE
- * flag isn't set yet, so there may be race with timeout handler,
- * but given rq->deadline is just set in .queue_rq() under
- * this situation, the race won't be possible in reality because
- * rq->timeout should be set as big enough to cover the window
- * between blk_mq_start_request() called from .queue_rq() and
- * clearing REQ_ATOM_STARTED here.
+ * When we reach here because queue is busy, it's safe to change the state
+ * to IDLE without checking @rq->aborted_gstate because we should still be
+ * holding the RCU read lock and thus protected against timeout.
*/
static void __blk_mq_requeue_request(struct request *rq)
{
wbt_requeue(q->rq_wb, &rq->issue_stat);
blk_mq_sched_requeue_request(rq);
- if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
+ if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
+ blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
if (q->dma_drain_size && blk_rq_bytes(rq))
rq->nr_phys_segments--;
}
rq->rq_flags &= ~RQF_SOFTBARRIER;
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, true, false, false, true);
+ blk_mq_sched_insert_request(rq, true, false, false);
}
while (!list_empty(&rq_list)) {
rq = list_entry(rq_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false, true);
+ blk_mq_sched_insert_request(rq, false, false, false);
}
blk_mq_run_hw_queues(q, false);
void blk_mq_kick_requeue_list(struct request_queue *q)
{
- kblockd_schedule_delayed_work(&q->requeue_work, 0);
+ kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);
struct blk_mq_timeout_data {
unsigned long next;
unsigned int next_set;
+ unsigned int nr_expired;
};
- void blk_mq_rq_timed_out(struct request *req, bool reserved)
+ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
{
const struct blk_mq_ops *ops = req->q->mq_ops;
enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
- /*
- * We know that complete is set at this point. If STARTED isn't set
- * anymore, then the request isn't active and the "timeout" should
- * just be ignored. This can happen due to the bitflag ordering.
- * Timeout first checks if STARTED is set, and if it is, assumes
- * the request is active. But if we race with completion, then
- * both flags will get cleared. So check here again, and ignore
- * a timeout event with a request that isn't active.
- */
- if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
- return;
+ req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
if (ops->timeout)
ret = ops->timeout(req, reserved);
__blk_mq_complete_request(req);
break;
case BLK_EH_RESET_TIMER:
+ /*
+ * As nothing prevents from completion happening while
+ * ->aborted_gstate is set, this may lead to ignored
+ * completions and further spurious timeouts.
+ */
+ blk_mq_rq_update_aborted_gstate(req, 0);
blk_add_timer(req);
- blk_clear_rq_complete(req);
break;
case BLK_EH_NOT_HANDLED:
break;
struct request *rq, void *priv, bool reserved)
{
struct blk_mq_timeout_data *data = priv;
- unsigned long deadline;
+ unsigned long gstate, deadline;
+ int start;
- if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
- return;
+ might_sleep();
- /*
- * Ensures that if we see STARTED we must also see our
- * up-to-date deadline, see blk_mq_start_request().
- */
- smp_rmb();
+ if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
+ return;
- deadline = READ_ONCE(rq->deadline);
+ /* read coherent snapshots of @rq->state_gen and @rq->deadline */
+ while (true) {
+ start = read_seqcount_begin(&rq->gstate_seq);
+ gstate = READ_ONCE(rq->gstate);
+ deadline = blk_rq_deadline(rq);
+ if (!read_seqcount_retry(&rq->gstate_seq, start))
+ break;
+ cond_resched();
+ }
- /*
- * The rq being checked may have been freed and reallocated
- * out already here, we avoid this race by checking rq->deadline
- * and REQ_ATOM_COMPLETE flag together:
- *
- * - if rq->deadline is observed as new value because of
- * reusing, the rq won't be timed out because of timing.
- * - if rq->deadline is observed as previous value,
- * REQ_ATOM_COMPLETE flag won't be cleared in reuse path
- * because we put a barrier between setting rq->deadline
- * and clearing the flag in blk_mq_start_request(), so
- * this rq won't be timed out too.
- */
- if (time_after_eq(jiffies, deadline)) {
- if (!blk_mark_rq_complete(rq)) {
- /*
- * Again coherence order ensures that consecutive reads
- * from the same variable must be in that order. This
- * ensures that if we see COMPLETE clear, we must then
- * see STARTED set and we'll ignore this timeout.
- *
- * (There's also the MB implied by the test_and_clear())
- */
- blk_mq_rq_timed_out(rq, reserved);
- }
+ /* if in-flight && overdue, mark for abortion */
+ if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
+ time_after_eq(jiffies, deadline)) {
+ blk_mq_rq_update_aborted_gstate(rq, gstate);
+ data->nr_expired++;
+ hctx->nr_expired++;
} else if (!data->next_set || time_after(data->next, deadline)) {
data->next = deadline;
data->next_set = 1;
}
}
+ static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+ struct request *rq, void *priv, bool reserved)
+ {
+ /*
+ * We marked @rq->aborted_gstate and waited for RCU. If there were
+ * completions that we lost to, they would have finished and
+ * updated @rq->gstate by now; otherwise, the completion path is
+ * now guaranteed to see @rq->aborted_gstate and yield. If
+ * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+ */
+ if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
+ READ_ONCE(rq->gstate) == rq->aborted_gstate)
+ blk_mq_rq_timed_out(rq, reserved);
+ }
+
static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
struct blk_mq_timeout_data data = {
.next = 0,
.next_set = 0,
+ .nr_expired = 0,
};
+ struct blk_mq_hw_ctx *hctx;
int i;
/* A deadlock might occur if a request is stuck requiring a
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
+ /* scan for the expired ones and set their ->aborted_gstate */
blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
+ if (data.nr_expired) {
+ bool has_rcu = false;
+
+ /*
+ * Wait till everyone sees ->aborted_gstate. The
+ * sequential waits for SRCUs aren't ideal. If this ever
+ * becomes a problem, we can add per-hw_ctx rcu_head and
+ * wait in parallel.
+ */
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (!hctx->nr_expired)
+ continue;
+
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+ has_rcu = true;
+ else
+ synchronize_srcu(hctx->srcu);
+
+ hctx->nr_expired = 0;
+ }
+ if (has_rcu)
+ synchronize_rcu();
+
+ /* terminate the ones we won */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+ }
+
if (data.next_set) {
data.next = blk_rq_timeout(round_jiffies_up(data.next));
mod_timer(&q->timeout, data.next);
} else {
- struct blk_mq_hw_ctx *hctx;
-
+ /*
+ * Request timeouts are handled as a forward rolling timer. If
+ * we end up here it means that no requests are pending and
+ * also that no request has been pending for a while. Mark
+ * each hctx as idle.
+ */
queue_for_each_hw_ctx(q, hctx, i) {
/* the hctx may be unmapped, so check it here */
if (blk_mq_hw_queue_mapped(hctx))
/*
* Mark us waiting for a tag. For shared tags, this involves hooking us into
- * the tag wakeups. For non-shared tags, we can simply mark us nedeing a
- * restart. For both caes, take care to check the condition again after
+ * the tag wakeups. For non-shared tags, we can simply mark us needing a
+ * restart. For both cases, take care to check the condition again after
* marking us as waiting.
*/
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
struct request *rq)
{
struct blk_mq_hw_ctx *this_hctx = *hctx;
- bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
struct sbq_wait_state *ws;
wait_queue_entry_t *wait;
bool ret;
- if (!shared_tags) {
+ if (!(this_hctx->flags & BLK_MQ_F_TAG_SHARED)) {
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
- } else {
- wait = &this_hctx->dispatch_wait;
- if (!list_empty_careful(&wait->entry))
- return false;
- spin_lock(&this_hctx->lock);
- if (!list_empty(&wait->entry)) {
- spin_unlock(&this_hctx->lock);
- return false;
- }
+ /*
+ * It's possible that a tag was freed in the window between the
+ * allocation failure and adding the hardware queue to the wait
+ * queue.
+ *
+ * Don't clear RESTART here, someone else could have set it.
+ * At most this will cost an extra queue run.
+ */
+ return blk_mq_get_driver_tag(rq, hctx, false);
+ }
- ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
- add_wait_queue(&ws->wait, wait);
+ wait = &this_hctx->dispatch_wait;
+ if (!list_empty_careful(&wait->entry))
+ return false;
+
+ spin_lock(&this_hctx->lock);
+ if (!list_empty(&wait->entry)) {
+ spin_unlock(&this_hctx->lock);
+ return false;
}
+ ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
+ add_wait_queue(&ws->wait, wait);
+
/*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
*/
ret = blk_mq_get_driver_tag(rq, hctx, false);
-
- if (!shared_tags) {
- /*
- * Don't clear RESTART here, someone else could have set it.
- * At most this will cost an extra queue run.
- */
- return ret;
- } else {
- if (!ret) {
- spin_unlock(&this_hctx->lock);
- return false;
- }
-
- /*
- * We got a tag, remove ourselves from the wait queue to ensure
- * someone else gets the wakeup.
- */
- spin_lock_irq(&ws->wait.lock);
- list_del_init(&wait->entry);
- spin_unlock_irq(&ws->wait.lock);
+ if (!ret) {
spin_unlock(&this_hctx->lock);
- return true;
+ return false;
}
+
+ /*
+ * We got a tag, remove ourselves from the wait queue to ensure
+ * someone else gets the wakeup.
+ */
+ spin_lock_irq(&ws->wait.lock);
+ list_del_init(&wait->entry);
+ spin_unlock_irq(&ws->wait.lock);
+ spin_unlock(&this_hctx->lock);
+
+ return true;
}
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
/*
* We should be running this queue from one of the CPUs that
* are mapped to it.
+ *
+ * There are at least two related races now between setting
+ * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
+ * __blk_mq_run_hw_queue():
+ *
+ * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
+ * but later it becomes online, then this warning is harmless
+ * at all
+ *
+ * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
+ * but later it becomes offline, then the warning can't be
+ * triggered, and we depend on blk-mq timeout handler to
+ * handle dispatched requests to this hctx
*/
- WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
- cpu_online(hctx->next_cpu));
+ if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
+ cpu_online(hctx->next_cpu)) {
+ printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
+ raw_smp_processor_id(),
+ cpumask_empty(hctx->cpumask) ? "inactive": "active");
+ dump_stack();
+ }
/*
* We can't run the queue inline with ints disabled. Ensure that
*/
WARN_ON_ONCE(in_interrupt());
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- blk_mq_sched_dispatch_requests(hctx);
- rcu_read_unlock();
- } else {
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- blk_mq_sched_dispatch_requests(hctx);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
- }
+ hctx_lock(hctx, &srcu_idx);
+ blk_mq_sched_dispatch_requests(hctx);
+ hctx_unlock(hctx, srcu_idx);
}
/*
*/
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
+ bool tried = false;
+
if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) {
int next_cpu;
-
- next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+ select_cpu:
+ next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+ cpu_online_mask);
if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(hctx->cpumask);
+ next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
- hctx->next_cpu = next_cpu;
+ /*
+ * No online CPU is found, so have to make sure hctx->next_cpu
+ * is set correctly for not breaking workqueue.
+ */
+ if (next_cpu >= nr_cpu_ids)
+ hctx->next_cpu = cpumask_first(hctx->cpumask);
+ else
+ hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
+ /*
+ * Do unbound schedule if we can't find a online CPU for this hctx,
+ * and it should only happen in the path of handling CPU DEAD.
+ */
+ if (!cpu_online(hctx->next_cpu)) {
+ if (!tried) {
+ tried = true;
+ goto select_cpu;
+ }
+
+ /*
+ * Make sure to re-select CPU next time once after CPUs
+ * in hctx->cpumask become online again.
+ */
+ hctx->next_cpu_batch = 1;
+ return WORK_CPU_UNBOUND;
+ }
return hctx->next_cpu;
}
put_cpu();
}
- kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
- &hctx->run_work,
- msecs_to_jiffies(msecs));
+ kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
+ msecs_to_jiffies(msecs));
}
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- if (blk_mq_hctx_has_pending(hctx)) {
+ int srcu_idx;
+ bool need_run;
+
+ /*
+ * When queue is quiesced, we may be switching io scheduler, or
+ * updating nr_hw_queues, or other things, and we can't run queue
+ * any more, even __blk_mq_hctx_has_pending() can't be called safely.
+ *
+ * And queue will be rerun in blk_mq_unquiesce_queue() if it is
+ * quiesced.
+ */
+ hctx_lock(hctx, &srcu_idx);
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx);
+ hctx_unlock(hctx, srcu_idx);
+
+ if (need_run) {
__blk_mq_delay_run_hw_queue(hctx, async, 0);
return true;
}
return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true);
}
- static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie, bool may_sleep)
+ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
};
blk_qc_t new_cookie;
blk_status_t ret;
+
+ new_cookie = request_to_qc_t(hctx, rq);
+
+ /*
+ * For OK queue, we are done. For error, caller may kill it.
+ * Any other error (busy), just add it to our list as we
+ * previously would have done.
+ */
+ ret = q->mq_ops->queue_rq(hctx, &bd);
+ switch (ret) {
+ case BLK_STS_OK:
+ *cookie = new_cookie;
+ break;
+ case BLK_STS_RESOURCE:
+ __blk_mq_requeue_request(rq);
+ break;
+ default:
+ *cookie = BLK_QC_T_NONE;
+ break;
+ }
+
+ return ret;
+ }
+
+ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+ struct request *rq,
+ blk_qc_t *cookie,
+ bool bypass_insert)
+ {
+ struct request_queue *q = rq->q;
bool run_queue = true;
- /* RCU or SRCU read lock is needed before checking quiesced flag */
+ /*
+ * RCU or SRCU read lock is needed before checking quiesced flag.
+ *
+ * When queue is stopped or quiesced, ignore 'bypass_insert' from
+ * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
+ * and avoid driver to try to dispatch again.
+ */
if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
run_queue = false;
+ bypass_insert = false;
goto insert;
}
- if (q->elevator)
+ if (q->elevator && !bypass_insert)
goto insert;
if (!blk_mq_get_driver_tag(rq, NULL, false))
goto insert;
}
- new_cookie = request_to_qc_t(hctx, rq);
-
- /*
- * For OK queue, we are done. For error, kill it. Any other
- * error (busy), just add it to our list as we previously
- * would have done
- */
- ret = q->mq_ops->queue_rq(hctx, &bd);
- switch (ret) {
- case BLK_STS_OK:
- *cookie = new_cookie;
- return;
- case BLK_STS_RESOURCE:
- __blk_mq_requeue_request(rq);
- goto insert;
- default:
- *cookie = BLK_QC_T_NONE;
- blk_mq_end_request(rq, ret);
- return;
- }
-
+ return __blk_mq_issue_directly(hctx, rq, cookie);
insert:
- blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
+ if (bypass_insert)
+ return BLK_STS_RESOURCE;
+
+ blk_mq_sched_insert_request(rq, false, run_queue, false);
+ return BLK_STS_OK;
}
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- rcu_read_lock();
- __blk_mq_try_issue_directly(hctx, rq, cookie, false);
- rcu_read_unlock();
- } else {
- unsigned int srcu_idx;
+ blk_status_t ret;
+ int srcu_idx;
- might_sleep();
+ might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
- srcu_idx = srcu_read_lock(hctx->queue_rq_srcu);
- __blk_mq_try_issue_directly(hctx, rq, cookie, true);
- srcu_read_unlock(hctx->queue_rq_srcu, srcu_idx);
- }
+ hctx_lock(hctx, &srcu_idx);
+
+ ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+ if (ret == BLK_STS_RESOURCE)
+ blk_mq_sched_insert_request(rq, false, true, false);
+ else if (ret != BLK_STS_OK)
+ blk_mq_end_request(rq, ret);
+
+ hctx_unlock(hctx, srcu_idx);
+ }
+
+ blk_status_t blk_mq_request_issue_directly(struct request *rq)
+ {
+ blk_status_t ret;
+ int srcu_idx;
+ blk_qc_t unused_cookie;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+ hctx_lock(hctx, &srcu_idx);
+ ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+ hctx_unlock(hctx, srcu_idx);
+
+ return ret;
}
static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else if (q->elevator) {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
- blk_mq_sched_insert_request(rq, false, true, true, true);
+ blk_mq_sched_insert_request(rq, false, true, true);
} else {
blk_mq_put_ctx(data.ctx);
blk_mq_bio_to_request(rq, bio);
return (size_t)PAGE_SIZE << order;
}
+ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
+ unsigned int hctx_idx, int node)
+ {
+ int ret;
+
+ if (set->ops->init_request) {
+ ret = set->ops->init_request(set, rq, hctx_idx, node);
+ if (ret)
+ return ret;
+ }
+
+ seqcount_init(&rq->gstate_seq);
+ u64_stats_init(&rq->aborted_gstate_sync);
+ return 0;
+ }
+
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth)
{
struct request *rq = p;
tags->static_rqs[i] = rq;
- if (set->ops->init_request) {
- if (set->ops->init_request(set, rq, hctx_idx,
- node)) {
- tags->static_rqs[i] = NULL;
- goto fail;
- }
+ if (blk_mq_init_request(set, rq, hctx_idx, node)) {
+ tags->static_rqs[i] = NULL;
+ goto fail;
}
p += rq_size;
{
blk_mq_debugfs_unregister_hctx(hctx);
- blk_mq_tag_idle(hctx);
+ if (blk_mq_hw_queue_mapped(hctx))
+ blk_mq_tag_idle(hctx);
if (set->ops->exit_request)
set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
set->ops->exit_hctx(hctx, hctx_idx);
if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->queue_rq_srcu);
+ cleanup_srcu_struct(hctx->srcu);
blk_mq_remove_cpuhp(hctx);
blk_free_flush_queue(hctx->fq);
if (!hctx->fq)
goto sched_exit_hctx;
- if (set->ops->init_request &&
- set->ops->init_request(set, hctx->fq->flush_rq, hctx_idx,
- node))
+ if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, node))
goto free_fq;
if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->queue_rq_srcu);
+ init_srcu_struct(hctx->srcu);
blk_mq_debugfs_register_hctx(q, hctx);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
- /* If the cpu isn't present, the cpu is mapped to first hctx */
- if (!cpu_present(i))
- continue;
-
- hctx = blk_mq_map_queue(q, i);
-
/*
* Set local node, IFF we have more than one hw queue. If
* not, we remain on the home node of the device
*/
+ hctx = blk_mq_map_queue(q, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
hctx->numa_node = local_memory_node(cpu_to_node(i));
}
*
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
- for_each_present_cpu(i) {
+ for_each_possible_cpu(i) {
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
/*
* Initialize batch roundrobin counts
*/
- hctx->next_cpu = cpumask_first(hctx->cpumask);
+ hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+ cpu_online_mask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
}
{
int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, queue_rq_srcu),
+ BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
__alignof__(struct blk_mq_hw_ctx)) !=
sizeof(struct blk_mq_hw_ctx));
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
blk_mq_sysfs_unregister(q);
+
+ /* protect against switching io scheduler */
+ mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int node;
}
}
q->nr_hw_queues = i;
+ mutex_unlock(&q->sysfs_lock);
blk_mq_sysfs_register(q);
}
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
- if (set->ops->map_queues)
+ if (set->ops->map_queues) {
+ int cpu;
+ /*
+ * transport .map_queues is usually done in the following
+ * way:
+ *
+ * for (queue = 0; queue < set->nr_hw_queues; queue++) {
+ * mask = get_cpu_mask(queue)
+ * for_each_cpu(cpu, mask)
+ * set->mq_map[cpu] = queue;
+ * }
+ *
+ * When we need to remap, the table has to be cleared for
+ * killing stale mapping since one CPU may not be mapped
+ * to any hw queue.
+ */
+ for_each_possible_cpu(cpu)
+ set->mq_map[cpu] = 0;
+
return set->ops->map_queues(set);
- else
+ } else
return blk_mq_map_queues(set);
}
return -EINVAL;
blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!ret)
q->nr_requests = nr;
+ blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
return ret;
unsigned int nsecs;
ktime_t kt;
- if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
+ if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
return false;
/*
if (!nsecs)
return false;
- set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
+ rq->rq_flags |= RQF_MQ_POLL_SLEPT;
/*
* This will be replaced with the stats tracking code, using
hrtimer_init_sleeper(&hs, current);
do {
- if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
+ if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
hrtimer_start_expires(&hs.timer, mode);
static int __init blk_mq_init(void)
{
- /*
- * See comment in block/blk.h rq_atomic_flags enum
- */
- BUILD_BUG_ON((REQ_ATOM_STARTED / BITS_PER_BYTE) !=
- (REQ_ATOM_COMPLETE / BITS_PER_BYTE));
-
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
return 0;
unsigned int scale;
- struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
- struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
- struct latency_bucket __percpu *latency_buckets;
+ struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
+ struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
+ struct latency_bucket __percpu *latency_buckets[2];
unsigned long last_calculate_time;
unsigned long filtered_latency;
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_bytes,
},
+ {
+ .name = "throttle.io_service_bytes_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_bytes_recursive,
+ },
{
.name = "throttle.io_serviced",
.private = (unsigned long)&blkcg_policy_throtl,
.seq_show = blkg_print_stat_ios,
},
+ {
+ .name = "throttle.io_serviced_recursive",
+ .private = (unsigned long)&blkcg_policy_throtl,
+ .seq_show = blkg_print_stat_ios_recursive,
+ },
{ } /* terminate */
};
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
- struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
- int i, cpu;
- unsigned long last_latency = 0;
- unsigned long latency;
+ struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
+ int i, cpu, rw;
+ unsigned long last_latency[2] = { 0 };
+ unsigned long latency[2];
if (!blk_queue_nonrot(td->queue))
return;
td->last_calculate_time = jiffies;
memset(avg_latency, 0, sizeof(avg_latency));
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- struct latency_bucket *tmp = &td->tmp_buckets[i];
-
- for_each_possible_cpu(cpu) {
- struct latency_bucket *bucket;
-
- /* this isn't race free, but ok in practice */
- bucket = per_cpu_ptr(td->latency_buckets, cpu);
- tmp->total_latency += bucket[i].total_latency;
- tmp->samples += bucket[i].samples;
- bucket[i].total_latency = 0;
- bucket[i].samples = 0;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ struct latency_bucket *tmp = &td->tmp_buckets[rw][i];
+
+ for_each_possible_cpu(cpu) {
+ struct latency_bucket *bucket;
+
+ /* this isn't race free, but ok in practice */
+ bucket = per_cpu_ptr(td->latency_buckets[rw],
+ cpu);
+ tmp->total_latency += bucket[i].total_latency;
+ tmp->samples += bucket[i].samples;
+ bucket[i].total_latency = 0;
+ bucket[i].samples = 0;
+ }
- if (tmp->samples >= 32) {
- int samples = tmp->samples;
+ if (tmp->samples >= 32) {
+ int samples = tmp->samples;
- latency = tmp->total_latency;
+ latency[rw] = tmp->total_latency;
- tmp->total_latency = 0;
- tmp->samples = 0;
- latency /= samples;
- if (latency == 0)
- continue;
- avg_latency[i].latency = latency;
+ tmp->total_latency = 0;
+ tmp->samples = 0;
+ latency[rw] /= samples;
+ if (latency[rw] == 0)
+ continue;
+ avg_latency[rw][i].latency = latency[rw];
+ }
}
}
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
- if (!avg_latency[i].latency) {
- if (td->avg_buckets[i].latency < last_latency)
- td->avg_buckets[i].latency = last_latency;
- continue;
- }
+ for (rw = READ; rw <= WRITE; rw++) {
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ if (!avg_latency[rw][i].latency) {
+ if (td->avg_buckets[rw][i].latency < last_latency[rw])
+ td->avg_buckets[rw][i].latency =
+ last_latency[rw];
+ continue;
+ }
- if (!td->avg_buckets[i].valid)
- latency = avg_latency[i].latency;
- else
- latency = (td->avg_buckets[i].latency * 7 +
- avg_latency[i].latency) >> 3;
+ if (!td->avg_buckets[rw][i].valid)
+ latency[rw] = avg_latency[rw][i].latency;
+ else
+ latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
+ avg_latency[rw][i].latency) >> 3;
- td->avg_buckets[i].latency = max(latency, last_latency);
- td->avg_buckets[i].valid = true;
- last_latency = td->avg_buckets[i].latency;
+ td->avg_buckets[rw][i].latency = max(latency[rw],
+ last_latency[rw]);
+ td->avg_buckets[rw][i].valid = true;
+ last_latency[rw] = td->avg_buckets[rw][i].latency;
+ }
}
for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
throtl_log(&td->service_queue,
- "Latency bucket %d: latency=%ld, valid=%d", i,
- td->avg_buckets[i].latency, td->avg_buckets[i].valid);
+ "Latency bucket %d: read latency=%ld, read valid=%d, "
+ "write latency=%ld, write valid=%d", i,
+ td->avg_buckets[READ][i].latency,
+ td->avg_buckets[READ][i].valid,
+ td->avg_buckets[WRITE][i].latency,
+ td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
out_unlock:
spin_unlock_irq(q->queue_lock);
out:
- /*
- * As multiple blk-throtls may stack in the same issue path, we
- * don't want bios to leave with the flag set. Clear the flag if
- * being issued.
- */
- if (!throttled)
- bio_clear_flag(bio, BIO_THROTTLED);
+ bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
struct latency_bucket *latency;
int index;
- if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
+ if (!td || td->limit_index != LIMIT_LOW ||
+ !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
!blk_queue_nonrot(td->queue))
return;
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets);
+ latency = get_cpu_ptr(td->latency_buckets[op]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets);
+ put_cpu_ptr(td->latency_buckets[op]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
unsigned long finish_time;
unsigned long start_time;
unsigned long lat;
+ int rw = bio_data_dir(bio);
tg = bio->bi_cg_private;
if (!tg)
bucket = request_bucket_index(
blk_stat_size(&bio->bi_issue_stat));
- threshold = tg->td->avg_buckets[bucket].latency +
+ threshold = tg->td->avg_buckets[rw][bucket].latency +
tg->latency_target;
if (lat > threshold)
tg->bad_bio_cnt++;
td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
if (!td)
return -ENOMEM;
- td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
+ td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
LATENCY_BUCKET_SIZE, __alignof__(u64));
- if (!td->latency_buckets) {
+ if (!td->latency_buckets[READ]) {
+ kfree(td);
+ return -ENOMEM;
+ }
+ td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
+ LATENCY_BUCKET_SIZE, __alignof__(u64));
+ if (!td->latency_buckets[WRITE]) {
+ free_percpu(td->latency_buckets[READ]);
kfree(td);
return -ENOMEM;
}
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret) {
- free_percpu(td->latency_buckets);
+ free_percpu(td->latency_buckets[READ]);
+ free_percpu(td->latency_buckets[WRITE]);
kfree(td);
}
return ret;
BUG_ON(!q->td);
throtl_shutdown_wq(q);
blkcg_deactivate_policy(q, &blkcg_policy_throtl);
- free_percpu(q->td->latency_buckets);
+ free_percpu(q->td->latency_buckets[READ]);
+ free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
} else {
td->throtl_slice = DFL_THROTL_SLICE_HD;
td->filtered_latency = LATENCY_FILTERED_HD;
- for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
- td->avg_buckets[i].latency = DFL_HD_BASELINE_LATENCY;
+ for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
+ td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
+ td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
+ }
}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif
- td->track_bio_latency = !q->mq_ops && !q->request_fn;
+ td->track_bio_latency = !queue_is_rq_based(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
}
void blk_account_io_completion(struct request *req, unsigned int bytes);
void blk_account_io_done(struct request *req);
- /*
- * Internal atomic flags for request handling
- */
- enum rq_atomic_flags {
- /*
- * Keep these two bits first - not because we depend on the
- * value of them, but we do depend on them being in the same
- * byte of storage to ensure ordering on writes. Keeping them
- * first will achieve that nicely.
- */
- REQ_ATOM_COMPLETE = 0,
- REQ_ATOM_STARTED,
-
- REQ_ATOM_POLL_SLEPT,
- };
-
/*
* EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds
+ * sure that only one of them succeeds. Steal the bottom bit of the
+ * __deadline field for this.
*/
static inline int blk_mark_rq_complete(struct request *rq)
{
- return test_and_set_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ return test_and_set_bit(0, &rq->__deadline);
}
static inline void blk_clear_rq_complete(struct request *rq)
{
- clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
+ clear_bit(0, &rq->__deadline);
+ }
+
+ static inline bool blk_rq_is_complete(struct request *rq)
+ {
+ return test_bit(0, &rq->__deadline);
}
/*
e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
}
+ int elv_register_queue(struct request_queue *q);
+ void elv_unregister_queue(struct request_queue *q);
+
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
#ifdef CONFIG_FAIL_IO_TIMEOUT
q->last_merge = NULL;
}
+ /*
+ * Steal a bit from this field for legacy IO path atomic IO marking. Note that
+ * setting the deadline clears the bottom bit, potentially clearing the
+ * completed bit. The user has to be OK with this (current ones are fine).
+ */
+ static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
+ {
+ rq->__deadline = time & ~0x1UL;
+ }
+
+ static inline unsigned long blk_rq_deadline(struct request *rq)
+ {
+ return rq->__deadline & ~0x1UL;
+ }
+
/*
* Internal io_context interface
*/
}
#endif /* CONFIG_BOUNCE */
+extern void blk_drain_queue(struct request_queue *q);
+
#endif /* BLK_INTERNAL_H */
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
unsigned char *vfrom;
- struct bio_vec tovec, *fromvec = from->bi_io_vec;
+ struct bio_vec tovec, fromvec;
struct bvec_iter iter;
+ /*
+ * The bio of @from is created by bounce, so we can iterate
+ * its bvec from start to end, but the @from->bi_iter can't be
+ * trusted because it might be changed by splitting.
+ */
+ struct bvec_iter from_iter = BVEC_ITER_ALL_INIT;
bio_for_each_segment(tovec, to, iter) {
- if (tovec.bv_page != fromvec->bv_page) {
+ fromvec = bio_iter_iovec(from, from_iter);
+ if (tovec.bv_page != fromvec.bv_page) {
/*
* fromvec->bv_offset and fromvec->bv_len might have
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
- vfrom = page_address(fromvec->bv_page) +
+ vfrom = page_address(fromvec.bv_page) +
tovec.bv_offset;
bounce_copy_vec(&tovec, vfrom);
flush_dcache_page(tovec.bv_page);
}
-
- fromvec++;
+ bio_advance_iter(from, &from_iter, tovec.bv_len);
}
}
static void bounce_end_io(struct bio *bio, mempool_t *pool)
{
struct bio *bio_orig = bio->bi_private;
- struct bio_vec *bvec, *org_vec;
+ struct bio_vec *bvec, orig_vec;
int i;
- int start = bio_orig->bi_iter.bi_idx;
+ struct bvec_iter orig_iter = bio_orig->bi_iter;
/*
* free up bounce indirect pages used
*/
bio_for_each_segment_all(bvec, bio, i) {
- org_vec = bio_orig->bi_io_vec + i + start;
-
- if (bvec->bv_page == org_vec->bv_page)
- continue;
-
- dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, pool);
+ orig_vec = bio_iter_iovec(bio_orig, orig_iter);
+ if (bvec->bv_page != orig_vec.bv_page) {
+ dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
+ mempool_free(bvec->bv_page, pool);
+ }
+ bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
bio_orig->bi_status = bio->bi_status;
unsigned i = 0;
bool bounce = false;
int sectors = 0;
+ bool passthrough = bio_is_passthrough(*bio_orig);
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_PAGES)
if (!bounce)
return;
- if (sectors < bio_sectors(*bio_orig)) {
+ if (!passthrough && sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, bounce_bio_split);
bio_chain(bio, *bio_orig);
generic_make_request(*bio_orig);
*bio_orig = bio;
}
- bio = bio_clone_bioset(*bio_orig, GFP_NOIO, bounce_bio_set);
+ bio = bio_clone_bioset(*bio_orig, GFP_NOIO, passthrough ? NULL :
+ bounce_bio_set);
bio_for_each_segment_all(to, bio, i) {
struct page *page = to->bv_page;
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
- #include <linux/lightnvm.h>
#include <linux/configfs.h>
#include <linux/badblocks.h>
+ #include <linux/fault-inject.h>
#define SECTOR_SHIFT 9
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
#define TICKS_PER_SEC 50ULL
#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static DECLARE_FAULT_ATTR(null_timeout_attr);
+ #endif
+
static inline u64 mb_per_tick(int mbps)
{
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
struct nullb_cmd {
struct list_head list;
struct llist_node ll_list;
- call_single_data_t csd;
+ struct __call_single_data csd;
struct request *rq;
struct bio *bio;
unsigned int tag;
+ blk_status_t error;
struct nullb_queue *nq;
struct hrtimer timer;
- blk_status_t error;
};
struct nullb_queue {
unsigned int hw_queue_depth; /* queue depth */
unsigned int index; /* index of the disk, only valid with a disk */
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
- bool use_lightnvm; /* register as a LightNVM device */
bool blocking; /* blocking blk-mq device */
bool use_per_node_hctx; /* use per-node allocation for hardware context */
bool power; /* power on/off the device */
unsigned int index;
struct request_queue *q;
struct gendisk *disk;
- struct nvm_dev *ndev;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
unsigned int queue_depth;
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
- static struct kmem_cache *ppa_cache;
static struct blk_mq_tag_set tag_set;
enum {
module_param_named(home_node, g_home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ static char g_timeout_str[80];
+ module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+ #endif
+
static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max)
module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
- static bool g_use_lightnvm;
- module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
- MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
-
static bool g_blocking;
module_param_named(blocking, g_blocking, bool, S_IRUGO);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
NULLB_DEVICE_ATTR(irqmode, uint);
NULLB_DEVICE_ATTR(hw_queue_depth, uint);
NULLB_DEVICE_ATTR(index, uint);
- NULLB_DEVICE_ATTR(use_lightnvm, bool);
NULLB_DEVICE_ATTR(blocking, bool);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
NULLB_DEVICE_ATTR(memory_backed, bool);
&nullb_device_attr_irqmode,
&nullb_device_attr_hw_queue_depth,
&nullb_device_attr_index,
- &nullb_device_attr_use_lightnvm,
&nullb_device_attr_blocking,
&nullb_device_attr_use_per_node_hctx,
&nullb_device_attr_power,
dev->blocksize = g_bs;
dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth;
- dev->use_lightnvm = g_use_lightnvm;
dev->blocking = g_blocking;
dev->use_per_node_hctx = g_use_per_node_hctx;
return dev;
return BLK_QC_T_NONE;
}
+ static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
+ {
+ pr_info("null: rq %p timed out\n", rq);
+ return BLK_EH_HANDLED;
+ }
+
static int null_rq_prep_fn(struct request_queue *q, struct request *req)
{
struct nullb *nullb = q->queuedata;
return BLKPREP_DEFER;
}
+ static bool should_timeout_request(struct request *rq)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ if (g_timeout_str[0])
+ return should_fail(&null_timeout_attr, 1);
+ #endif
+
+ return false;
+ }
+
static void null_request_fn(struct request_queue *q)
{
struct request *rq;
while ((rq = blk_fetch_request(q)) != NULL) {
struct nullb_cmd *cmd = rq->special;
- spin_unlock_irq(q->queue_lock);
- null_handle_cmd(cmd);
- spin_lock_irq(q->queue_lock);
+ if (!should_timeout_request(rq)) {
+ spin_unlock_irq(q->queue_lock);
+ null_handle_cmd(cmd);
+ spin_lock_irq(q->queue_lock);
+ }
}
}
+ static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+ {
+ pr_info("null: rq %p timed out\n", rq);
+ return BLK_EH_HANDLED;
+ }
+
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
blk_mq_start_request(bd->rq);
- return null_handle_cmd(cmd);
+ if (!should_timeout_request(bd->rq))
+ return null_handle_cmd(cmd);
+
+ return BLK_STS_OK;
}
static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.complete = null_softirq_done_fn,
+ .timeout = null_timeout_rq,
};
static void cleanup_queue(struct nullb_queue *nq)
kfree(nullb->queues);
}
- #ifdef CONFIG_NVM
-
- static void null_lnvm_end_io(struct request *rq, blk_status_t status)
- {
- struct nvm_rq *rqd = rq->end_io_data;
-
- /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
- rqd->error = status ? -EIO : 0;
- nvm_end_io(rqd);
-
- blk_put_request(rq);
- }
-
- static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
- {
- struct request_queue *q = dev->q;
- struct request *rq;
- struct bio *bio = rqd->bio;
-
- rq = blk_mq_alloc_request(q,
- op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq))
- return -ENOMEM;
-
- blk_init_request_from_bio(rq, bio);
-
- rq->end_io_data = rqd;
-
- blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
-
- return 0;
- }
-
- static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
- {
- struct nullb *nullb = dev->q->queuedata;
- sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
- sector_t blksize;
- struct nvm_id_group *grp;
-
- id->ver_id = 0x1;
- id->vmnt = 0;
- id->cap = 0x2;
- id->dom = 0x1;
-
- id->ppaf.blk_offset = 0;
- id->ppaf.blk_len = 16;
- id->ppaf.pg_offset = 16;
- id->ppaf.pg_len = 16;
- id->ppaf.sect_offset = 32;
- id->ppaf.sect_len = 8;
- id->ppaf.pln_offset = 40;
- id->ppaf.pln_len = 8;
- id->ppaf.lun_offset = 48;
- id->ppaf.lun_len = 8;
- id->ppaf.ch_offset = 56;
- id->ppaf.ch_len = 8;
-
- sector_div(size, nullb->dev->blocksize); /* convert size to pages */
- size >>= 8; /* concert size to pgs pr blk */
- grp = &id->grp;
- grp->mtype = 0;
- grp->fmtype = 0;
- grp->num_ch = 1;
- grp->num_pg = 256;
- blksize = size;
- size >>= 16;
- grp->num_lun = size + 1;
- sector_div(blksize, grp->num_lun);
- grp->num_blk = blksize;
- grp->num_pln = 1;
-
- grp->fpg_sz = nullb->dev->blocksize;
- grp->csecs = nullb->dev->blocksize;
- grp->trdt = 25000;
- grp->trdm = 25000;
- grp->tprt = 500000;
- grp->tprm = 500000;
- grp->tbet = 1500000;
- grp->tbem = 1500000;
- grp->mpos = 0x010101; /* single plane rwe */
- grp->cpar = nullb->dev->hw_queue_depth;
-
- return 0;
- }
-
- static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
- {
- mempool_t *virtmem_pool;
-
- virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
- if (!virtmem_pool) {
- pr_err("null_blk: Unable to create virtual memory pool\n");
- return NULL;
- }
-
- return virtmem_pool;
- }
-
- static void null_lnvm_destroy_dma_pool(void *pool)
- {
- mempool_destroy(pool);
- }
-
- static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
- gfp_t mem_flags, dma_addr_t *dma_handler)
- {
- return mempool_alloc(pool, mem_flags);
- }
-
- static void null_lnvm_dev_dma_free(void *pool, void *entry,
- dma_addr_t dma_handler)
- {
- mempool_free(entry, pool);
- }
-
- static struct nvm_dev_ops null_lnvm_dev_ops = {
- .identity = null_lnvm_id,
- .submit_io = null_lnvm_submit_io,
-
- .create_dma_pool = null_lnvm_create_dma_pool,
- .destroy_dma_pool = null_lnvm_destroy_dma_pool,
- .dev_dma_alloc = null_lnvm_dev_dma_alloc,
- .dev_dma_free = null_lnvm_dev_dma_free,
-
- /* Simulate nvme protocol restriction */
- .max_phys_sect = 64,
- };
-
- static int null_nvm_register(struct nullb *nullb)
- {
- struct nvm_dev *dev;
- int rv;
-
- dev = nvm_alloc_dev(0);
- if (!dev)
- return -ENOMEM;
-
- dev->q = nullb->q;
- memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
- dev->ops = &null_lnvm_dev_ops;
-
- rv = nvm_register(dev);
- if (rv) {
- kfree(dev);
- return rv;
- }
- nullb->ndev = dev;
- return 0;
- }
-
- static void null_nvm_unregister(struct nullb *nullb)
- {
- nvm_unregister(nullb->ndev);
- }
- #else
- static int null_nvm_register(struct nullb *nullb)
- {
- pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
- return -EINVAL;
- }
- static void null_nvm_unregister(struct nullb *nullb) {}
- #endif /* CONFIG_NVM */
-
static void null_del_dev(struct nullb *nullb)
{
struct nullb_device *dev = nullb->dev;
list_del_init(&nullb->list);
- if (dev->use_lightnvm)
- null_nvm_unregister(nullb);
- else
- del_gendisk(nullb->disk);
+ del_gendisk(nullb->disk);
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer);
if (dev->queue_mode == NULL_Q_MQ &&
nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
- if (!dev->use_lightnvm)
- put_disk(nullb->disk);
+ put_disk(nullb->disk);
cleanup_queues(nullb);
if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true);
{
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
- if (dev->use_lightnvm && dev->blocksize != 4096)
- dev->blocksize = 4096;
-
- if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
- dev->queue_mode = NULL_Q_MQ;
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes)
dev->mbps = 0;
}
+ static bool null_setup_fault(void)
+ {
+ #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ if (!g_timeout_str[0])
+ return true;
+
+ if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+ return false;
+
+ null_timeout_attr.verbose = 0;
+ #endif
+ return true;
+ }
+
static int null_add_dev(struct nullb_device *dev)
{
struct nullb *nullb;
if (rv)
goto out_cleanup_queues;
+ if (!null_setup_fault())
+ goto out_cleanup_queues;
+
+ nullb->tag_set->timeout = 5 * HZ;
nullb->q = blk_mq_init_queue(nullb->tag_set);
if (IS_ERR(nullb->q)) {
rv = -ENOMEM;
rv = -ENOMEM;
goto out_cleanup_queues;
}
+
+ if (!null_setup_fault())
+ goto out_cleanup_blk_queue;
+
blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+ blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
+ nullb->q->rq_timeout = 5 * HZ;
rv = init_driver_queues(nullb);
if (rv)
goto out_cleanup_blk_queue;
sprintf(nullb->disk_name, "nullb%d", nullb->index);
- if (dev->use_lightnvm)
- rv = null_nvm_register(nullb);
- else
- rv = null_gendisk_register(nullb);
-
+ rv = null_gendisk_register(nullb);
if (rv)
goto out_cleanup_blk_queue;
g_bs = PAGE_SIZE;
}
- if (g_use_lightnvm && g_bs != 4096) {
- pr_warn("null_blk: LightNVM only supports 4k block size\n");
- pr_warn("null_blk: defaults block size to 4k\n");
- g_bs = 4096;
- }
-
- if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
- pr_warn("null_blk: LightNVM only supported for blk-mq\n");
- pr_warn("null_blk: defaults queue mode to blk-mq\n");
- g_queue_mode = NULL_Q_MQ;
- }
-
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.\n",
goto err_conf;
}
- if (g_use_lightnvm) {
- ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
- 0, 0, NULL);
- if (!ppa_cache) {
- pr_err("null_blk: unable to create ppa cache\n");
- ret = -ENOMEM;
- goto err_ppa;
- }
- }
-
for (i = 0; i < nr_devices; i++) {
dev = null_alloc_dev();
if (!dev) {
null_del_dev(nullb);
null_free_dev(dev);
}
- kmem_cache_destroy(ppa_cache);
- err_ppa:
unregister_blkdev(null_major, "nullb");
err_conf:
configfs_unregister_subsystem(&nullb_subsys);
if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
-
- kmem_cache_destroy(ppa_cache);
}
module_init(null_init);
bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
- bv->bv_page = NULL;
}
}
/* Ignore extra keys (which are used for IV etc) */
subkey_size = crypt_subkey_size(cc);
- if (crypt_integrity_hmac(cc))
+ if (crypt_integrity_hmac(cc)) {
+ if (subkey_size < cc->key_mac_size)
+ return -EINVAL;
+
crypt_copy_authenckey(cc->authenc_key, cc->key,
subkey_size - cc->key_mac_size,
cc->key_mac_size);
+ }
+
for (i = 0; i < cc->tfms_count; i++) {
if (crypt_integrity_hmac(cc))
r = crypto_aead_setkey(cc->cipher_tfm.tfms_aead[i],
ret = crypt_setkey(cc);
- /* wipe the kernel key payload copy in each case */
- memset(cc->key, 0, cc->key_size * sizeof(u8));
-
if (!ret) {
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
kzfree(cc->key_string);
}
}
+ /* wipe the kernel key payload copy */
+ if (cc->key_string)
+ memset(cc->key, 0, cc->key_size * sizeof(u8));
+
return ret;
}
cc->tag_pool_max_sectors * cc->on_disk_tag_size);
if (!cc->tag_pool) {
ti->error = "Cannot allocate integrity tags mempool";
+ ret = -ENOMEM;
goto bad;
}
return ret;
if (cc->iv_gen_ops && cc->iv_gen_ops->init)
ret = cc->iv_gen_ops->init(cc);
+ /* wipe the kernel key payload copy */
+ if (cc->key_string)
+ memset(cc->key, 0, cc->key_size * sizeof(u8));
return ret;
}
if (argc == 2 && !strcasecmp(argv[1], "wipe")) {
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 18, 0},
+ .version = {1, 18, 1},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
+
#include "nvme.h"
#include "fabrics.h"
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+ /*
+ * nvme_wq - hosts nvme related works that are not reset or delete
+ * nvme_reset_wq - hosts nvme reset works
+ * nvme_delete_wq - hosts nvme delete works
+ *
+ * nvme_wq will host works such are scan, aen handling, fw activation,
+ * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+ * runs reset works which also flush works hosted on nvme_wq for
+ * serialization purposes. nvme_delete_wq host controller deletion
+ * works which flush reset works for serialization.
+ */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
+ struct workqueue_struct *nvme_reset_wq;
+ EXPORT_SYMBOL_GPL(nvme_reset_wq);
+
+ struct workqueue_struct *nvme_delete_wq;
+ EXPORT_SYMBOL_GPL(nvme_delete_wq);
+
static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return -EBUSY;
- if (!queue_work(nvme_wq, &ctrl->reset_work))
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
- static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
int ret;
flush_work(&ctrl->reset_work);
return ret;
}
+ EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
static void nvme_delete_ctrl_work(struct work_struct *work)
{
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
- if (!queue_work(nvme_wq, &ctrl->delete_work))
+ if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
}
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
return BLK_STS_NOSPC;
+ case NVME_SC_LBA_RANGE:
+ return BLK_STS_TARGET;
+ case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_ONCS_NOT_SUPPORTED:
+ case NVME_SC_INVALID_OPCODE:
+ case NVME_SC_INVALID_FIELD:
+ case NVME_SC_INVALID_NS:
return BLK_STS_NOTSUPP;
case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR:
case NVME_SC_UNWRITTEN_BLOCK:
case NVME_SC_ACCESS_DENIED:
case NVME_SC_READ_ONLY:
+ case NVME_SC_COMPARE_FAILED:
return BLK_STS_MEDIUM;
case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK:
void nvme_complete_rq(struct request *req)
{
- if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
- if (nvme_req_needs_failover(req)) {
+ blk_status_t status = nvme_error_status(req);
+
+ trace_nvme_complete_rq(req);
+
+ if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
+ if (nvme_req_needs_failover(req, status)) {
nvme_failover_req(req);
return;
}
return;
}
}
-
- blk_mq_end_request(req, nvme_error_status(req));
+ blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
old_state = ctrl->state;
switch (new_state) {
+ case NVME_CTRL_ADMIN_ONLY:
+ switch (old_state) {
+ case NVME_CTRL_RECONNECTING:
+ changed = true;
+ /* FALLTHRU */
+ default:
+ break;
+ }
+ break;
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
changed = true;
/* FALLTHRU */
default:
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
}
cmd->common.command_id = req->tag;
+ if (ns)
+ trace_nvme_setup_nvm_cmd(req->q->id, cmd);
+ else
+ trace_nvme_setup_admin_cmd(cmd);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
#ifdef CONFIG_NVME_MULTIPATH
/* should never be called due to GENHD_FL_HIDDEN */
if (WARN_ON_ONCE(ns->head->disk))
- return -ENXIO;
+ goto fail;
#endif
if (!kref_get_unless_zero(&ns->kref))
- return -ENXIO;
+ goto fail;
+ if (!try_module_get(ns->ctrl->ops->module))
+ goto fail_put_ns;
+
return 0;
+
+ fail_put_ns:
+ nvme_put_ns(ns);
+ fail:
+ return -ENXIO;
}
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
- nvme_put_ns(disk->private_data);
+ struct nvme_ns *ns = disk->private_data;
+
+ module_put(ns->ctrl->ops->module);
+ nvme_put_ns(ns);
}
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- queue->limits.discard_alignment = size;
+ queue->limits.discard_alignment = 0;
queue->limits.discard_granularity = size;
blk_queue_max_discard_sectors(queue, UINT_MAX);
struct nvme_ns *ns, struct nvme_id_ns *id)
{
sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ unsigned short bs = 1 << ns->lba_shift;
unsigned stream_alignment = 0;
if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
- blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+ blk_queue_logical_block_size(disk->queue, bs);
+ blk_queue_physical_block_size(disk->queue, bs);
+ blk_queue_io_min(disk->queue, bs);
+
if (ns->ms && !ns->ext &&
(ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns->ms, ns->pi_type);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- if (ctrl->quirks & NVME_QUIRK_STRIPE_SIZE)
+ if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+ is_power_of_2(ctrl->max_hw_sectors))
blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
blk_queue_virt_boundary(q, ctrl->page_size - 1);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
NULL,
};
+ static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+ {
+ int count = 0;
+ struct nvme_ctrl *ctrl;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DEAD)
+ count++;
+ }
+ mutex_unlock(&subsys->lock);
+
+ return count;
+ }
+
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
struct nvme_subsystem *subsys, *found;
* Verify that the subsystem actually supports multiple
* controllers, else bail out.
*/
- if (!(id->cmic & (1 << 1))) {
+ if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
dev_err(ctrl->device,
"ignoring ctrl due to duplicate subnqn (%s).\n",
found->subnqn);
shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout)
- dev_warn(ctrl->device,
+ dev_info(ctrl->device,
"Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout);
} else
struct nvme_ctrl *ctrl =
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
- if (ctrl->state != NVME_CTRL_LIVE)
+ switch (ctrl->state) {
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
+ break;
+ default:
return -EWOULDBLOCK;
+ }
+
file->private_data = ctrl;
return 0;
}
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
+ [NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_RECONNECTING]= "reconnecting",
[NVME_CTRL_DELETING] = "deleting",
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- nvme_setup_streams_ns(ctrl, ns);
id = nvme_identify_ns(ctrl, nsid);
if (!id)
if (nvme_init_ns_head(ns, nsid, id, &new))
goto out_free_id;
+ nvme_setup_streams_ns(ctrl, ns);
#ifdef CONFIG_NVME_MULTIPATH
/*
return;
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
- if (blk_get_integrity(ns->disk))
- blk_integrity_unregister(ns->disk);
nvme_mpath_remove_disk_links(ns);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_id_attr_group);
nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
+ if (blk_get_integrity(ns->disk))
+ blk_integrity_unregister(ns->disk);
}
mutex_lock(&ns->ctrl->subsys->lock);
mutex_unlock(&ns->ctrl->namespaces_mutex);
synchronize_srcu(&ns->head->srcu);
+ nvme_mpath_check_last_path(ns);
nvme_put_ns(ns);
}
if (ctrl->state != NVME_CTRL_LIVE)
return;
+ WARN_ON_ONCE(!ctrl->tagset);
+
if (nvme_identify_ctrl(ctrl, &id))
return;
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
/*
- * Do not queue new scan work when a controller is reset during
- * removal.
+ * Only new queue scan work when admin and IO queues are both alive
*/
if (ctrl->state == NVME_CTRL_LIVE)
queue_work(nvme_wq, &ctrl->scan_work);
int __init nvme_core_init(void)
{
- int result;
+ int result = -ENOMEM;
nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq)
- return -ENOMEM;
+ goto out;
+
+ nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!nvme_reset_wq)
+ goto destroy_wq;
+
+ nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!nvme_delete_wq)
+ goto destroy_reset_wq;
result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
if (result < 0)
- goto destroy_wq;
+ goto destroy_delete_wq;
nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) {
class_destroy(nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+ destroy_delete_wq:
+ destroy_workqueue(nvme_delete_wq);
+ destroy_reset_wq:
+ destroy_workqueue(nvme_reset_wq);
destroy_wq:
destroy_workqueue(nvme_wq);
+ out:
return result;
}
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+ destroy_workqueue(nvme_delete_wq);
+ destroy_workqueue(nvme_reset_wq);
destroy_workqueue(nvme_wq);
}
return NULL;
kref_init(&host->ref);
+ uuid_gen(&host->id);
snprintf(host->nqn, NVMF_NQN_SIZE,
"nqn.2014-08.org.nvmexpress:uuid:%pUb", &host->id);
*/
int nvmf_register_transport(struct nvmf_transport_ops *ops)
{
- if (!ops->create_ctrl)
+ if (!ops->create_ctrl || !ops->module)
return -EINVAL;
down_write(&nvmf_transports_rwsem);
ret = -ENOMEM;
goto out;
}
- if (uuid_parse(p, &hostid)) {
+ ret = uuid_parse(p, &hostid);
+ if (ret) {
pr_err("Invalid hostid %s\n", p);
ret = -EINVAL;
+ kfree(p);
goto out;
}
+ kfree(p);
break;
case NVMF_OPT_DUP_CONNECT:
opts->duplicate_connect = true;
goto out_unlock;
}
+ if (!try_module_get(ops->module)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
ret = nvmf_check_required_opts(opts, ops->required_opts);
if (ret)
- goto out_unlock;
+ goto out_module_put;
ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
ops->allowed_opts | ops->required_opts);
if (ret)
- goto out_unlock;
+ goto out_module_put;
ctrl = ops->create_ctrl(dev, opts);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
- goto out_unlock;
+ goto out_module_put;
}
if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
dev_warn(ctrl->device,
"controller returned incorrect NQN: \"%s\".\n",
ctrl->subsys->subnqn);
+ module_put(ops->module);
up_read(&nvmf_transports_rwsem);
nvme_delete_ctrl_sync(ctrl);
return ERR_PTR(-EINVAL);
}
+ module_put(ops->module);
up_read(&nvmf_transports_rwsem);
return ctrl;
+ out_module_put:
+ module_put(ops->module);
out_unlock:
up_read(&nvmf_transports_rwsem);
out_free_opts:
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
nvme_fc_free_queue(&ctrl->queues[0]);
+ /* re-enable the admin_q so anything new can fast fail */
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
nvme_fc_ctlr_inactive_on_rport(ctrl);
}
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
+
+ /* resume the io queues so that things will fast fail */
+ nvme_start_queues(nctrl);
}
static void
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
/* Remove core ctrl ref. */
nvme_put_ctrl(&ctrl->ctrl);
static struct nvmf_transport_ops nvme_fc_transport = {
.name = "fc",
+ .module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
.allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_fc_create_ctrl,
#define NVME_KATO_GRACE 10
extern struct workqueue_struct *nvme_wq;
+ extern struct workqueue_struct *nvme_reset_wq;
+ extern struct workqueue_struct *nvme_delete_wq;
enum {
NVME_NS_LBA = 0,
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
+ NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING,
NVME_CTRL_RECONNECTING,
NVME_CTRL_DELETING,
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
#ifdef CONFIG_NVME_MULTIPATH
void nvme_failover_req(struct request *req);
- bool nvme_req_needs_failover(struct request *req);
+ bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head);
rcu_assign_pointer(head->current_path, NULL);
}
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
+
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
+{
+ struct nvme_ns_head *head = ns->head;
+
+ if (head->disk && list_empty(&head->list))
+ kblockd_schedule_work(&head->requeue_work);
+}
+
#else
static inline void nvme_failover_req(struct request *req)
{
}
- static inline bool nvme_req_needs_failover(struct request *req)
+ static inline bool nvme_req_needs_failover(struct request *req,
+ blk_status_t error)
{
return false;
}
{
}
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+}
+static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
#endif /* CONFIG_NVME_MULTIPATH */
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
- struct nvme_queue **queues;
+ struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx];
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
- struct nvme_queue *nvmeq = dev->queues[queue_idx];
+ struct nvme_queue *nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}
+static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ int nseg = blk_rq_nr_phys_segments(req);
+ unsigned int avg_seg_size;
+
+ if (nseg == 0)
+ return false;
+
+ avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+
+ if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
+ return false;
+ if (!iod->nvmeq->qid)
+ return false;
+ if (!sgl_threshold || avg_seg_size < sgl_threshold)
+ return false;
+ return true;
+}
+
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
int nseg = blk_rq_nr_phys_segments(rq);
unsigned int size = blk_rq_payload_bytes(rq);
+ iod->use_sgl = nvme_pci_use_sgls(dev, rq);
+
if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg,
iod->use_sgl);
dma_addr_t prp_dma;
int nprps, i;
- iod->use_sgl = false;
-
length -= (page_size - offset);
if (length <= 0) {
iod->first_dma = 0;
}
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmd)
+ struct request *req, struct nvme_rw_command *cmd, int entries)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int length = blk_rq_payload_bytes(req);
struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sg = iod->sg;
- int entries = iod->nents, i = 0;
dma_addr_t sgl_dma;
-
- iod->use_sgl = true;
+ int i = 0;
/* setting the transfer type as SGL */
cmd->flags = NVME_CMD_SGL_METABUF;
- if (length == sg_dma_len(sg)) {
+ if (entries == 1) {
nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
return BLK_STS_OK;
}
}
nvme_pci_sgl_set_data(&sg_list[i++], sg);
-
- length -= sg_dma_len(sg);
sg = sg_next(sg);
- entries--;
- } while (length > 0);
+ } while (--entries > 0);
- WARN_ON(entries > 0);
return BLK_STS_OK;
}
-static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int avg_seg_size;
-
- avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req),
- blk_rq_nr_phys_segments(req));
-
- if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
- return false;
- if (!iod->nvmeq->qid)
- return false;
- if (!sgl_threshold || avg_seg_size < sgl_threshold)
- return false;
- return true;
-}
-
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
enum dma_data_direction dma_dir = rq_data_dir(req) ?
DMA_TO_DEVICE : DMA_FROM_DEVICE;
blk_status_t ret = BLK_STS_IOERR;
+ int nr_mapped;
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
iod->nents = blk_rq_map_sg(q, req, iod->sg);
goto out;
ret = BLK_STS_RESOURCE;
- if (!dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
- DMA_ATTR_NO_WARN))
+ nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
+ DMA_ATTR_NO_WARN);
+ if (!nr_mapped)
goto out;
- if (nvme_pci_use_sgls(dev, req))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ if (iod->use_sgl)
+ ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c;
memset(&c, 0, sizeof(c));
*/
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
- /* If there is a reset ongoing, we shouldn't reset again. */
- if (dev->ctrl.state == NVME_CTRL_RESETTING)
+ /* If there is a reset/reinit ongoing, we shouldn't reset again. */
+ switch (dev->ctrl.state) {
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_RECONNECTING:
return false;
+ default:
+ break;
+ }
/* We shouldn't reset unless the controller is on fatal error state
* _or_ if we lost the communication with it.
if (nvmeq->sq_cmds)
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
- kfree(nvmeq);
}
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
int i;
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
- struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--;
- dev->queues[i] = NULL;
- nvme_free_queue(nvmeq);
+ nvme_free_queue(&dev->queues[i]);
}
}
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
- struct nvme_queue *nvmeq = dev->queues[0];
-
- if (!nvmeq)
- return;
- if (nvme_suspend_queue(nvmeq))
- return;
+ struct nvme_queue *nvmeq = &dev->queues[0];
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
{
- if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+ if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
dev->ctrl.page_size);
nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
return 0;
}
- static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int node)
+ static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
+ int depth, int node)
{
- struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
- node);
- if (!nvmeq)
- return NULL;
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
+ if (dev->ctrl.queue_count > qid)
+ return 0;
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
- dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;
- return nvmeq;
+ return 0;
free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
- kfree(nvmeq);
- return NULL;
+ return -ENOMEM;
}
static int queue_request_irq(struct nvme_queue *nvmeq)
if (result < 0)
return result;
- nvmeq = dev->queues[0];
- if (!nvmeq) {
- nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
- dev_to_node(dev->dev));
- if (!nvmeq)
- return -ENOMEM;
- }
+ result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+ dev_to_node(dev->dev));
+ if (result)
+ return result;
+ nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
/* vector == qid - 1, match nvme_create_queue */
- if (!nvme_alloc_queue(dev, i, dev->q_depth,
+ if (nvme_alloc_queue(dev, i, dev->q_depth,
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
ret = -ENOMEM;
break;
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) {
- ret = nvme_create_queue(dev->queues[i], i);
+ ret = nvme_create_queue(&dev->queues[i], i);
if (ret)
break;
}
/*
* Ignore failing Create SQ/CQ commands, we can continue with less
- * than the desired aount of queues, and even a controller without
- * I/O queues an still be used to issue admin commands. This might
+ * than the desired amount of queues, and even a controller without
+ * I/O queues can still be used to issue admin commands. This might
* be useful to upgrade a buggy firmware for example.
*/
return ret >= 0 ? 0 : ret;
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
- static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+ static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
- u64 szu, size, offset;
+ u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
+
+ return 1ULL << (12 + 4 * szu);
+ }
+
+ static u32 nvme_cmb_size(struct nvme_dev *dev)
+ {
+ return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
+ }
+
+ static void nvme_map_cmb(struct nvme_dev *dev)
+ {
+ u64 size, offset;
resource_size_t bar_size;
struct pci_dev *pdev = to_pci_dev(dev->dev);
- void __iomem *cmb;
int bar;
dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
- if (!(NVME_CMB_SZ(dev->cmbsz)))
- return NULL;
+ if (!dev->cmbsz)
+ return;
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
if (!use_cmb_sqes)
- return NULL;
+ return;
- szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
- size = szu * NVME_CMB_SZ(dev->cmbsz);
- offset = szu * NVME_CMB_OFST(dev->cmbloc);
+ size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
+ offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
bar = NVME_CMB_BIR(dev->cmbloc);
bar_size = pci_resource_len(pdev, bar);
if (offset > bar_size)
- return NULL;
+ return;
/*
* Controllers may support a CMB size larger than their BAR,
if (size > bar_size - offset)
size = bar_size - offset;
- cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
- if (!cmb)
- return NULL;
-
+ dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
+ if (!dev->cmb)
+ return;
dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
dev->cmb_size = size;
- return cmb;
+
+ if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
+ &dev_attr_cmb.attr, NULL))
+ dev_warn(dev->ctrl.device,
+ "failed to add sysfs attribute for CMB\n");
}
static inline void nvme_release_cmb(struct nvme_dev *dev)
dma_addr_t descs_dma;
int i = 0;
void **bufs;
- u64 size = 0, tmp;
+ u64 size, tmp;
tmp = (preferred + chunk_size - 1);
do_div(tmp, chunk_size);
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
u64 min = (u64)dev->ctrl.hmmin * 4096;
u32 enable_bits = NVME_HOST_MEM_ENABLE;
- int ret = 0;
+ int ret;
preferred = min(preferred, max);
if (min > max) {
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
- struct nvme_queue *adminq = dev->queues[0];
+ struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, nr_io_queues;
unsigned long size;
if (nr_io_queues == 0)
return 0;
- if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+ if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
return 0;
}
- static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
+ static void nvme_disable_io_queues(struct nvme_dev *dev)
{
- int pass;
+ int pass, queues = dev->online_queues - 1;
unsigned long timeout;
u8 opcode = nvme_admin_delete_sq;
retry:
timeout = ADMIN_TIMEOUT;
for (; i > 0; i--, sent++)
- if (nvme_delete_queue(dev->queues[i], opcode))
+ if (nvme_delete_queue(&dev->queues[i], opcode))
break;
while (sent--) {
}
/*
- * Return: error value if an error occurred setting up the queues or calling
- * Identify Device. 0 if these succeeded, even if adding some of the
- * namespaces failed. At the moment, these failures are silent. TBD which
- * failures should be reported.
+ * return error value only when tagset allocation failed
*/
static int nvme_dev_add(struct nvme_dev *dev)
{
+ int ret;
+
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
- if (blk_mq_alloc_tag_set(&dev->tagset))
- return 0;
+ ret = blk_mq_alloc_tag_set(&dev->tagset);
+ if (ret) {
+ dev_warn(dev->ctrl.device,
+ "IO queues tagset allocation failed %d\n", ret);
+ return ret;
+ }
dev->ctrl.tagset = &dev->tagset;
nvme_dbbuf_set(dev);
"set queue depth=%u\n", dev->q_depth);
}
- /*
- * CMBs can currently only exist on >=1.2 PCIe devices. We only
- * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
- * has no name we can pass NULL as final argument to
- * sysfs_add_file_to_group.
- */
-
- if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
- dev->cmb = nvme_map_cmb(dev);
- if (dev->cmb) {
- if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
- &dev_attr_cmb.attr, NULL))
- dev_warn(dev->ctrl.device,
- "failed to add sysfs attribute for CMB\n");
- }
- }
+ nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- int i, queues;
+ int i;
bool dead = true;
struct pci_dev *pdev = to_pci_dev(dev->dev);
}
nvme_stop_queues(&dev->ctrl);
- queues = dev->online_queues - 1;
- for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(dev->queues[i]);
-
- if (dead) {
- /* A device might become IO incapable very soon during
- * probe, before the admin queue is configured. Thus,
- * queue_count can be 0 here.
- */
- if (dev->ctrl.queue_count)
- nvme_suspend_queue(dev->queues[0]);
- } else {
- nvme_disable_io_queues(dev, queues);
+ if (!dead) {
+ nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
}
+ for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
+ nvme_suspend_queue(&dev->queues[i]);
+
nvme_pci_disable(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result = -ENODEV;
+ enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
goto out;
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
+ /*
+ * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the
+ * initializing procedure here.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller RECONNECTING\n");
+ goto out;
+ }
+
result = nvme_pci_enable(dev);
if (result)
goto out;
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
+ new_state = NVME_CTRL_ADMIN_ONLY;
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- nvme_dev_add(dev);
+ /* hit this only when allocate tagset fails */
+ if (nvme_dev_add(dev))
+ new_state = NVME_CTRL_ADMIN_ONLY;
nvme_unfreeze(&dev->ctrl);
}
- if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
- dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+ /*
+ * If only admin queue live, keep it to do further investigation or
+ * recovery.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller state %d\n", new_state);
goto out;
}
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
- dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
- GFP_KERNEL, node);
+
+ dev->queues = kcalloc_node(num_possible_cpus() + 1,
+ sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
if (result)
goto release_pools;
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- queue_work(nvme_wq, &dev->ctrl.reset_work);
+ nvme_reset_ctrl(&dev->ctrl);
+
return 0;
release_pools:
static void nvme_reset_done(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_reset_ctrl(&dev->ctrl);
+ nvme_reset_ctrl_sync(&dev->ctrl);
}
static void nvme_shutdown(struct pci_dev *pdev)
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
u32 num_sge;
int nents;
- bool inline_data;
struct ib_reg_wr reg_wr;
struct ib_cqe reg_cqe;
struct nvme_rdma_queue *queue;
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_start_queues(&ctrl->ctrl);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ /* state change failure should never happen */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
nvme_rdma_reconnect_or_remove(ctrl);
}
static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
{
- if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
queue_work(nvme_wq, &ctrl->err_work);
sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
- req->inline_data = true;
req->num_sge++;
return 0;
}
int count, ret;
req->num_sge = 1;
- req->inline_data = false;
refcount_set(&req->ref, 2); /* send and recv completions */
c->common.flags |= NVME_CMD_SGL_METABUF;
nvme_stop_ctrl(&ctrl->ctrl);
nvme_rdma_shutdown_ctrl(ctrl, false);
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) {
+ /* state change failure should never happen */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
ret = nvme_rdma_configure_admin_queue(ctrl, false);
if (ret)
goto out_fail;
static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma",
+ .module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
}
mutex_unlock(&nvme_rdma_ctrl_mutex);
- flush_workqueue(nvme_wq);
+ flush_workqueue(nvme_delete_wq);
}
static struct ib_client nvme_rdma_ib_client = {
struct completion unreg_done;
};
+ struct fcloop_lport_priv {
+ struct fcloop_lport *lport;
+ };
+
struct fcloop_rport {
struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport;
int status;
};
+ enum {
+ INI_IO_START = 0,
+ INI_IO_ACTIVE = 1,
+ INI_IO_ABORTED = 2,
+ INI_IO_COMPLETED = 3,
+ };
+
struct fcloop_fcpreq {
struct fcloop_tport *tport;
struct nvmefc_fcp_req *fcpreq;
spinlock_t reqlock;
u16 status;
+ u32 inistate;
bool active;
bool aborted;
- struct work_struct work;
+ struct kref ref;
+ struct work_struct fcp_rcv_work;
+ struct work_struct abort_rcv_work;
+ struct work_struct tio_done_work;
struct nvmefc_tgt_fcp_req tgt_fcp_req;
};
struct fcloop_ini_fcpreq {
struct nvmefc_fcp_req *fcpreq;
struct fcloop_fcpreq *tfcp_req;
- struct work_struct iniwork;
+ spinlock_t inilock;
};
static inline struct fcloop_lsreq *
return 0;
}
- /*
- * FCP IO operation done by initiator abort.
- * call back up initiator "done" flows.
- */
static void
- fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+ fcloop_tfcp_req_free(struct kref *ref)
{
- struct fcloop_ini_fcpreq *inireq =
- container_of(work, struct fcloop_ini_fcpreq, iniwork);
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(ref, struct fcloop_fcpreq, ref);
- inireq->fcpreq->done(inireq->fcpreq);
+ kfree(tfcp_req);
+ }
+
+ static void
+ fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
+ {
+ kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
+ }
+
+ static int
+ fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
+ {
+ return kref_get_unless_zero(&tfcp_req->ref);
+ }
+
+ static void
+ fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
+ struct fcloop_fcpreq *tfcp_req, int status)
+ {
+ struct fcloop_ini_fcpreq *inireq = NULL;
+
+ if (fcpreq) {
+ inireq = fcpreq->private;
+ spin_lock(&inireq->inilock);
+ inireq->tfcp_req = NULL;
+ spin_unlock(&inireq->inilock);
+
+ fcpreq->status = status;
+ fcpreq->done(fcpreq);
+ }
+
+ /* release original io reference on tgt struct */
+ fcloop_tfcp_req_put(tfcp_req);
+ }
+
+ static void
+ fcloop_fcp_recv_work(struct work_struct *work)
+ {
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+ struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ int ret = 0;
+ bool aborted = false;
+
+ spin_lock(&tfcp_req->reqlock);
+ switch (tfcp_req->inistate) {
+ case INI_IO_START:
+ tfcp_req->inistate = INI_IO_ACTIVE;
+ break;
+ case INI_IO_ABORTED:
+ aborted = true;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(aborted))
+ ret = -ECANCELED;
+ else
+ ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req,
+ fcpreq->cmdaddr, fcpreq->cmdlen);
+ if (ret)
+ fcloop_call_host_done(fcpreq, tfcp_req, ret);
+
+ return;
+ }
+
+ static void
+ fcloop_fcp_abort_recv_work(struct work_struct *work)
+ {
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(work, struct fcloop_fcpreq, abort_rcv_work);
+ struct nvmefc_fcp_req *fcpreq;
+ bool completed = false;
+
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ switch (tfcp_req->inistate) {
+ case INI_IO_ABORTED:
+ break;
+ case INI_IO_COMPLETED:
+ completed = true;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(completed)) {
+ /* remove reference taken in original abort downcall */
+ fcloop_tfcp_req_put(tfcp_req);
+ return;
+ }
+
+ if (tfcp_req->tport->targetport)
+ nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req);
+
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->fcpreq = NULL;
+ spin_unlock(&tfcp_req->reqlock);
+
+ fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+ /* call_host_done releases reference for abort downcall */
}
/*
fcloop_tgt_fcprqst_done_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
- container_of(work, struct fcloop_fcpreq, work);
- struct fcloop_tport *tport = tfcp_req->tport;
+ container_of(work, struct fcloop_fcpreq, tio_done_work);
struct nvmefc_fcp_req *fcpreq;
spin_lock(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq;
+ tfcp_req->inistate = INI_IO_COMPLETED;
spin_unlock(&tfcp_req->reqlock);
- if (tport->remoteport && fcpreq) {
- fcpreq->status = tfcp_req->status;
- fcpreq->done(fcpreq);
- }
-
- kfree(tfcp_req);
+ fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
}
struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
struct fcloop_fcpreq *tfcp_req;
- int ret = 0;
if (!rport->targetport)
return -ECONNREFUSED;
inireq->fcpreq = fcpreq;
inireq->tfcp_req = tfcp_req;
- INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
+ spin_lock_init(&inireq->inilock);
+
tfcp_req->fcpreq = fcpreq;
tfcp_req->tport = rport->targetport->private;
+ tfcp_req->inistate = INI_IO_START;
spin_lock_init(&tfcp_req->reqlock);
- INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+ INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
+ INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
+ INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
+ kref_init(&tfcp_req->ref);
- ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
- fcpreq->cmdaddr, fcpreq->cmdlen);
+ schedule_work(&tfcp_req->fcp_rcv_work);
- return ret;
+ return 0;
}
static void
{
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
- schedule_work(&tfcp_req->work);
+ schedule_work(&tfcp_req->tio_done_work);
}
static void
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
- struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
- struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+ struct fcloop_fcpreq *tfcp_req;
+ bool abortio = true;
+
+ spin_lock(&inireq->inilock);
+ tfcp_req = inireq->tfcp_req;
+ if (tfcp_req)
+ fcloop_tfcp_req_get(tfcp_req);
+ spin_unlock(&inireq->inilock);
if (!tfcp_req)
/* abort has already been called */
return;
- if (rport->targetport)
- nvmet_fc_rcv_fcp_abort(rport->targetport,
- &tfcp_req->tgt_fcp_req);
-
/* break initiator/target relationship for io */
spin_lock(&tfcp_req->reqlock);
- inireq->tfcp_req = NULL;
- tfcp_req->fcpreq = NULL;
+ switch (tfcp_req->inistate) {
+ case INI_IO_START:
+ case INI_IO_ACTIVE:
+ tfcp_req->inistate = INI_IO_ABORTED;
+ break;
+ case INI_IO_COMPLETED:
+ abortio = false;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
spin_unlock(&tfcp_req->reqlock);
- /* post the aborted io completion */
- fcpreq->status = -ECANCELED;
- schedule_work(&inireq->iniwork);
+ if (abortio)
+ /* leave the reference while the work item is scheduled */
+ WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+ else {
+ /*
+ * as the io has already had the done callback made,
+ * nothing more to do. So release the reference taken above
+ */
+ fcloop_tfcp_req_put(tfcp_req);
+ }
}
static void
static void
fcloop_localport_delete(struct nvme_fc_local_port *localport)
{
- struct fcloop_lport *lport = localport->private;
+ struct fcloop_lport_priv *lport_priv = localport->private;
+ struct fcloop_lport *lport = lport_priv->lport;
/* release any threads waiting for the unreg to complete */
complete(&lport->unreg_done);
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* sizes of additional private data for data structures */
- .local_priv_sz = sizeof(struct fcloop_lport),
+ .local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* optional features */
- .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
- NVMET_FCTGTFEAT_OPDONE_IN_ISR,
+ .target_features = 0,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
};
struct fcloop_ctrl_options *opts;
struct nvme_fc_local_port *localport;
struct fcloop_lport *lport;
- int ret;
+ struct fcloop_lport_priv *lport_priv;
+ unsigned long flags;
+ int ret = -ENOMEM;
+
+ lport = kzalloc(sizeof(*lport), GFP_KERNEL);
+ if (!lport)
+ return -ENOMEM;
opts = kzalloc(sizeof(*opts), GFP_KERNEL);
if (!opts)
- return -ENOMEM;
+ goto out_free_lport;
ret = fcloop_parse_options(opts, buf);
if (ret)
ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
if (!ret) {
- unsigned long flags;
-
/* success */
- lport = localport->private;
+ lport_priv = localport->private;
+ lport_priv->lport = lport;
+
lport->localport = localport;
INIT_LIST_HEAD(&lport->lport_list);
spin_lock_irqsave(&fcloop_lock, flags);
list_add_tail(&lport->lport_list, &fcloop_lports);
spin_unlock_irqrestore(&fcloop_lock, flags);
-
- /* mark all of the input buffer consumed */
- ret = count;
}
out_free_opts:
kfree(opts);
+ out_free_lport:
+ /* free only if we're going to fail */
+ if (ret)
+ kfree(lport);
+
return ret ? ret : count;
}
wait_for_completion(&lport->unreg_done);
+ kfree(lport);
+
return ret;
}
const char *buf, size_t count)
{
struct fcloop_nport *nport = NULL, *tmpport;
- struct fcloop_tport *tport;
+ struct fcloop_tport *tport = NULL;
u64 nodename, portname;
unsigned long flags;
int ret;
bv->bv_len = iter.bi_bvec_done;
}
+ static inline unsigned bio_pages_all(struct bio *bio)
+ {
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+ return bio->bi_vcnt;
+ }
+
+ static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
+ {
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+ return bio->bi_io_vec;
+ }
+
+ static inline struct page *bio_first_page_all(struct bio *bio)
+ {
+ return bio_first_bvec_all(bio)->bv_page;
+ }
+
+ static inline struct bio_vec *bio_last_bvec_all(struct bio *bio)
+ {
+ WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
+ return &bio->bi_io_vec[bio->bi_vcnt - 1];
+ }
+
enum bip_flags {
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
#endif
extern void bio_copy_data(struct bio *dst, struct bio *src);
- extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
extern void bio_free_pages(struct bio *bio);
extern struct bio *bio_copy_user_iov(struct request_queue *,
#define bio_set_dev(bio, bdev) \
do { \
+ if ((bio)->bi_disk != (bdev)->bd_disk) \
+ bio_clear_flag(bio, BIO_THROTTLED);\
(bio)->bi_disk = (bdev)->bd_disk; \
(bio)->bi_partno = (bdev)->bd_partno; \
} while (0)
#define BLK_STS_AGAIN ((__force blk_status_t)12)
+ /**
+ * blk_path_error - returns true if error may be path related
+ * @error: status the request was completed with
+ *
+ * Description:
+ * This classifies block error status into non-retryable errors and ones
+ * that may be successful if retried on a failover path.
+ *
+ * Return:
+ * %false - retrying failover path will not help
+ * %true - may succeed if retried
+ */
+ static inline bool blk_path_error(blk_status_t error)
+ {
+ switch (error) {
+ case BLK_STS_NOTSUPP:
+ case BLK_STS_NOSPC:
+ case BLK_STS_TARGET:
+ case BLK_STS_NEXUS:
+ case BLK_STS_MEDIUM:
+ case BLK_STS_PROTECTION:
+ return false;
+ }
+
+ /* Anything else could be a path failure, so should be retried */
+ return true;
+ }
+
struct blk_issue_stat {
u64 stat;
};
struct bio {
struct bio *bi_next; /* request queue link */
struct gendisk *bi_disk;
- u8 bi_partno;
- blk_status_t bi_status;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.
unsigned short bi_flags; /* status, etc and bvec pool number */
unsigned short bi_ioprio;
unsigned short bi_write_hint;
-
- struct bvec_iter bi_iter;
+ blk_status_t bi_status;
+ u8 bi_partno;
/* Number of segments in this BIO after
* physical address coalescing is performed.
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
- atomic_t __bi_remaining;
+ struct bvec_iter bi_iter;
+ atomic_t __bi_remaining;
bio_end_io_t *bi_end_io;
void *bi_private;
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>
+ #include <linux/seqlock.h>
+ #include <linux/u64_stats_sync.h>
struct module;
struct scsi_ioctl_command;
/* Look at ->special_vec for the actual data payload instead of the
bio chain. */
#define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18))
+ /* The per-zone write lock is held for this request */
+ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19))
+ /* timeout is expired */
+ #define RQF_MQ_TIMEOUT_EXPIRED ((__force req_flags_t)(1 << 20))
+ /* already slept for hybrid poll */
+ #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 21))
/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
* especially blk_mq_rq_ctx_init() to take care of the added fields.
*/
struct request {
- struct list_head queuelist;
- union {
- struct __call_single_data csd;
- u64 fifo_time;
- };
-
struct request_queue *q;
struct blk_mq_ctx *mq_ctx;
int internal_tag;
- unsigned long atomic_flags;
-
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
int tag;
struct bio *bio;
struct bio *biotail;
+ struct list_head queuelist;
+
/*
* The hash is used inside the scheduler, and killed once the
* request reaches the dispatch list. The ipi_list is only used
struct hd_struct *part;
unsigned long start_time;
struct blk_issue_stat issue_stat;
- #ifdef CONFIG_BLK_CGROUP
- struct request_list *rl; /* rl this rq is alloced from */
- unsigned long long start_time_ns;
- unsigned long long io_start_time_ns; /* when passed to hardware */
- #endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments;
+
#if defined(CONFIG_BLK_DEV_INTEGRITY)
unsigned short nr_integrity_segments;
#endif
+ unsigned short write_hint;
unsigned short ioprio;
unsigned int timeout;
unsigned int extra_len; /* length of alignment and padding */
- unsigned short write_hint;
+ /*
+ * On blk-mq, the lower bits of ->gstate (generation number and
+ * state) carry the MQ_RQ_* state value and the upper bits the
+ * generation number which is monotonically incremented and used to
+ * distinguish the reuse instances.
+ *
+ * ->gstate_seq allows updates to ->gstate and other fields
+ * (currently ->deadline) during request start to be read
+ * atomically from the timeout path, so that it can operate on a
+ * coherent set of information.
+ */
+ seqcount_t gstate_seq;
+ u64 gstate;
+
+ /*
+ * ->aborted_gstate is used by the timeout to claim a specific
+ * recycle instance of this request. See blk_mq_timeout_work().
+ */
+ struct u64_stats_sync aborted_gstate_sync;
+ u64 aborted_gstate;
+
+ /* access through blk_rq_set_deadline, blk_rq_deadline */
+ unsigned long __deadline;
- unsigned long deadline;
struct list_head timeout_list;
- call_single_data_t csd;
+ union {
++ struct __call_single_data csd;
+ u64 fifo_time;
+ };
+
/*
* completion callback.
*/
/* for bidi */
struct request *next_rq;
+
+ #ifdef CONFIG_BLK_CGROUP
+ struct request_list *rl; /* rl this rq is alloced from */
+ unsigned long long start_time_ns;
+ unsigned long long io_start_time_ns; /* when passed to hardware */
+ #endif
};
+static inline bool blk_op_is_scsi(unsigned int op)
+{
+ return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT;
+}
+
+static inline bool blk_op_is_private(unsigned int op)
+{
+ return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
+}
+
static inline bool blk_rq_is_scsi(struct request *rq)
{
- return req_op(rq) == REQ_OP_SCSI_IN || req_op(rq) == REQ_OP_SCSI_OUT;
+ return blk_op_is_scsi(req_op(rq));
}
static inline bool blk_rq_is_private(struct request *rq)
{
- return req_op(rq) == REQ_OP_DRV_IN || req_op(rq) == REQ_OP_DRV_OUT;
+ return blk_op_is_private(req_op(rq));
}
static inline bool blk_rq_is_passthrough(struct request *rq)
return blk_rq_is_scsi(rq) || blk_rq_is_private(rq);
}
+static inline bool bio_is_passthrough(struct bio *bio)
+{
+ unsigned op = bio_op(bio);
+
+ return blk_op_is_scsi(op) || blk_op_is_private(op);
+}
+
static inline unsigned short req_get_ioprio(struct request *req)
{
return req->ioprio;
struct queue_limits limits;
+ /*
+ * Zoned block device information for request dispatch control.
+ * nr_zones is the total number of zones of the device. This is always
+ * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
+ * bits which indicates if a zone is conventional (bit clear) or
+ * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+ * bits which indicates if a zone is write locked, that is, if a write
+ * request targeting the zone was dispatched. All three fields are
+ * initialized by the low level device driver (e.g. scsi/sd.c).
+ * Stacking drivers (device mappers) may or may not initialize
+ * these fields.
+ */
+ unsigned int nr_zones;
+ unsigned long *seq_zones_bitmap;
+ unsigned long *seq_zones_wlock;
+
/*
* sg stuff
*/
return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
}
+ static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
+ {
+ return q->nr_zones;
+ }
+
+ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
+ sector_t sector)
+ {
+ if (!blk_queue_is_zoned(q))
+ return 0;
+ return sector >> ilog2(q->limits.chunk_sectors);
+ }
+
+ static inline bool blk_queue_zone_is_seq(struct request_queue *q,
+ sector_t sector)
+ {
+ if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+ return false;
+ return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+ }
+
static inline bool rq_is_sync(struct request *rq)
{
return op_is_sync(rq->cmd_flags);
extern void blk_rq_unprep_clone(struct request *rq);
extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
struct request *rq);
-extern int blk_rq_append_bio(struct request *rq, struct bio *bio);
+extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
extern void blk_delay_queue(struct request_queue *, unsigned long);
extern void blk_queue_split(struct request_queue *, struct bio **);
extern void blk_recount_segments(struct request_queue *, struct bio *);
return blk_rq_cur_bytes(rq) >> 9;
}
+ static inline unsigned int blk_rq_zone_no(struct request *rq)
+ {
+ return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
+ }
+
+ static inline unsigned int blk_rq_zone_is_seq(struct request *rq)
+ {
+ return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq));
+ }
+
/*
* Some commands like WRITE SAME have a payload or data transfer size which
* is different from the size of the request. Any driver that supports such
if (q)
return blk_queue_zone_sectors(q);
+ return 0;
+ }
+
+ static inline unsigned int bdev_nr_zones(struct block_device *bdev)
+ {
+ struct request_queue *q = bdev_get_queue(bdev);
+ if (q)
+ return blk_queue_nr_zones(q);
return 0;
}
int kblockd_schedule_work(struct work_struct *work);
int kblockd_schedule_work_on(int cpu, struct work_struct *work);
- int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay);
- int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay);
#ifdef CONFIG_BLK_CGROUP
extern int bdev_read_page(struct block_device *, sector_t, struct page *);
extern int bdev_write_page(struct block_device *, sector_t, struct page *,
struct writeback_control *);
+
+ #ifdef CONFIG_BLK_DEV_ZONED
+ bool blk_req_needs_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_lock(struct request *rq);
+ void __blk_req_zone_write_unlock(struct request *rq);
+
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+ if (blk_req_needs_zone_write_lock(rq))
+ __blk_req_zone_write_lock(rq);
+ }
+
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+ if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED)
+ __blk_req_zone_write_unlock(rq);
+ }
+
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+ return rq->q->seq_zones_wlock &&
+ test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock);
+ }
+
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+ if (!blk_req_needs_zone_write_lock(rq))
+ return true;
+ return !blk_req_zone_is_write_locked(rq);
+ }
+ #else
+ static inline bool blk_req_needs_zone_write_lock(struct request *rq)
+ {
+ return false;
+ }
+
+ static inline void blk_req_zone_write_lock(struct request *rq)
+ {
+ }
+
+ static inline void blk_req_zone_write_unlock(struct request *rq)
+ {
+ }
+ static inline bool blk_req_zone_is_write_locked(struct request *rq)
+ {
+ return false;
+ }
+
+ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
+ {
+ return true;
+ }
+ #endif /* CONFIG_BLK_DEV_ZONED */
+
#else /* CONFIG_BLOCK */
struct block_device;
static void hib_end_io(struct bio *bio)
{
struct hib_bio_batch *hb = bio->bi_private;
- struct page *page = bio->bi_io_vec[0].bv_page;
+ struct page *page = bio_first_page_all(bio);
if (bio->bi_status) {
pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
* space avaiable from the resume partition.
*/
-static int enough_swap(unsigned int nr_pages, unsigned int flags)
+static int enough_swap(unsigned int nr_pages)
{
unsigned int free_swap = count_swap_pages(root_swap, 1);
unsigned int required;
return error;
}
if (flags & SF_NOCOMPRESS_MODE) {
- if (!enough_swap(pages, flags)) {
+ if (!enough_swap(pages)) {
pr_err("Not enough free swap\n");
error = -ENOSPC;
goto out_finish;