From: Linus Torvalds Date: Sun, 8 Sep 2013 03:19:02 +0000 (-0700) Subject: Merge git://git.infradead.org/users/willy/linux-nvme X-Git-Tag: v3.12-rc1~98 X-Git-Url: https://asedeno.scripts.mit.edu/gitweb/?a=commitdiff_plain;h=b409624ad5a99c2e84df6657bd0f7931ac470d2d;hp=c4c17252283a13c0d63a8d9df828da109c116411;p=linux.git Merge git://git.infradead.org/users/willy/linux-nvme Pull NVM Express driver update from Matthew Wilcox. * git://git.infradead.org/users/willy/linux-nvme: NVMe: Merge issue on character device bring-up NVMe: Handle ioremap failure NVMe: Add pci suspend/resume driver callbacks NVMe: Use normal shutdown NVMe: Separate controller init from disk discovery NVMe: Separate queue alloc/free from create/delete NVMe: Group pci related actions in functions NVMe: Disk stats for read/write commands only NVMe: Bring up cdev on set feature failure NVMe: Fix checkpatch issues NVMe: Namespace IDs are unsigned NVMe: Update nvme_id_power_state with latest spec NVMe: Split header file into user-visible and kernel-visible pieces NVMe: Call nvme_process_cq from submission path NVMe: Remove "process_cq did something" message NVMe: Return correct value from interrupt handler NVMe: Disk IO statistics NVMe: Restructure MSI / MSI-X setup NVMe: Use kzalloc instead of kmalloc+memset --- diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index ce79a590b45b..da52092980e2 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +80,9 @@ struct nvme_queue { u16 sq_head; u16 sq_tail; u16 cq_head; - u16 cq_phase; + u8 cq_phase; + u8 cqe_seen; + u8 q_suspended; unsigned long cmdid_data[]; }; @@ -115,6 +118,11 @@ static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; } +static unsigned nvme_queue_extra(int depth) +{ + return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); +} + /** * alloc_cmdid() - Allocate a Command ID * @nvmeq: The queue that will be used for this command @@ -285,6 +293,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->npages = -1; iod->length = nbytes; iod->nents = 0; + iod->start_time = jiffies; } return iod; @@ -308,6 +317,30 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } +static void nvme_start_io_acct(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); +} + +static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); +} + static void bio_completion(struct nvme_dev *dev, void *ctx, struct nvme_completion *cqe) { @@ -315,9 +348,11 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, struct bio *bio = iod->private; u16 status = le16_to_cpup(&cqe->status) >> 1; - if (iod->nents) + if (iod->nents) { dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + nvme_end_io_acct(bio, iod->start_time); + } nvme_free_iod(dev, iod); if (status) bio_endio(bio, -EIO); @@ -422,10 +457,8 @@ static void nvme_bio_pair_endio(struct bio *bio, int err) if (atomic_dec_and_test(&bp->cnt)) { bio_endio(bp->parent, bp->err); - if (bp->bv1) - kfree(bp->bv1); - if (bp->bv2) - kfree(bp->bv2); + kfree(bp->bv1); + kfree(bp->bv2); kfree(bp); } } @@ -695,6 +728,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + nvme_start_io_acct(bio); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); @@ -709,26 +743,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, return result; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) -{ - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - spin_lock_irq(&nvmeq->q_lock); - if (bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (bio_list_empty(&nvmeq->sq_cong)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); -} - -static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) +static int nvme_process_cq(struct nvme_queue *nvmeq) { u16 head, phase; @@ -758,13 +773,40 @@ static irqreturn_t nvme_process_cq(struct nvme_queue *nvmeq) * a big problem. */ if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) - return IRQ_NONE; + return 0; writel(head, nvmeq->q_db + (1 << nvmeq->dev->db_stride)); nvmeq->cq_head = head; nvmeq->cq_phase = phase; - return IRQ_HANDLED; + nvmeq->cqe_seen = 1; + return 1; +} + +static void nvme_make_request(struct request_queue *q, struct bio *bio) +{ + struct nvme_ns *ns = q->queuedata; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + int result = -EBUSY; + + if (!nvmeq) { + put_nvmeq(NULL); + bio_endio(bio, -EIO); + return; + } + + spin_lock_irq(&nvmeq->q_lock); + if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) + result = nvme_submit_bio_queue(nvmeq, ns, bio); + if (unlikely(result)) { + if (bio_list_empty(&nvmeq->sq_cong)) + add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); + bio_list_add(&nvmeq->sq_cong, bio); + } + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + put_nvmeq(nvmeq); } static irqreturn_t nvme_irq(int irq, void *data) @@ -772,7 +814,9 @@ static irqreturn_t nvme_irq(int irq, void *data) irqreturn_t result; struct nvme_queue *nvmeq = data; spin_lock(&nvmeq->q_lock); - result = nvme_process_cq(nvmeq); + nvme_process_cq(nvmeq); + result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; + nvmeq->cqe_seen = 0; spin_unlock(&nvmeq->q_lock); return result; } @@ -986,8 +1030,15 @@ static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) } } -static void nvme_free_queue_mem(struct nvme_queue *nvmeq) +static void nvme_free_queue(struct nvme_queue *nvmeq) { + spin_lock_irq(&nvmeq->q_lock); + while (bio_list_peek(&nvmeq->sq_cong)) { + struct bio *bio = bio_list_pop(&nvmeq->sq_cong); + bio_endio(bio, -EIO); + } + spin_unlock_irq(&nvmeq->q_lock); + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), @@ -995,17 +1046,28 @@ static void nvme_free_queue_mem(struct nvme_queue *nvmeq) kfree(nvmeq); } -static void nvme_free_queue(struct nvme_dev *dev, int qid) +static void nvme_free_queues(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) { + nvme_free_queue(dev->queues[i]); + dev->queue_count--; + dev->queues[i] = NULL; + } +} + +static void nvme_disable_queue(struct nvme_dev *dev, int qid) { struct nvme_queue *nvmeq = dev->queues[qid]; int vector = dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - nvme_cancel_ios(nvmeq, false); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); + if (nvmeq->q_suspended) { + spin_unlock_irq(&nvmeq->q_lock); + return; } + nvmeq->q_suspended = 1; spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1017,15 +1079,17 @@ static void nvme_free_queue(struct nvme_dev *dev, int qid) adapter_delete_cq(dev, qid); } - nvme_free_queue_mem(nvmeq); + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + nvme_cancel_ios(nvmeq, false); + spin_unlock_irq(&nvmeq->q_lock); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = DIV_ROUND_UP(depth, 8) + (depth * - sizeof(struct nvme_cmd_info)); + unsigned extra = nvme_queue_extra(depth); struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); if (!nvmeq) return NULL; @@ -1052,6 +1116,8 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; + nvmeq->q_suspended = 1; + dev->queue_count++; return nvmeq; @@ -1075,18 +1141,29 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, IRQF_DISABLED | IRQF_SHARED, name, nvmeq); } -static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, - int cq_size, int vector) +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { - int result; - struct nvme_queue *nvmeq = nvme_alloc_queue(dev, qid, cq_size, vector); + struct nvme_dev *dev = nvmeq->dev; + unsigned extra = nvme_queue_extra(nvmeq->q_depth); - if (!nvmeq) - return ERR_PTR(-ENOMEM); + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid << (dev->db_stride + 1)]; + memset(nvmeq->cmdid_data, 0, extra); + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + nvme_cancel_ios(nvmeq, false); + nvmeq->q_suspended = 0; +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) - goto free_nvmeq; + return result; result = adapter_alloc_sq(dev, qid, nvmeq); if (result < 0) @@ -1096,19 +1173,17 @@ static struct nvme_queue *nvme_create_queue(struct nvme_dev *dev, int qid, if (result < 0) goto release_sq; - return nvmeq; + spin_lock(&nvmeq->q_lock); + nvme_init_queue(nvmeq, qid); + spin_unlock(&nvmeq->q_lock); + + return result; release_sq: adapter_delete_sq(dev, qid); release_cq: adapter_delete_cq(dev, qid); - free_nvmeq: - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); - return ERR_PTR(result); + return result; } static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) @@ -1152,6 +1227,30 @@ static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) return nvme_wait_ready(dev, cap, true); } +static int nvme_shutdown_ctrl(struct nvme_dev *dev) +{ + unsigned long timeout; + u32 cc; + + cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; + writel(cc, &dev->bar->cc); + + timeout = 2 * HZ + jiffies; + while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != + NVME_CSTS_SHST_CMPLT) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return 0; +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1159,16 +1258,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; - dev->dbs = ((void __iomem *)dev->bar) + 4096; - dev->db_stride = NVME_CAP_STRIDE(cap); - result = nvme_disable_ctrl(dev, cap); if (result < 0) return result; - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); - if (!nvmeq) - return -ENOMEM; + nvmeq = dev->queues[0]; + if (!nvmeq) { + nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + if (!nvmeq) + return -ENOMEM; + dev->queues[0] = nvmeq; + } aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1185,17 +1285,15 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) result = nvme_enable_ctrl(dev, cap); if (result) - goto free_q; + return result; result = queue_request_irq(dev, nvmeq, "nvme admin"); if (result) - goto free_q; - - dev->queues[0] = nvmeq; - return result; + return result; - free_q: - nvme_free_queue_mem(nvmeq); + spin_lock(&nvmeq->q_lock); + nvme_init_queue(nvmeq, 0); + spin_unlock(&nvmeq->q_lock); return result; } @@ -1314,7 +1412,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.appmask = cpu_to_le16(io.appmask); if (meta_len) { - meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, meta_len); + meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata, + meta_len); if (IS_ERR(meta_iod)) { status = PTR_ERR(meta_iod); meta_iod = NULL; @@ -1356,6 +1455,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) put_nvmeq(nvmeq); if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; + else if (!nvmeq || nvmeq->q_suspended) + status = -EBUSY; else status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT); @@ -1453,6 +1554,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, switch (cmd) { case NVME_IOCTL_ID: + force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: return nvme_user_admin_cmd(ns->dev, (void __user *)arg); @@ -1506,10 +1608,12 @@ static int nvme_kthread(void *data) if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvme_process_cq(nvmeq)) - printk("process_cq did something\n"); + if (nvmeq->q_suspended) + goto unlock; + nvme_process_cq(nvmeq); nvme_cancel_ios(nvmeq, true); nvme_resubmit_bios(nvmeq); + unlock: spin_unlock_irq(&nvmeq->q_lock); } } @@ -1556,7 +1660,7 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, +static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { struct nvme_ns *ns; @@ -1631,14 +1735,19 @@ static int set_queue_count(struct nvme_dev *dev, int count) status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, &result); if (status) - return -EIO; + return status < 0 ? -EIO : -EBUSY; return min(result & 0xffff, result >> 16) + 1; } +static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); +} + static int nvme_setup_io_queues(struct nvme_dev *dev) { struct pci_dev *pdev = dev->pci_dev; - int result, cpu, i, nr_io_queues, db_bar_size, q_depth, q_count; + int result, cpu, i, vecs, nr_io_queues, size, q_depth; nr_io_queues = num_online_cpus(); result = set_queue_count(dev, nr_io_queues); @@ -1647,53 +1756,80 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (result < nr_io_queues) nr_io_queues = result; - q_count = nr_io_queues; - /* Deregister the admin queue's interrupt */ - free_irq(dev->entry[0].vector, dev->queues[0]); - - db_bar_size = 4096 + ((nr_io_queues + 1) << (dev->db_stride + 3)); - if (db_bar_size > 8192) { + size = db_bar_size(dev, nr_io_queues); + if (size > 8192) { iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(pdev, 0), db_bar_size); + do { + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (dev->bar) + break; + if (!--nr_io_queues) + return -ENOMEM; + size = db_bar_size(dev, nr_io_queues); + } while (1); dev->dbs = ((void __iomem *)dev->bar) + 4096; dev->queues[0]->q_db = dev->dbs; } - for (i = 0; i < nr_io_queues; i++) + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, dev->queues[0]); + + vecs = nr_io_queues; + for (i = 0; i < vecs; i++) dev->entry[i].entry = i; for (;;) { - result = pci_enable_msix(pdev, dev->entry, nr_io_queues); - if (result == 0) { - break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 0; + result = pci_enable_msix(pdev, dev->entry, vecs); + if (result <= 0) break; - } + vecs = result; } - if (nr_io_queues == 0) { - nr_io_queues = q_count; + if (result < 0) { + vecs = nr_io_queues; + if (vecs > 32) + vecs = 32; for (;;) { - result = pci_enable_msi_block(pdev, nr_io_queues); + result = pci_enable_msi_block(pdev, vecs); if (result == 0) { - for (i = 0; i < nr_io_queues; i++) + for (i = 0; i < vecs; i++) dev->entry[i].vector = i + pdev->irq; break; - } else if (result > 0) { - nr_io_queues = result; - continue; - } else { - nr_io_queues = 1; + } else if (result < 0) { + vecs = 1; break; } + vecs = result; } } + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + nr_io_queues = vecs; + result = queue_request_irq(dev, dev->queues[0], "nvme admin"); - /* XXX: handle failure here */ + if (result) { + dev->queues[0]->q_suspended = 1; + goto free_queues; + } + + /* Free previously allocated queues that are no longer usable */ + spin_lock(&dev_list_lock); + for (i = dev->queue_count - 1; i > nr_io_queues; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + spin_lock(&nvmeq->q_lock); + nvme_cancel_ios(nvmeq, false); + spin_unlock(&nvmeq->q_lock); + + nvme_free_queue(nvmeq); + dev->queue_count--; + dev->queues[i] = NULL; + } + spin_unlock(&dev_list_lock); cpu = cpumask_first(cpu_online_mask); for (i = 0; i < nr_io_queues; i++) { @@ -1703,11 +1839,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1, NVME_Q_DEPTH); - for (i = 0; i < nr_io_queues; i++) { - dev->queues[i + 1] = nvme_create_queue(dev, i + 1, q_depth, i); - if (IS_ERR(dev->queues[i + 1])) - return PTR_ERR(dev->queues[i + 1]); - dev->queue_count++; + for (i = dev->queue_count - 1; i < nr_io_queues; i++) { + dev->queues[i + 1] = nvme_alloc_queue(dev, i + 1, q_depth, i); + if (!dev->queues[i + 1]) { + result = -ENOMEM; + goto free_queues; + } } for (; i < num_possible_cpus(); i++) { @@ -1715,15 +1852,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->queues[i + 1] = dev->queues[target + 1]; } - return 0; -} + for (i = 1; i < dev->queue_count; i++) { + result = nvme_create_queue(dev->queues[i], i); + if (result) { + for (--i; i > 0; i--) + nvme_disable_queue(dev, i); + goto free_queues; + } + } -static void nvme_free_queues(struct nvme_dev *dev) -{ - int i; + return 0; - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_free_queue(dev, i); + free_queues: + nvme_free_queues(dev); + return result; } /* @@ -1734,7 +1876,8 @@ static void nvme_free_queues(struct nvme_dev *dev) */ static int nvme_dev_add(struct nvme_dev *dev) { - int res, nn, i; + int res; + unsigned nn, i; struct nvme_ns *ns; struct nvme_id_ctrl *ctrl; struct nvme_id_ns *id_ns; @@ -1742,10 +1885,6 @@ static int nvme_dev_add(struct nvme_dev *dev) dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - res = nvme_setup_io_queues(dev); - if (res) - return res; - mem = dma_alloc_coherent(&dev->pci_dev->dev, 8192, &dma_addr, GFP_KERNEL); if (!mem) @@ -1796,23 +1935,86 @@ static int nvme_dev_add(struct nvme_dev *dev) return res; } -static int nvme_dev_remove(struct nvme_dev *dev) +static int nvme_dev_map(struct nvme_dev *dev) { - struct nvme_ns *ns, *next; + int bars, result = -ENOMEM; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_enable_device_mem(pdev)) + return result; + + dev->entry[0].vector = pdev->irq; + pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable_pci; + + if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); + else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) + dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); + else + goto disable_pci; + + pci_set_drvdata(pdev, dev); + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto disable; + + dev->db_stride = NVME_CAP_STRIDE(readq(&dev->bar->cap)); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + return 0; + + disable: + pci_release_regions(pdev); + disable_pci: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); + + if (dev->bar) { + iounmap(dev->bar); + dev->bar = NULL; + } + + pci_release_regions(dev->pci_dev); + if (pci_is_enabled(dev->pci_dev)) + pci_disable_device(dev->pci_dev); +} + +static void nvme_dev_shutdown(struct nvme_dev *dev) +{ + int i; + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_disable_queue(dev, i); spin_lock(&dev_list_lock); - list_del(&dev->node); + list_del_init(&dev->node); spin_unlock(&dev_list_lock); + if (dev->bar) + nvme_shutdown_ctrl(dev); + nvme_dev_unmap(dev); +} + +static void nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); del_gendisk(ns->disk); nvme_ns_free(ns); } - - nvme_free_queues(dev); - - return 0; } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -1872,15 +2074,10 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); nvme_dev_remove(dev); - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); - iounmap(dev->bar); + nvme_dev_shutdown(dev); + nvme_free_queues(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); - pci_disable_device(dev->pci_dev); - pci_release_regions(dev->pci_dev); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -1921,9 +2118,40 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static int nvme_dev_start(struct nvme_dev *dev) +{ + int result; + + result = nvme_dev_map(dev); + if (result) + return result; + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + + spin_lock(&dev_list_lock); + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + result = nvme_setup_io_queues(dev); + if (result && result != -EBUSY) + goto disable; + + return result; + + disable: + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + spin_unlock(&dev_list_lock); + unmap: + nvme_dev_unmap(dev); + return result; +} + static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int bars, result = -ENOMEM; + int result = -ENOMEM; struct nvme_dev *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -1938,53 +2166,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - if (pci_enable_device_mem(pdev)) - goto free; - pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable; - INIT_LIST_HEAD(&dev->namespaces); dev->pci_dev = pdev; - pci_set_drvdata(pdev, dev); - - if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(64))) - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64)); - else if (!dma_set_mask(&pdev->dev, DMA_BIT_MASK(32))) - dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(32)); - else - goto disable; - result = nvme_set_instance(dev); if (result) - goto disable; - - dev->entry[0].vector = pdev->irq; + goto free; result = nvme_setup_prp_pools(dev); if (result) - goto disable_msix; + goto release; - dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); - if (!dev->bar) { - result = -ENOMEM; - goto disable_msix; + result = nvme_dev_start(dev); + if (result) { + if (result == -EBUSY) + goto create_cdev; + goto release_pools; } - result = nvme_configure_admin_queue(dev); - if (result) - goto unmap; - dev->queue_count++; - - spin_lock(&dev_list_lock); - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - result = nvme_dev_add(dev); if (result) - goto delete; + goto shutdown; + create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev; @@ -1999,24 +2202,13 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) remove: nvme_dev_remove(dev); - delete: - spin_lock(&dev_list_lock); - list_del(&dev->node); - spin_unlock(&dev_list_lock); - + shutdown: + nvme_dev_shutdown(dev); + release_pools: nvme_free_queues(dev); - unmap: - iounmap(dev->bar); - disable_msix: - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); - nvme_release_instance(dev); nvme_release_prp_pools(dev); - disable: - pci_disable_device(pdev); - pci_release_regions(pdev); + release: + nvme_release_instance(dev); free: kfree(dev->queues); kfree(dev->entry); @@ -2037,8 +2229,30 @@ static void nvme_remove(struct pci_dev *pdev) #define nvme_link_reset NULL #define nvme_slot_reset NULL #define nvme_error_resume NULL -#define nvme_suspend NULL -#define nvme_resume NULL + +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_dev_shutdown(ndev); + return 0; +} + +static int nvme_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + int ret; + + ret = nvme_dev_start(ndev); + /* XXX: should remove gendisks if resume fails */ + if (ret) + nvme_free_queues(ndev); + return ret; +} + +static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, @@ -2062,8 +2276,9 @@ static struct pci_driver nvme_driver = { .id_table = nvme_id_table, .probe = nvme_probe, .remove = nvme_remove, - .suspend = nvme_suspend, - .resume = nvme_resume, + .driver = { + .pm = &nvme_dev_pm_ops, + }, .err_handler = &nvme_err_handler, }; diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 102de2f52b5c..4a4ff4eb8e23 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -933,13 +933,12 @@ static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res = SNTI_TRANSLATION_SUCCESS; int xfer_len; - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); if (inq_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ inq_response[2] = 0x00; /* Page Length MSB */ inq_response[3] = 0x3C; /* Page Length LSB */ @@ -964,12 +963,11 @@ static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, int xfer_len; u8 *log_response; - log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; /* Subpage=0x00, Page Length MSB=0 */ @@ -1000,12 +998,11 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, u8 temp_c; u16 temp_k; - log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), @@ -1069,12 +1066,11 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 temp_c_cur, temp_c_thresh; u16 temp_k; - log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); if (log_response == NULL) { res = -ENOMEM; goto out_mem; } - memset(log_response, 0, LOG_TEMP_PAGE_LENGTH); mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), @@ -1380,12 +1376,11 @@ static int nvme_trans_mode_page_create(struct nvme_ns *ns, blk_desc_offset = mph_size; mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_mem; } - memset(response, 0, resp_size); res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, llbaa, mode_data_length, blk_desc_len); @@ -2480,12 +2475,11 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, } id_ns = mem; - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_dma; } - memset(response, 0, resp_size); nvme_trans_fill_read_cap(response, id_ns, cdb16); xfer_len = min(alloc_len, resp_size); @@ -2554,12 +2548,11 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, goto out_dma; } - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out_dma; } - memset(response, 0, resp_size); /* The first LUN ID will always be 0 per the SAM spec */ for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { @@ -2600,12 +2593,11 @@ static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : (FIXED_FMT_SENSE_DATA_SIZE)); - response = kmalloc(resp_size, GFP_KERNEL); + response = kzalloc(resp_size, GFP_KERNEL); if (response == NULL) { res = -ENOMEM; goto out; } - memset(response, 0, resp_size); if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { /* Descriptor Format Sense Data */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index f451c8d6e231..26ebcf41c213 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,6 +1,6 @@ /* * Definitions for the NVM Express interface - * Copyright (c) 2011, Intel Corporation. + * Copyright (c) 2011-2013, Intel Corporation. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -19,7 +19,10 @@ #ifndef _LINUX_NVME_H #define _LINUX_NVME_H -#include +#include +#include +#include +#include struct nvme_bar { __u64 cap; /* Controller Capabilities */ @@ -50,6 +53,7 @@ enum { NVME_CC_SHN_NONE = 0 << 14, NVME_CC_SHN_NORMAL = 1 << 14, NVME_CC_SHN_ABRUPT = 2 << 14, + NVME_CC_SHN_MASK = 3 << 14, NVME_CC_IOSQES = 6 << 16, NVME_CC_IOCQES = 4 << 20, NVME_CSTS_RDY = 1 << 0, @@ -57,462 +61,11 @@ enum { NVME_CSTS_SHST_NORMAL = 0 << 2, NVME_CSTS_SHST_OCCUR = 1 << 2, NVME_CSTS_SHST_CMPLT = 2 << 2, -}; - -struct nvme_id_power_state { - __le16 max_power; /* centiwatts */ - __u16 rsvd2; - __le32 entry_lat; /* microseconds */ - __le32 exit_lat; /* microseconds */ - __u8 read_tput; - __u8 read_lat; - __u8 write_tput; - __u8 write_lat; - __u8 rsvd16[16]; + NVME_CSTS_SHST_MASK = 3 << 2, }; #define NVME_VS(major, minor) (major << 16 | minor) -struct nvme_id_ctrl { - __le16 vid; - __le16 ssvid; - char sn[20]; - char mn[40]; - char fr[8]; - __u8 rab; - __u8 ieee[3]; - __u8 mic; - __u8 mdts; - __u8 rsvd78[178]; - __le16 oacs; - __u8 acl; - __u8 aerl; - __u8 frmw; - __u8 lpa; - __u8 elpe; - __u8 npss; - __u8 rsvd264[248]; - __u8 sqes; - __u8 cqes; - __u8 rsvd514[2]; - __le32 nn; - __le16 oncs; - __le16 fuses; - __u8 fna; - __u8 vwc; - __le16 awun; - __le16 awupf; - __u8 rsvd530[1518]; - struct nvme_id_power_state psd[32]; - __u8 vs[1024]; -}; - -enum { - NVME_CTRL_ONCS_COMPARE = 1 << 0, - NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, - NVME_CTRL_ONCS_DSM = 1 << 2, -}; - -struct nvme_lbaf { - __le16 ms; - __u8 ds; - __u8 rp; -}; - -struct nvme_id_ns { - __le64 nsze; - __le64 ncap; - __le64 nuse; - __u8 nsfeat; - __u8 nlbaf; - __u8 flbas; - __u8 mc; - __u8 dpc; - __u8 dps; - __u8 rsvd30[98]; - struct nvme_lbaf lbaf[16]; - __u8 rsvd192[192]; - __u8 vs[3712]; -}; - -enum { - NVME_NS_FEAT_THIN = 1 << 0, - NVME_LBAF_RP_BEST = 0, - NVME_LBAF_RP_BETTER = 1, - NVME_LBAF_RP_GOOD = 2, - NVME_LBAF_RP_DEGRADED = 3, -}; - -struct nvme_smart_log { - __u8 critical_warning; - __u8 temperature[2]; - __u8 avail_spare; - __u8 spare_thresh; - __u8 percent_used; - __u8 rsvd6[26]; - __u8 data_units_read[16]; - __u8 data_units_written[16]; - __u8 host_reads[16]; - __u8 host_writes[16]; - __u8 ctrl_busy_time[16]; - __u8 power_cycles[16]; - __u8 power_on_hours[16]; - __u8 unsafe_shutdowns[16]; - __u8 media_errors[16]; - __u8 num_err_log_entries[16]; - __u8 rsvd192[320]; -}; - -enum { - NVME_SMART_CRIT_SPARE = 1 << 0, - NVME_SMART_CRIT_TEMPERATURE = 1 << 1, - NVME_SMART_CRIT_RELIABILITY = 1 << 2, - NVME_SMART_CRIT_MEDIA = 1 << 3, - NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, -}; - -struct nvme_lba_range_type { - __u8 type; - __u8 attributes; - __u8 rsvd2[14]; - __u64 slba; - __u64 nlb; - __u8 guid[16]; - __u8 rsvd48[16]; -}; - -enum { - NVME_LBART_TYPE_FS = 0x01, - NVME_LBART_TYPE_RAID = 0x02, - NVME_LBART_TYPE_CACHE = 0x03, - NVME_LBART_TYPE_SWAP = 0x04, - - NVME_LBART_ATTRIB_TEMP = 1 << 0, - NVME_LBART_ATTRIB_HIDE = 1 << 1, -}; - -/* I/O commands */ - -enum nvme_opcode { - nvme_cmd_flush = 0x00, - nvme_cmd_write = 0x01, - nvme_cmd_read = 0x02, - nvme_cmd_write_uncor = 0x04, - nvme_cmd_compare = 0x05, - nvme_cmd_dsm = 0x09, -}; - -struct nvme_common_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __le32 cdw2[2]; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le32 cdw10[6]; -}; - -struct nvme_rw_command { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2; - __le64 metadata; - __le64 prp1; - __le64 prp2; - __le64 slba; - __le16 length; - __le16 control; - __le32 dsmgmt; - __le32 reftag; - __le16 apptag; - __le16 appmask; -}; - -enum { - NVME_RW_LR = 1 << 15, - NVME_RW_FUA = 1 << 14, - NVME_RW_DSM_FREQ_UNSPEC = 0, - NVME_RW_DSM_FREQ_TYPICAL = 1, - NVME_RW_DSM_FREQ_RARE = 2, - NVME_RW_DSM_FREQ_READS = 3, - NVME_RW_DSM_FREQ_WRITES = 4, - NVME_RW_DSM_FREQ_RW = 5, - NVME_RW_DSM_FREQ_ONCE = 6, - NVME_RW_DSM_FREQ_PREFETCH = 7, - NVME_RW_DSM_FREQ_TEMP = 8, - NVME_RW_DSM_LATENCY_NONE = 0 << 4, - NVME_RW_DSM_LATENCY_IDLE = 1 << 4, - NVME_RW_DSM_LATENCY_NORM = 2 << 4, - NVME_RW_DSM_LATENCY_LOW = 3 << 4, - NVME_RW_DSM_SEQ_REQ = 1 << 6, - NVME_RW_DSM_COMPRESSED = 1 << 7, -}; - -struct nvme_dsm_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 nr; - __le32 attributes; - __u32 rsvd12[4]; -}; - -enum { - NVME_DSMGMT_IDR = 1 << 0, - NVME_DSMGMT_IDW = 1 << 1, - NVME_DSMGMT_AD = 1 << 2, -}; - -struct nvme_dsm_range { - __le32 cattr; - __le32 nlb; - __le64 slba; -}; - -/* Admin commands */ - -enum nvme_admin_opcode { - nvme_admin_delete_sq = 0x00, - nvme_admin_create_sq = 0x01, - nvme_admin_get_log_page = 0x02, - nvme_admin_delete_cq = 0x04, - nvme_admin_create_cq = 0x05, - nvme_admin_identify = 0x06, - nvme_admin_abort_cmd = 0x08, - nvme_admin_set_features = 0x09, - nvme_admin_get_features = 0x0a, - nvme_admin_async_event = 0x0c, - nvme_admin_activate_fw = 0x10, - nvme_admin_download_fw = 0x11, - nvme_admin_format_nvm = 0x80, - nvme_admin_security_send = 0x81, - nvme_admin_security_recv = 0x82, -}; - -enum { - NVME_QUEUE_PHYS_CONTIG = (1 << 0), - NVME_CQ_IRQ_ENABLED = (1 << 1), - NVME_SQ_PRIO_URGENT = (0 << 1), - NVME_SQ_PRIO_HIGH = (1 << 1), - NVME_SQ_PRIO_MEDIUM = (2 << 1), - NVME_SQ_PRIO_LOW = (3 << 1), - NVME_FEAT_ARBITRATION = 0x01, - NVME_FEAT_POWER_MGMT = 0x02, - NVME_FEAT_LBA_RANGE = 0x03, - NVME_FEAT_TEMP_THRESH = 0x04, - NVME_FEAT_ERR_RECOVERY = 0x05, - NVME_FEAT_VOLATILE_WC = 0x06, - NVME_FEAT_NUM_QUEUES = 0x07, - NVME_FEAT_IRQ_COALESCE = 0x08, - NVME_FEAT_IRQ_CONFIG = 0x09, - NVME_FEAT_WRITE_ATOMIC = 0x0a, - NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, - NVME_FWACT_REPL = (0 << 3), - NVME_FWACT_REPL_ACTV = (1 << 3), - NVME_FWACT_ACTV = (2 << 3), -}; - -struct nvme_identify { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 cns; - __u32 rsvd11[5]; -}; - -struct nvme_features { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[2]; - __le64 prp1; - __le64 prp2; - __le32 fid; - __le32 dword11; - __u32 rsvd12[4]; -}; - -struct nvme_create_cq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 cqid; - __le16 qsize; - __le16 cq_flags; - __le16 irq_vector; - __u32 rsvd12[4]; -}; - -struct nvme_create_sq { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __u64 rsvd8; - __le16 sqid; - __le16 qsize; - __le16 sq_flags; - __le16 cqid; - __u32 rsvd12[4]; -}; - -struct nvme_delete_queue { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[9]; - __le16 qid; - __u16 rsvd10; - __u32 rsvd11[5]; -}; - -struct nvme_download_firmware { - __u8 opcode; - __u8 flags; - __u16 command_id; - __u32 rsvd1[5]; - __le64 prp1; - __le64 prp2; - __le32 numd; - __le32 offset; - __u32 rsvd12[4]; -}; - -struct nvme_format_cmd { - __u8 opcode; - __u8 flags; - __u16 command_id; - __le32 nsid; - __u64 rsvd2[4]; - __le32 cdw10; - __u32 rsvd11[5]; -}; - -struct nvme_command { - union { - struct nvme_common_command common; - struct nvme_rw_command rw; - struct nvme_identify identify; - struct nvme_features features; - struct nvme_create_cq create_cq; - struct nvme_create_sq create_sq; - struct nvme_delete_queue delete_queue; - struct nvme_download_firmware dlfw; - struct nvme_format_cmd format; - struct nvme_dsm_cmd dsm; - }; -}; - -enum { - NVME_SC_SUCCESS = 0x0, - NVME_SC_INVALID_OPCODE = 0x1, - NVME_SC_INVALID_FIELD = 0x2, - NVME_SC_CMDID_CONFLICT = 0x3, - NVME_SC_DATA_XFER_ERROR = 0x4, - NVME_SC_POWER_LOSS = 0x5, - NVME_SC_INTERNAL = 0x6, - NVME_SC_ABORT_REQ = 0x7, - NVME_SC_ABORT_QUEUE = 0x8, - NVME_SC_FUSED_FAIL = 0x9, - NVME_SC_FUSED_MISSING = 0xa, - NVME_SC_INVALID_NS = 0xb, - NVME_SC_CMD_SEQ_ERROR = 0xc, - NVME_SC_LBA_RANGE = 0x80, - NVME_SC_CAP_EXCEEDED = 0x81, - NVME_SC_NS_NOT_READY = 0x82, - NVME_SC_CQ_INVALID = 0x100, - NVME_SC_QID_INVALID = 0x101, - NVME_SC_QUEUE_SIZE = 0x102, - NVME_SC_ABORT_LIMIT = 0x103, - NVME_SC_ABORT_MISSING = 0x104, - NVME_SC_ASYNC_LIMIT = 0x105, - NVME_SC_FIRMWARE_SLOT = 0x106, - NVME_SC_FIRMWARE_IMAGE = 0x107, - NVME_SC_INVALID_VECTOR = 0x108, - NVME_SC_INVALID_LOG_PAGE = 0x109, - NVME_SC_INVALID_FORMAT = 0x10a, - NVME_SC_BAD_ATTRIBUTES = 0x180, - NVME_SC_WRITE_FAULT = 0x280, - NVME_SC_READ_ERROR = 0x281, - NVME_SC_GUARD_CHECK = 0x282, - NVME_SC_APPTAG_CHECK = 0x283, - NVME_SC_REFTAG_CHECK = 0x284, - NVME_SC_COMPARE_FAILED = 0x285, - NVME_SC_ACCESS_DENIED = 0x286, -}; - -struct nvme_completion { - __le32 result; /* Used by admin commands to return data */ - __u32 rsvd; - __le16 sq_head; /* how much of this queue may be reclaimed */ - __le16 sq_id; /* submission queue that generated this entry */ - __u16 command_id; /* of the command which completed */ - __le16 status; /* did the command fail, and if so, why? */ -}; - -struct nvme_user_io { - __u8 opcode; - __u8 flags; - __u16 control; - __u16 nblocks; - __u16 rsvd; - __u64 metadata; - __u64 addr; - __u64 slba; - __u32 dsmgmt; - __u32 reftag; - __u16 apptag; - __u16 appmask; -}; - -struct nvme_admin_cmd { - __u8 opcode; - __u8 flags; - __u16 rsvd1; - __u32 nsid; - __u32 cdw2; - __u32 cdw3; - __u64 metadata; - __u64 addr; - __u32 metadata_len; - __u32 data_len; - __u32 cdw10; - __u32 cdw11; - __u32 cdw12; - __u32 cdw13; - __u32 cdw14; - __u32 cdw15; - __u32 timeout_ms; - __u32 result; -}; - -#define NVME_IOCTL_ID _IO('N', 0x40) -#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) -#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) - -#ifdef __KERNEL__ -#include -#include -#include - #define NVME_IO_TIMEOUT (5 * HZ) /* @@ -553,7 +106,7 @@ struct nvme_ns { struct request_queue *queue; struct gendisk *disk; - int ns_id; + unsigned ns_id; int lba_shift; int ms; u64 mode_select_num_blocks; @@ -572,6 +125,7 @@ struct nvme_iod { int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ + unsigned long start_time; dma_addr_t first_dma; struct scatterlist sg[0]; }; @@ -613,6 +167,4 @@ struct sg_io_hdr; int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr); int nvme_sg_get_version_num(int __user *ip); -#endif - #endif /* _LINUX_NVME_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index e7c94eeb9475..115add2515aa 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -284,6 +284,7 @@ header-y += nfs_mount.h header-y += nfsacl.h header-y += nl80211.h header-y += nubus.h +header-y += nvme.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h new file mode 100644 index 000000000000..989c04e0c563 --- /dev/null +++ b/include/uapi/linux/nvme.h @@ -0,0 +1,477 @@ +/* + * Definitions for the NVM Express interface + * Copyright (c) 2011-2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _UAPI_LINUX_NVME_H +#define _UAPI_LINUX_NVME_H + +#include + +struct nvme_id_power_state { + __le16 max_power; /* centiwatts */ + __u8 rsvd2; + __u8 flags; + __le32 entry_lat; /* microseconds */ + __le32 exit_lat; /* microseconds */ + __u8 read_tput; + __u8 read_lat; + __u8 write_tput; + __u8 write_lat; + __u8 rsvd16[16]; +}; + +enum { + NVME_PS_FLAGS_MAX_POWER_SCALE = 1 << 0, + NVME_PS_FLAGS_NON_OP_STATE = 1 << 1, +}; + +struct nvme_id_ctrl { + __le16 vid; + __le16 ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + __u8 rab; + __u8 ieee[3]; + __u8 mic; + __u8 mdts; + __u8 rsvd78[178]; + __le16 oacs; + __u8 acl; + __u8 aerl; + __u8 frmw; + __u8 lpa; + __u8 elpe; + __u8 npss; + __u8 rsvd264[248]; + __u8 sqes; + __u8 cqes; + __u8 rsvd514[2]; + __le32 nn; + __le16 oncs; + __le16 fuses; + __u8 fna; + __u8 vwc; + __le16 awun; + __le16 awupf; + __u8 rsvd530[1518]; + struct nvme_id_power_state psd[32]; + __u8 vs[1024]; +}; + +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, +}; + +struct nvme_lbaf { + __le16 ms; + __u8 ds; + __u8 rp; +}; + +struct nvme_id_ns { + __le64 nsze; + __le64 ncap; + __le64 nuse; + __u8 nsfeat; + __u8 nlbaf; + __u8 flbas; + __u8 mc; + __u8 dpc; + __u8 dps; + __u8 rsvd30[98]; + struct nvme_lbaf lbaf[16]; + __u8 rsvd192[192]; + __u8 vs[3712]; +}; + +enum { + NVME_NS_FEAT_THIN = 1 << 0, + NVME_LBAF_RP_BEST = 0, + NVME_LBAF_RP_BETTER = 1, + NVME_LBAF_RP_GOOD = 2, + NVME_LBAF_RP_DEGRADED = 3, +}; + +struct nvme_smart_log { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 rsvd6[26]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_reads[16]; + __u8 host_writes[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __u8 rsvd192[320]; +}; + +enum { + NVME_SMART_CRIT_SPARE = 1 << 0, + NVME_SMART_CRIT_TEMPERATURE = 1 << 1, + NVME_SMART_CRIT_RELIABILITY = 1 << 2, + NVME_SMART_CRIT_MEDIA = 1 << 3, + NVME_SMART_CRIT_VOLATILE_MEMORY = 1 << 4, +}; + +struct nvme_lba_range_type { + __u8 type; + __u8 attributes; + __u8 rsvd2[14]; + __u64 slba; + __u64 nlb; + __u8 guid[16]; + __u8 rsvd48[16]; +}; + +enum { + NVME_LBART_TYPE_FS = 0x01, + NVME_LBART_TYPE_RAID = 0x02, + NVME_LBART_TYPE_CACHE = 0x03, + NVME_LBART_TYPE_SWAP = 0x04, + + NVME_LBART_ATTRIB_TEMP = 1 << 0, + NVME_LBART_ATTRIB_HIDE = 1 << 1, +}; + +/* I/O commands */ + +enum nvme_opcode { + nvme_cmd_flush = 0x00, + nvme_cmd_write = 0x01, + nvme_cmd_read = 0x02, + nvme_cmd_write_uncor = 0x04, + nvme_cmd_compare = 0x05, + nvme_cmd_dsm = 0x09, +}; + +struct nvme_common_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __le32 cdw2[2]; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le32 cdw10[6]; +}; + +struct nvme_rw_command { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2; + __le64 metadata; + __le64 prp1; + __le64 prp2; + __le64 slba; + __le16 length; + __le16 control; + __le32 dsmgmt; + __le32 reftag; + __le16 apptag; + __le16 appmask; +}; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, +}; + +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + +/* Admin commands */ + +enum nvme_admin_opcode { + nvme_admin_delete_sq = 0x00, + nvme_admin_create_sq = 0x01, + nvme_admin_get_log_page = 0x02, + nvme_admin_delete_cq = 0x04, + nvme_admin_create_cq = 0x05, + nvme_admin_identify = 0x06, + nvme_admin_abort_cmd = 0x08, + nvme_admin_set_features = 0x09, + nvme_admin_get_features = 0x0a, + nvme_admin_async_event = 0x0c, + nvme_admin_activate_fw = 0x10, + nvme_admin_download_fw = 0x11, + nvme_admin_format_nvm = 0x80, + nvme_admin_security_send = 0x81, + nvme_admin_security_recv = 0x82, +}; + +enum { + NVME_QUEUE_PHYS_CONTIG = (1 << 0), + NVME_CQ_IRQ_ENABLED = (1 << 1), + NVME_SQ_PRIO_URGENT = (0 << 1), + NVME_SQ_PRIO_HIGH = (1 << 1), + NVME_SQ_PRIO_MEDIUM = (2 << 1), + NVME_SQ_PRIO_LOW = (3 << 1), + NVME_FEAT_ARBITRATION = 0x01, + NVME_FEAT_POWER_MGMT = 0x02, + NVME_FEAT_LBA_RANGE = 0x03, + NVME_FEAT_TEMP_THRESH = 0x04, + NVME_FEAT_ERR_RECOVERY = 0x05, + NVME_FEAT_VOLATILE_WC = 0x06, + NVME_FEAT_NUM_QUEUES = 0x07, + NVME_FEAT_IRQ_COALESCE = 0x08, + NVME_FEAT_IRQ_CONFIG = 0x09, + NVME_FEAT_WRITE_ATOMIC = 0x0a, + NVME_FEAT_ASYNC_EVENT = 0x0b, + NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FWACT_REPL = (0 << 3), + NVME_FWACT_REPL_ACTV = (1 << 3), + NVME_FWACT_ACTV = (2 << 3), +}; + +struct nvme_identify { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 cns; + __u32 rsvd11[5]; +}; + +struct nvme_features { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 fid; + __le32 dword11; + __u32 rsvd12[4]; +}; + +struct nvme_create_cq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 cqid; + __le16 qsize; + __le16 cq_flags; + __le16 irq_vector; + __u32 rsvd12[4]; +}; + +struct nvme_create_sq { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __u64 rsvd8; + __le16 sqid; + __le16 qsize; + __le16 sq_flags; + __le16 cqid; + __u32 rsvd12[4]; +}; + +struct nvme_delete_queue { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[9]; + __le16 qid; + __u16 rsvd10; + __u32 rsvd11[5]; +}; + +struct nvme_download_firmware { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __le32 numd; + __le32 offset; + __u32 rsvd12[4]; +}; + +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + +struct nvme_command { + union { + struct nvme_common_command common; + struct nvme_rw_command rw; + struct nvme_identify identify; + struct nvme_features features; + struct nvme_create_cq create_cq; + struct nvme_create_sq create_sq; + struct nvme_delete_queue delete_queue; + struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; + struct nvme_dsm_cmd dsm; + }; +}; + +enum { + NVME_SC_SUCCESS = 0x0, + NVME_SC_INVALID_OPCODE = 0x1, + NVME_SC_INVALID_FIELD = 0x2, + NVME_SC_CMDID_CONFLICT = 0x3, + NVME_SC_DATA_XFER_ERROR = 0x4, + NVME_SC_POWER_LOSS = 0x5, + NVME_SC_INTERNAL = 0x6, + NVME_SC_ABORT_REQ = 0x7, + NVME_SC_ABORT_QUEUE = 0x8, + NVME_SC_FUSED_FAIL = 0x9, + NVME_SC_FUSED_MISSING = 0xa, + NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_LBA_RANGE = 0x80, + NVME_SC_CAP_EXCEEDED = 0x81, + NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_CQ_INVALID = 0x100, + NVME_SC_QID_INVALID = 0x101, + NVME_SC_QUEUE_SIZE = 0x102, + NVME_SC_ABORT_LIMIT = 0x103, + NVME_SC_ABORT_MISSING = 0x104, + NVME_SC_ASYNC_LIMIT = 0x105, + NVME_SC_FIRMWARE_SLOT = 0x106, + NVME_SC_FIRMWARE_IMAGE = 0x107, + NVME_SC_INVALID_VECTOR = 0x108, + NVME_SC_INVALID_LOG_PAGE = 0x109, + NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_WRITE_FAULT = 0x280, + NVME_SC_READ_ERROR = 0x281, + NVME_SC_GUARD_CHECK = 0x282, + NVME_SC_APPTAG_CHECK = 0x283, + NVME_SC_REFTAG_CHECK = 0x284, + NVME_SC_COMPARE_FAILED = 0x285, + NVME_SC_ACCESS_DENIED = 0x286, +}; + +struct nvme_completion { + __le32 result; /* Used by admin commands to return data */ + __u32 rsvd; + __le16 sq_head; /* how much of this queue may be reclaimed */ + __le16 sq_id; /* submission queue that generated this entry */ + __u16 command_id; /* of the command which completed */ + __le16 status; /* did the command fail, and if so, why? */ +}; + +struct nvme_user_io { + __u8 opcode; + __u8 flags; + __u16 control; + __u16 nblocks; + __u16 rsvd; + __u64 metadata; + __u64 addr; + __u64 slba; + __u32 dsmgmt; + __u32 reftag; + __u16 apptag; + __u16 appmask; +}; + +struct nvme_admin_cmd { + __u8 opcode; + __u8 flags; + __u16 rsvd1; + __u32 nsid; + __u32 cdw2; + __u32 cdw3; + __u64 metadata; + __u64 addr; + __u32 metadata_len; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u32 cdw12; + __u32 cdw13; + __u32 cdw14; + __u32 cdw15; + __u32 timeout_ms; + __u32 result; +}; + +#define NVME_IOCTL_ID _IO('N', 0x40) +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) +#define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) + +#endif /* _UAPI_LINUX_NVME_H */