// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017-2018 Christoph Hellwig.
 */

#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"

static bool multipath = true;
module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
 * have multiple controllers and thus use the multipath-aware subsystem node
 * and those that have a single controller and use the controller node
 * directly.
 */
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags)
{
	if (!multipath) {
		sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
	} else if (ns->head->disk) {
		sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
				ctrl->instance, ns->head->instance);
		*flags = GENHD_FL_HIDDEN;
	} else {
		sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
				ns->head->instance);
	}
}

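/*
 * Fail over a request to another path: steal its bios onto the head's
 * requeue list, complete the original request, and decide from the NVMe
 * status code whether to simply retry on a different path or to reset the
 * controller.  The requeue work is kicked at the end so the bios get
 * resubmitted through nvme_ns_head_make_request().
 */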
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
	blk_steal_bios(&ns->head->requeue_list, req);
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
	blk_mq_end_request(req, 0);

	switch (status & 0x7ff) {
	case NVME_SC_ANA_TRANSITION:
	case NVME_SC_ANA_INACCESSIBLE:
	case NVME_SC_ANA_PERSISTENT_LOSS:
		/*
		 * If we got back an ANA error we know the controller is alive,
		 * but not ready to serve this namespace.  The spec suggests
		 * we should update our general state here, but due to the fact
		 * that the admin and I/O queues are not serialized that is
		 * fundamentally racy.  So instead just clear the current path,
		 * mark the path as pending and kick off a re-read of the ANA
		 * log page ASAP.
		 */
		nvme_mpath_clear_current_path(ns);
		if (ns->ctrl->ana_log_buf) {
			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
			queue_work(nvme_wq, &ns->ctrl->ana_work);
		}
		break;
	case NVME_SC_HOST_PATH_ERROR:
		/*
		 * Temporary transport disruption in talking to the controller.
		 * Try to send on a new path.
		 */
		nvme_mpath_clear_current_path(ns);
		break;
	default:
		/*
		 * Reset the controller for any non-ANA error as we don't know
		 * what caused the error.
		 */
		nvme_reset_ctrl(ns->ctrl);
		break;
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

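/*
 * Kick the requeue workers of all namespace heads on this controller so that
 * any parked bios are resubmitted and can pick a (possibly new) live path.
 */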
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;

	down_read(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->disk)
			kblockd_schedule_work(&ns->head->requeue_work);
	}
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

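/*
 * Forget this namespace as the cached current path on every NUMA node,
 * forcing the next submission to re-run path selection.
 */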
void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;
	int node;

	if (!head)
		return;
	for_each_node(node) {
		if (ns == rcu_access_pointer(head->current_path[node]))
			rcu_assign_pointer(head->current_path[node], NULL);
	}
}

static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
	return ns->ctrl->state != NVME_CTRL_LIVE ||
		test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
		test_bit(NVME_NS_REMOVING, &ns->flags);
}

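/*
 * Scan all sibling namespaces and pick the best usable path for the given
 * NUMA node: an ANA-optimized path if one exists, otherwise a non-optimized
 * one, preferring the smallest NUMA distance when the "numa" I/O policy is
 * selected.  The result is cached in head->current_path[node].
 */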
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
	struct nvme_ns *found = NULL, *fallback = NULL, *ns;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
			distance = node_distance(node, ns->ctrl->numa_node);
		else
			distance = LOCAL_DISTANCE;

		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			if (distance < found_distance) {
				found_distance = distance;
				found = ns;
			}
			break;
		case NVME_ANA_NONOPTIMIZED:
			if (distance < fallback_distance) {
				fallback_distance = distance;
				fallback = ns;
			}
			break;
		default:
			break;
		}
	}

	if (!found)
		found = fallback;
	if (found)
		rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
		struct nvme_ns *ns)
{
	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
			siblings);
	if (ns)
		return ns;
	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
}

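/*
 * Round-robin path selection: starting from the previously used path, walk
 * the sibling list (wrapping around via nvme_next_ns()) and return the next
 * usable ANA-optimized path, falling back to a non-optimized one if that is
 * all we can find.
 */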
static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
		int node, struct nvme_ns *old)
{
	struct nvme_ns *ns, *found, *fallback = NULL;

	if (list_is_singular(&head->list)) {
		if (nvme_path_is_disabled(old))
			return NULL;
		return old;
	}

	for (ns = nvme_next_ns(head, old);
	     ns != old;
	     ns = nvme_next_ns(head, ns)) {
		if (nvme_path_is_disabled(ns))
			continue;

		if (ns->ana_state == NVME_ANA_OPTIMIZED) {
			found = ns;
			goto out;
		}
		if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
			fallback = ns;
	}

	if (!fallback)
		return NULL;
	found = fallback;
out:
	rcu_assign_pointer(head->current_path[node], found);
	return found;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

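/*
 * Return the path to submit the next bio on.  The cached per-node path is
 * reused as long as it is still optimized; otherwise a full re-scan is done.
 * Callers must hold head->srcu.
 */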
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
		ns = nvme_round_robin_path(head, node, ns);
	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
		ns = __nvme_find_path(head, node);
	return ns;
}

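/*
 * make_request handler for the subsystem-level multipath node: pick a path
 * and redirect the bio to the underlying per-controller namespace, or park
 * it on the requeue list if no path is currently available.
 */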
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
		struct bio *bio)
{
	struct nvme_ns_head *head = q->queuedata;
	struct device *dev = disk_to_dev(head->disk);
	struct nvme_ns *ns;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;

	/*
	 * The namespace might be going away and the bio might
	 * be moved to a different queue via blk_steal_bios(),
	 * so we need to use the bio_split pool from the original
	 * queue to allocate the bvecs from.
	 */
	blk_queue_split(q, &bio);

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = nvme_find_path(head);
	if (likely(ns)) {
		bio->bi_disk = ns->disk;
		bio->bi_opf |= REQ_NVME_MPATH;
		trace_block_bio_remap(bio->bi_disk->queue, bio,
				      disk_devt(ns->head->disk),
				      bio->bi_iter.bi_sector);
		ret = direct_make_request(bio);
	} else if (!list_empty_careful(&head->list)) {
		dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}

	srcu_read_unlock(&head->srcu, srcu_idx);
	return ret;
}

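/*
 * Work item that resubmits any bios that were parked on the requeue list
 * while no usable path was available.
 */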
static void nvme_requeue_work(struct work_struct *work)
{
	struct nvme_ns_head *head =
		container_of(work, struct nvme_ns_head, requeue_work);
	struct bio *bio, *next;

	spin_lock_irq(&head->requeue_lock);
	next = bio_list_get(&head->requeue_list);
	spin_unlock_irq(&head->requeue_lock);

	while ((bio = next) != NULL) {
		next = bio->bi_next;
		bio->bi_next = NULL;

		/*
		 * Reset disk to the mpath node and resubmit to select a new
		 * path.
		 */
		bio->bi_disk = head->disk;
		generic_make_request(bio);
	}
}

int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);

	/*
	 * Add a multipath node if the subsystem supports multiple controllers.
	 * We also do this for private namespaces as the namespace sharing data
	 * could change after a rescan.
	 */
	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
		return 0;

	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
	if (!q)
		goto out;
	q->queuedata = head;
	blk_queue_make_request(q, nvme_ns_head_make_request);
	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* set to a default value of 512 until the disk is validated */
	blk_queue_logical_block_size(q, 512);
	blk_set_stacking_limits(&q->limits);

	/* we need to propagate up the VWC settings */
	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
		vwc = true;
	blk_queue_write_cache(q, vwc, vwc);

	head->disk = alloc_disk(0);
	if (!head->disk)
		goto out_cleanup_queue;
	head->disk->fops = &nvme_ns_head_ops;
	head->disk->private_data = head;
	head->disk->queue = q;
	head->disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(head->disk->disk_name, "nvme%dn%d",
			ctrl->subsys->instance, head->instance);
	return 0;

out_cleanup_queue:
	blk_cleanup_queue(q);
out:
	return -ENOMEM;
}

static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	lockdep_assert_held(&ns->head->lock);

	if (!head->disk)
		return;

	if (!(head->disk->flags & GENHD_FL_UP))
		device_add_disk(&head->subsys->dev, head->disk,
				nvme_ns_id_attr_groups);

	if (nvme_path_is_optimized(ns)) {
		int node, srcu_idx;

		srcu_idx = srcu_read_lock(&head->srcu);
		for_each_node(node)
			__nvme_find_path(head, node);
		srcu_read_unlock(&head->srcu, srcu_idx);
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

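/*
 * Walk the ANA log page in ctrl->ana_log_buf, sanity checking each group
 * descriptor and invoking @cb on it, and stop early if the callback returns
 * a non-zero value.
 */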
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
				void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids = le32_to_cpu(desc->nnsids);
		size_t nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	mutex_lock(&ns->head->lock);
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state))
		nvme_mpath_set_live(ns);
	mutex_unlock(&ns->head->lock);
}

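/*
 * nvme_parse_ana_log() callback used by nvme_read_ana_log(): apply one group
 * descriptor to all matching namespaces of the controller and count groups
 * that are in the "change" state so the caller can arm the ANATT timer.
 */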
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
			continue;
		nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_write(&ctrl->namespaces_rwsem);
	WARN_ON_ONCE(n < nr_nsids);
	return 0;
}

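/*
 * Fetch the ANA log page from the controller and update the namespace states
 * from it.  With @groups_only set the namespace identifier lists are omitted
 * (NVME_ANA_LOG_RGO), as on the initial read from nvme_mpath_init().
 */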
static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
			groups_only ? NVME_ANA_LOG_RGO : 0,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	nvme_read_ana_log(ctrl, false);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)	\
	struct device_attribute subsys_attr_##_name =	\
		__ATTR(_name, _mode, _show, _store)

static const char *nvme_iopolicy_names[] = {
	[NVME_IOPOLICY_NUMA]	= "numa",
	[NVME_IOPOLICY_RR]	= "round-robin",
};

static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);

	return sprintf(buf, "%s\n",
			nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}

static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct nvme_subsystem *subsys =
		container_of(dev, struct nvme_subsystem, dev);
	int i;

	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
			WRITE_ONCE(subsys->iopolicy, i);
			return count;
		}
	}

	return -EINVAL;
}
SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
		nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ns *ns = data;

	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
		nvme_update_ns_ana_state(desc, ns);
		return -ENXIO; /* just break out of the loop */
	}

	return 0;
}

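/*
 * Hook a newly scanned namespace into the multipath machinery: look up its
 * ANA state from the cached log page if ANA is in use, otherwise mark it
 * optimized, and in either case make the head's gendisk live once a usable
 * path exists.
 */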
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
		mutex_unlock(&ns->ctrl->ana_lock);
	} else {
		mutex_lock(&ns->head->lock);
		ns->ana_state = NVME_ANA_OPTIMIZED;
		nvme_mpath_set_live(ns);
		mutex_unlock(&ns->head->lock);
	}
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP)
		del_gendisk(head->disk);
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
	flush_work(&head->requeue_work);
	blk_cleanup_queue(head->disk->queue);
	put_disk(head->disk);
}

int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	int error;

	/* check if multipath is enabled and we have the capability */
	if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
	ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%d).\n",
			ctrl->ana_log_size,
			ctrl->max_hw_sectors << SECTOR_SHIFT);
		dev_err(ctrl->device, "disabling ANA support.\n");
		return 0;
	}

	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
	if (!ctrl->ana_log_buf) {
		error = -ENOMEM;
		goto out;
	}

	error = nvme_read_ana_log(ctrl, true);
	if (error)
		goto out_free_ana_log_buf;
	return 0;
out_free_ana_log_buf:
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
out:
	return error;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
	ctrl->ana_log_buf = NULL;
}