asedeno.scripts.mit.edu Git - linux.git/commitdiff
RDMA/core: Implement compat device/sysfs tree in net namespace
authorParav Pandit <parav@mellanox.com>
Tue, 26 Feb 2019 11:56:13 +0000 (13:56 +0200)
committerJason Gunthorpe <jgg@mellanox.com>
Thu, 28 Mar 2019 17:52:02 +0000 (14:52 -0300)
Implement compatibility layer sysfs entries of ib_core so that non
init_net net namespaces can also discover rdma devices.

Each non init_net net namespace has ib_core_device created in it.
Such ib_core_device sysfs tree resembles rdma devices found in
init_net namespace.

This allows discovering rdma devices in multiple non init_net net
namespaces via sysfs entries and is helpful to the rdma-core userspace.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/core/device.c
include/rdma/ib_verbs.h

index 078566d0d7c2a3aebb7d92a8f9ab23cebccc4d08..167e2d46e4cb37ec973a444648139e1d6003797c 100644 (file)
@@ -38,6 +38,8 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/hashtable.h>
@@ -101,6 +103,30 @@ static DECLARE_RWSEM(clients_rwsem);
  * be registered.
  */
 #define CLIENT_DATA_REGISTERED XA_MARK_1
+
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @net:       Pointer to owner net namespace
+ * @id:                xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+       possible_net_t net;
+       u32 id;
+};
+
+static unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require an init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
 /*
  * xarray has this behavior where it won't iterate over NULL values stored in
  * allocated arrays.  So we need our own iterator to see all values stored in
@@ -268,6 +294,26 @@ struct ib_device *ib_device_get_by_name(const char *name,
 }
 EXPORT_SYMBOL(ib_device_get_by_name);
 
+static int rename_compat_devs(struct ib_device *device)
+{
+       struct ib_core_device *cdev;
+       unsigned long index;
+       int ret = 0;
+
+       mutex_lock(&device->compat_devs_mutex);
+       xa_for_each (&device->compat_devs, index, cdev) {
+               ret = device_rename(&cdev->dev, dev_name(&device->dev));
+               if (ret) {
+                       dev_warn(&cdev->dev,
+                                "Fail to rename compatdev to new name %s\n",
+                                dev_name(&device->dev));
+                       break;
+               }
+       }
+       mutex_unlock(&device->compat_devs_mutex);
+       return ret;
+}
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
        int ret;
@@ -287,6 +333,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
        if (ret)
                goto out;
        strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+       ret = rename_compat_devs(ibdev);
 out:
        up_write(&devices_rwsem);
        return ret;
@@ -336,6 +383,7 @@ static void ib_device_release(struct device *device)
        WARN_ON(refcount_read(&dev->refcount));
        ib_cache_release_one(dev);
        ib_security_release_port_pkey_list(dev);
+       xa_destroy(&dev->compat_devs);
        xa_destroy(&dev->client_data);
        if (dev->port_data)
                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
@@ -359,7 +407,10 @@ static int ib_device_uevent(struct device *device,
 
 static const void *net_namespace(struct device *d)
 {
-       return &init_net;
+       struct ib_core_device *coredev =
+                       container_of(d, struct ib_core_device, dev);
+
+       return read_pnet(&coredev->rdma_net);
 }
 
 static struct class ib_class = {
@@ -371,7 +422,7 @@ static struct class ib_class = {
 };
 
 static void rdma_init_coredev(struct ib_core_device *coredev,
-                             struct ib_device *dev)
+                             struct ib_device *dev, struct net *net)
 {
        /* This BUILD_BUG_ON is intended to catch layout change
         * of union of ib_core_device and device.
@@ -387,6 +438,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev,
        device_initialize(&coredev->dev);
        coredev->owner = dev;
        INIT_LIST_HEAD(&coredev->port_list);
+       write_pnet(&coredev->rdma_net, net);
 }
 
 /**
@@ -416,7 +468,7 @@ struct ib_device *_ib_alloc_device(size_t size)
        }
 
        device->groups[0] = &ib_dev_attr_group;
-       rdma_init_coredev(&device->coredev, device);
+       rdma_init_coredev(&device->coredev, device, &init_net);
 
        INIT_LIST_HEAD(&device->event_handler_list);
        spin_lock_init(&device->event_handler_lock);
@@ -427,6 +479,8 @@ struct ib_device *_ib_alloc_device(size_t size)
         */
        xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
        init_rwsem(&device->client_data_rwsem);
+       xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
+       mutex_init(&device->compat_devs_mutex);
        init_completion(&device->unreg_completion);
        INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
@@ -459,6 +513,7 @@ void ib_dealloc_device(struct ib_device *device)
        /* Expedite releasing netdev references */
        free_netdevs(device);
 
+       WARN_ON(!xa_empty(&device->compat_devs));
        WARN_ON(!xa_empty(&device->client_data));
        WARN_ON(refcount_read(&device->refcount));
        rdma_restrack_clean(device);
@@ -667,6 +722,180 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
        return NOTIFY_OK;
 }
 
+static void compatdev_release(struct device *dev)
+{
+       struct ib_core_device *cdev =
+               container_of(dev, struct ib_core_device, dev);
+
+       kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+                             struct rdma_dev_net *rnet)
+{
+       struct ib_core_device *cdev;
+       int ret;
+
+       /*
+        * Create and add compat device in all namespaces other than where it
+        * is currently bound to.
+        */
+       if (net_eq(read_pnet(&rnet->net),
+                  read_pnet(&device->coredev.rdma_net)))
+               return 0;
+
+       /*
+        * The first of init_net() or ib_register_device() to take the
+        * compat_devs_mutex wins and gets to add the device. Others will wait
+        * for completion here.
+        */
+       mutex_lock(&device->compat_devs_mutex);
+       cdev = xa_load(&device->compat_devs, rnet->id);
+       if (cdev) {
+               ret = 0;
+               goto done;
+       }
+       ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
+       if (ret)
+               goto done;
+
+       cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+       if (!cdev) {
+               ret = -ENOMEM;
+               goto cdev_err;
+       }
+
+       cdev->dev.parent = device->dev.parent;
+       rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
+       cdev->dev.release = compatdev_release;
+       dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
+
+       ret = device_add(&cdev->dev);
+       if (ret)
+               goto add_err;
+
+       ret = xa_err(xa_store(&device->compat_devs, rnet->id,
+                             cdev, GFP_KERNEL));
+       if (ret)
+               goto insert_err;
+
+       mutex_unlock(&device->compat_devs_mutex);
+       return 0;
+
+insert_err:
+       device_del(&cdev->dev);
+add_err:
+       put_device(&cdev->dev);
+cdev_err:
+       xa_release(&device->compat_devs, rnet->id);
+done:
+       mutex_unlock(&device->compat_devs_mutex);
+       return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+       struct ib_core_device *cdev;
+
+       mutex_lock(&device->compat_devs_mutex);
+       cdev = xa_erase(&device->compat_devs, id);
+       mutex_unlock(&device->compat_devs_mutex);
+       if (cdev) {
+               device_del(&cdev->dev);
+               put_device(&cdev->dev);
+       }
+}
+
+static void remove_compat_devs(struct ib_device *device)
+{
+       struct ib_core_device *cdev;
+       unsigned long index;
+
+       xa_for_each (&device->compat_devs, index, cdev)
+               remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+       struct rdma_dev_net *rnet;
+       unsigned long index;
+       int ret = 0;
+
+       down_read(&rdma_nets_rwsem);
+       xa_for_each (&rdma_nets, index, rnet) {
+               ret = add_one_compat_dev(device, rnet);
+               if (ret)
+                       break;
+       }
+       up_read(&rdma_nets_rwsem);
+       return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+       struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+       struct ib_device *dev;
+       unsigned long index;
+       int ret;
+
+       down_write(&rdma_nets_rwsem);
+       /*
+        * Prevent the ID from being re-used and hide the id from xa_for_each.
+        */
+       ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
+       WARN_ON(ret);
+       up_write(&rdma_nets_rwsem);
+
+       down_read(&devices_rwsem);
+       xa_for_each (&devices, index, dev) {
+               get_device(&dev->dev);
+               /*
+                * Release the devices_rwsem so that the potentially blocking
+                * device_del() doesn't hold the devices_rwsem for too long.
+                */
+               up_read(&devices_rwsem);
+
+               remove_one_compat_dev(dev, rnet->id);
+
+               put_device(&dev->dev);
+               down_read(&devices_rwsem);
+       }
+       up_read(&devices_rwsem);
+
+       xa_erase(&rdma_nets, rnet->id);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+       struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+       unsigned long index;
+       struct ib_device *dev;
+       int ret;
+
+       /* No need to create any compat devices in default init_net. */
+       if (net_eq(net, &init_net))
+               return 0;
+
+       write_pnet(&rnet->net, net);
+
+       ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
+       if (ret)
+               return ret;
+
+       down_read(&devices_rwsem);
+       xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+               ret = add_one_compat_dev(dev, rnet);
+               if (ret)
+                       break;
+       }
+       up_read(&devices_rwsem);
+
+       if (ret)
+               rdma_dev_exit_net(net);
+
+       return ret;
+}
+
 /*
  * Assign the unique string device name and the unique device index. This is
  * undone by ib_dealloc_device.
@@ -788,6 +1017,13 @@ static void disable_device(struct ib_device *device)
        ib_device_put(device);
        wait_for_completion(&device->unreg_completion);
 
+       /*
+        * compat devices must be removed after device refcount drops to zero.
+        * Otherwise init_net() may add more compatdevs after removing compat
+        * devices and before device is disabled.
+        */
+       remove_compat_devs(device);
+
        /* Expedite removing unregistered pointers from the hash table */
        free_netdevs(device);
 }
@@ -830,7 +1066,8 @@ static int enable_device_and_get(struct ib_device *device)
                        break;
        }
        up_read(&clients_rwsem);
-
+       if (!ret)
+               ret = add_compat_devs(device);
 out:
        up_read(&devices_rwsem);
        return ret;
@@ -1061,6 +1298,13 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
 }
 EXPORT_SYMBOL(ib_unregister_device_queued);
 
+static struct pernet_operations rdma_dev_net_ops = {
+       .init = rdma_dev_init_net,
+       .exit = rdma_dev_exit_net,
+       .id = &rdma_dev_net_id,
+       .size = sizeof(struct rdma_dev_net),
+};
+
 static int assign_client_id(struct ib_client *client)
 {
        int ret;
@@ -1926,12 +2170,20 @@ static int __init ib_core_init(void)
                goto err_sa;
        }
 
+       ret = register_pernet_device(&rdma_dev_net_ops);
+       if (ret) {
+               pr_warn("Couldn't init compat dev. ret %d\n", ret);
+               goto err_compat;
+       }
+
        nldev_init();
        rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
        roce_gid_mgmt_init();
 
        return 0;
 
+err_compat:
+       unregister_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
        ib_sa_cleanup();
 err_mad:
@@ -1956,6 +2208,7 @@ static void __exit ib_core_cleanup(void)
        roce_gid_mgmt_cleanup();
        nldev_exit();
        rdma_nl_unregister(RDMA_NL_LS);
+       unregister_pernet_device(&rdma_dev_net_ops);
        unregister_lsm_notifier(&ibdev_lsm_nb);
        ib_sa_cleanup();
        ib_mad_cleanup();
index 5f9f4fcdc4ccc7be7c1aa219591be8628f29714c..d42267e72c4b05e19a36323b1f4f090ad2b3d998 100644 (file)
@@ -2559,6 +2559,7 @@ struct ib_core_device {
         * union of ib_core_device and device exists in ib_device.
         */
        struct device dev;
+       possible_net_t rdma_net;
        struct kobject *ports_kobj;
        struct list_head port_list;
        struct ib_device *owner; /* reach back to owner ib_device */
@@ -2636,6 +2637,11 @@ struct ib_device {
        struct work_struct unregistration_work;
 
        const struct rdma_link_ops *link_ops;
+
+       /* Protects compat_devs xarray modifications */
+       struct mutex compat_devs_mutex;
+       /* Maintains compat devices for each net namespace */
+       struct xarray compat_devs;
 };
 
 struct ib_client {