diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index 0b347f46b2f41561d51668c97f479733f6388163..b34b24e237f81bfdf5d01c7c990895517647f25c 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
 
 #include "rds_single_path.h"
 #include "ib_mr.h"
+#include "rds.h"
 
 struct workqueue_struct *rds_ib_mr_wq;
+struct rds_ib_dereg_odp_mr {
+       struct work_struct work;
+       struct ib_mr *mr;
+};
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
+static void rds_ib_odp_mr_worker(struct work_struct *work);
 
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
@@ -195,12 +199,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
        struct rds_ib_mr *ibmr = NULL;
        struct llist_node *ret;
-       unsigned long *flag;
+       unsigned long flags;
 
-       preempt_disable();
-       flag = this_cpu_ptr(&clean_list_grace);
-       set_bit(CLEAN_LIST_BUSY_BIT, flag);
+       spin_lock_irqsave(&pool->clean_lock, flags);
        ret = llist_del_first(&pool->clean_list);
+       spin_unlock_irqrestore(&pool->clean_lock, flags);
        if (ret) {
                ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
                if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,28 +212,17 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
                        rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
        }
 
-       clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-       preempt_enable();
        return ibmr;
 }
 
-static inline void wait_clean_list_grace(void)
-{
-       int cpu;
-       unsigned long *flag;
-
-       for_each_online_cpu(cpu) {
-               flag = &per_cpu(clean_list_grace, cpu);
-               while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-                       cpu_relax();
-       }
-}
-
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
        struct rds_ib_mr *ibmr = trans_private;
        struct rds_ib_device *rds_ibdev = ibmr->device;
 
+       if (ibmr->odp)
+               return;
+
        switch (direction) {
        case DMA_FROM_DEVICE:
                ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@@ -324,8 +316,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
  * of clusters.  Each cluster has linked llist nodes of
  * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-                               struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
                                struct llist_node **nodes_head,
                                struct llist_node **nodes_tail)
 {
@@ -402,8 +393,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
         */
        dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
        dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-       if (free_all)
+       if (free_all) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&pool->clean_lock, flags);
                llist_append_to_list(&pool->clean_list, &unmap_list);
+               spin_unlock_irqrestore(&pool->clean_lock, flags);
+       }
 
        free_goal = rds_ib_flush_goal(pool, free_all);
 
@@ -416,27 +412,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
                rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
        if (!list_empty(&unmap_list)) {
-               /* we have to make sure that none of the things we're about
-                * to put on the clean list would race with other cpus trying
-                * to pull items off.  The llist would explode if we managed to
-                * remove something from the clean list and then add it back again
-                * while another CPU was spinning on that same item in llist_del_first.
-                *
-                * This is pretty unlikely, but just in case  wait for an llist grace period
-                * here before adding anything back into the clean list.
-                */
-               wait_clean_list_grace();
+               unsigned long flags;
 
-               list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+               list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
                if (ibmr_ret) {
                        *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
                        clean_nodes = clean_nodes->next;
                }
                /* more than one entry in llist nodes */
-               if (clean_nodes)
+               if (clean_nodes) {
+                       spin_lock_irqsave(&pool->clean_lock, flags);
                        llist_add_batch(clean_nodes, clean_tail,
                                        &pool->clean_list);
-
+                       spin_unlock_irqrestore(&pool->clean_lock, flags);
+               }
        }
 
        atomic_sub(unpinned, &pool->free_pinned);
@@ -471,7 +460,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
                                rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
                        else
                                rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-                       return ERR_PTR(-EAGAIN);
+                       break;
                }
 
                /* We do have some empty MRs. Flush them out. */
@@ -485,7 +474,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
                        return ibmr;
        }
 
-       return ibmr;
+       return NULL;
 }
 
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -503,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
        rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
+       if (ibmr->odp) {
+               /* An MR created and marked as use_once. We use delayed work
+                * because there is a chance that we are in interrupt context
+                * and can't call ib_dereg_mr() directly.
+                */
+               INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+               queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+               return;
+       }
+
        /* Return it to the pool's free list */
        if (rds_ibdev->use_fastreg)
                rds_ib_free_frmr_list(ibmr);
@@ -547,9 +546,17 @@ void rds_ib_flush_mrs(void)
        up_read(&rds_ib_devices_lock);
 }
 
+u32 rds_ib_get_lkey(void *trans_private)
+{
+       struct rds_ib_mr *ibmr = trans_private;
+
+       return ibmr->u.mr->lkey;
+}
+
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                    struct rds_sock *rs, u32 *key_ret,
-                   struct rds_connection *conn)
+                   struct rds_connection *conn,
+                   u64 start, u64 length, int need_odp)
 {
        struct rds_ib_device *rds_ibdev;
        struct rds_ib_mr *ibmr = NULL;
@@ -562,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
                goto out;
        }
 
+       if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+               u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+               int access_flags =
+                       (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+                        IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+                        IB_ACCESS_ON_DEMAND);
+               struct ib_sge sge = {};
+               struct ib_mr *ib_mr;
+
+               if (!rds_ibdev->odp_capable) {
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
+
+               ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+                                      access_flags);
+
+               if (IS_ERR(ib_mr)) {
+                       rdsdebug("ib_reg_user_mr returned %ld\n",
+                                PTR_ERR(ib_mr));
+                       ret = PTR_ERR(ib_mr);
+                       goto out;
+               }
+               if (key_ret)
+                       *key_ret = ib_mr->rkey;
+
+               ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+               if (!ibmr) {
+                       ib_dereg_mr(ib_mr);
+                       ret = -ENOMEM;
+                       goto out;
+               }
+               ibmr->u.mr = ib_mr;
+               ibmr->odp = 1;
+
+               sge.addr = virt_addr;
+               sge.length = length;
+               sge.lkey = ib_mr->lkey;
+
+               ib_advise_mr(rds_ibdev->pd,
+                            IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+                            IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+               return ibmr;
+       }
+
        if (conn)
                ic = conn->c_transport_data;
 
@@ -610,6 +662,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
        init_llist_head(&pool->free_list);
        init_llist_head(&pool->drop_list);
        init_llist_head(&pool->clean_list);
+       spin_lock_init(&pool->clean_lock);
        mutex_init(&pool->flush_lock);
        init_waitqueue_head(&pool->flush_wait);
        INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
@@ -649,3 +702,12 @@ void rds_ib_mr_exit(void)
 {
        destroy_workqueue(rds_ib_mr_wq);
 }
+
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+       struct rds_ib_mr *ibmr;
+
+       ibmr = container_of(work, struct rds_ib_mr, work.work);
+       ib_dereg_mr(ibmr->u.mr);
+       kfree(ibmr);
+}