drivers/infiniband/hw/mlx4/qp.c
1 /*
2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
3  * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33
34 #include <linux/log2.h>
35 #include <linux/etherdevice.h>
36 #include <net/ip.h>
37 #include <linux/slab.h>
38 #include <linux/netdevice.h>
39
40 #include <rdma/ib_cache.h>
41 #include <rdma/ib_pack.h>
42 #include <rdma/ib_addr.h>
43 #include <rdma/ib_mad.h>
44 #include <rdma/uverbs_ioctl.h>
45
46 #include <linux/mlx4/driver.h>
47 #include <linux/mlx4/qp.h>
48
49 #include "mlx4_ib.h"
50 #include <rdma/mlx4-abi.h>
51
52 static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
53                              struct mlx4_ib_cq *recv_cq);
54 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
55                                struct mlx4_ib_cq *recv_cq);
56 static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
57                               struct ib_udata *udata);
58
59 enum {
60         MLX4_IB_ACK_REQ_FREQ    = 8,
61 };
62
63 enum {
64         MLX4_IB_DEFAULT_SCHED_QUEUE     = 0x83,
65         MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
66         MLX4_IB_LINK_TYPE_IB            = 0,
67         MLX4_IB_LINK_TYPE_ETH           = 1
68 };
69
70 enum {
71         /*
72          * Largest possible UD header: send with GRH and immediate
73          * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
74          * tag.  (LRH would only use 8 bytes, so Ethernet is the
75          * biggest case)
76          */
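        /*
         * A back-of-the-envelope check of the 82 bytes, assuming the usual
         * header sizes: 18 (Ethernet + 802.1Q) + 40 (GRH) + 12 (BTH) +
         * 8 (DETH) + 4 (immediate data) = 82.
         */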
77         MLX4_IB_UD_HEADER_SIZE          = 82,
78         MLX4_IB_LSO_HEADER_SPARE        = 128,
79 };
80
81 struct mlx4_ib_sqp {
82         struct mlx4_ib_qp       qp;
83         int                     pkey_index;
84         u32                     qkey;
85         u32                     send_psn;
86         struct ib_ud_header     ud_header;
87         u8                      header_buf[MLX4_IB_UD_HEADER_SIZE];
88         struct ib_qp            *roce_v2_gsi;
89 };
90
91 enum {
92         MLX4_IB_MIN_SQ_STRIDE   = 6,
93         MLX4_IB_CACHE_LINE_SIZE = 64,
94 };
95
96 enum {
97         MLX4_RAW_QP_MTU         = 7,
98         MLX4_RAW_QP_MSGMAX      = 31,
99 };
100
101 #ifndef ETH_ALEN
102 #define ETH_ALEN        6
103 #endif
104
105 static const __be32 mlx4_ib_opcode[] = {
106         [IB_WR_SEND]                            = cpu_to_be32(MLX4_OPCODE_SEND),
107         [IB_WR_LSO]                             = cpu_to_be32(MLX4_OPCODE_LSO),
108         [IB_WR_SEND_WITH_IMM]                   = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
109         [IB_WR_RDMA_WRITE]                      = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
110         [IB_WR_RDMA_WRITE_WITH_IMM]             = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
111         [IB_WR_RDMA_READ]                       = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
112         [IB_WR_ATOMIC_CMP_AND_SWP]              = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
113         [IB_WR_ATOMIC_FETCH_AND_ADD]            = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
114         [IB_WR_SEND_WITH_INV]                   = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
115         [IB_WR_LOCAL_INV]                       = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
116         [IB_WR_REG_MR]                          = cpu_to_be32(MLX4_OPCODE_FMR),
117         [IB_WR_MASKED_ATOMIC_CMP_AND_SWP]       = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
118         [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]     = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
119 };
120
121 enum mlx4_ib_source_type {
122         MLX4_IB_QP_SRC  = 0,
123         MLX4_IB_RWQ_SRC = 1,
124 };
125
126 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
127 {
128         return container_of(mqp, struct mlx4_ib_sqp, qp);
129 }
130
131 static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
132 {
133         if (!mlx4_is_master(dev->dev))
134                 return 0;
135
136         return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
137                qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
138                 8 * MLX4_MFUNC_MAX;
139 }
140
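/*
 * The range checks below imply that the four real special QPs are laid out
 * consecutively from base_sqpn: QP0 for each physical port first, then QP1
 * for each port (which is why is_qp0() only looks at base_sqpn and
 * base_sqpn + 1).
 */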
141 static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
142 {
143         int proxy_sqp = 0;
144         int real_sqp = 0;
145         int i;
146         /* PPF or Native -- real SQP */
147         real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
148                     qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
149                     qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
150         if (real_sqp)
151                 return 1;
152         /* VF or PF -- proxy SQP */
153         if (mlx4_is_mfunc(dev->dev)) {
154                 for (i = 0; i < dev->dev->caps.num_ports; i++) {
155                         if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy ||
156                             qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp1_proxy) {
157                                 proxy_sqp = 1;
158                                 break;
159                         }
160                 }
161         }
162         if (proxy_sqp)
163                 return 1;
164
165         return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP);
166 }
167
168 /* used for INIT/CLOSE port logic */
169 static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
170 {
171         int proxy_qp0 = 0;
172         int real_qp0 = 0;
173         int i;
174         /* PPF or Native -- real QP0 */
175         real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
176                     qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
177                     qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
178         if (real_qp0)
179                 return 1;
180         /* VF or PF -- proxy QP0 */
181         if (mlx4_is_mfunc(dev->dev)) {
182                 for (i = 0; i < dev->dev->caps.num_ports; i++) {
183                         if (qp->mqp.qpn == dev->dev->caps.spec_qps[i].qp0_proxy) {
184                                 proxy_qp0 = 1;
185                                 break;
186                         }
187                 }
188         }
189         return proxy_qp0;
190 }
191
192 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
193 {
194         return mlx4_buf_offset(&qp->buf, offset);
195 }
196
197 static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
198 {
199         return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
200 }
201
202 static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
203 {
204         return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
205 }
206
207 /*
208  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
209  * first four bytes of every 64 byte chunk with 0xffffffff, except for
210  * the very first chunk of the WQE.
211  */
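/*
 * Note: the low 6 bits of qpn_vlan.fence_size hold the descriptor size in
 * 16-byte units, so (fence_size & 0x3f) << 4 below is the WQE size in
 * bytes; a minimal 64-byte WQE consists of a single chunk and is left
 * untouched.
 */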
212 static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
213 {
214         __be32 *wqe;
215         int i;
216         int s;
217         void *buf;
218         struct mlx4_wqe_ctrl_seg *ctrl;
219
220         buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
221         ctrl = (struct mlx4_wqe_ctrl_seg *)buf;
222         s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4;
223         for (i = 64; i < s; i += 64) {
224                 wqe = buf + i;
225                 *wqe = cpu_to_be32(0xffffffff);
226         }
227 }
228
229 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
230 {
231         struct ib_event event;
232         struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
233
234         if (type == MLX4_EVENT_TYPE_PATH_MIG)
235                 to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
236
237         if (ibqp->event_handler) {
238                 event.device     = ibqp->device;
239                 event.element.qp = ibqp;
240                 switch (type) {
241                 case MLX4_EVENT_TYPE_PATH_MIG:
242                         event.event = IB_EVENT_PATH_MIG;
243                         break;
244                 case MLX4_EVENT_TYPE_COMM_EST:
245                         event.event = IB_EVENT_COMM_EST;
246                         break;
247                 case MLX4_EVENT_TYPE_SQ_DRAINED:
248                         event.event = IB_EVENT_SQ_DRAINED;
249                         break;
250                 case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
251                         event.event = IB_EVENT_QP_LAST_WQE_REACHED;
252                         break;
253                 case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
254                         event.event = IB_EVENT_QP_FATAL;
255                         break;
256                 case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
257                         event.event = IB_EVENT_PATH_MIG_ERR;
258                         break;
259                 case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
260                         event.event = IB_EVENT_QP_REQ_ERR;
261                         break;
262                 case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
263                         event.event = IB_EVENT_QP_ACCESS_ERR;
264                         break;
265                 default:
266                         pr_warn("Unexpected event type %d "
267                                "on QP %06x\n", type, qp->qpn);
268                         return;
269                 }
270
271                 ibqp->event_handler(&event, ibqp->qp_context);
272         }
273 }
274
275 static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type)
276 {
277         pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n",
278                             type, qp->qpn);
279 }
280
281 static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
282 {
283         /*
284          * UD WQEs must have a datagram segment.
285          * RC and UC WQEs might have a remote address segment.
286          * MLX WQEs need two extra inline data segments (for the UD
287          * header and space for the ICRC).
288          */
289         switch (type) {
290         case MLX4_IB_QPT_UD:
291                 return sizeof (struct mlx4_wqe_ctrl_seg) +
292                         sizeof (struct mlx4_wqe_datagram_seg) +
293                         ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
294         case MLX4_IB_QPT_PROXY_SMI_OWNER:
295         case MLX4_IB_QPT_PROXY_SMI:
296         case MLX4_IB_QPT_PROXY_GSI:
297                 return sizeof (struct mlx4_wqe_ctrl_seg) +
298                         sizeof (struct mlx4_wqe_datagram_seg) + 64;
299         case MLX4_IB_QPT_TUN_SMI_OWNER:
300         case MLX4_IB_QPT_TUN_GSI:
301                 return sizeof (struct mlx4_wqe_ctrl_seg) +
302                         sizeof (struct mlx4_wqe_datagram_seg);
303
304         case MLX4_IB_QPT_UC:
305                 return sizeof (struct mlx4_wqe_ctrl_seg) +
306                         sizeof (struct mlx4_wqe_raddr_seg);
307         case MLX4_IB_QPT_RC:
308                 return sizeof (struct mlx4_wqe_ctrl_seg) +
309                         sizeof (struct mlx4_wqe_masked_atomic_seg) +
310                         sizeof (struct mlx4_wqe_raddr_seg);
311         case MLX4_IB_QPT_SMI:
312         case MLX4_IB_QPT_GSI:
313                 return sizeof (struct mlx4_wqe_ctrl_seg) +
314                         ALIGN(MLX4_IB_UD_HEADER_SIZE +
315                               DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
316                                            MLX4_INLINE_ALIGN) *
317                               sizeof (struct mlx4_wqe_inline_seg),
318                               sizeof (struct mlx4_wqe_data_seg)) +
319                         ALIGN(4 +
320                               sizeof (struct mlx4_wqe_inline_seg),
321                               sizeof (struct mlx4_wqe_data_seg));
322         default:
323                 return sizeof (struct mlx4_wqe_ctrl_seg);
324         }
325 }
326
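/*
 * Example of the sizing below (kernel QP, no inline receive): asking for
 * max_recv_wr = 100 and max_recv_sge = 3 yields wqe_cnt = 128, max_gs = 4
 * and a stride of 4 * sizeof(struct mlx4_wqe_data_seg) = 64 bytes, i.e.
 * wqe_shift = 6.
 */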
327 static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
328                        bool is_user, int has_rq, struct mlx4_ib_qp *qp,
329                        u32 inl_recv_sz)
330 {
331         /* Sanity check RQ size before proceeding */
332         if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
333             cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
334                 return -EINVAL;
335
336         if (!has_rq) {
337                 if (cap->max_recv_wr || inl_recv_sz)
338                         return -EINVAL;
339
340                 qp->rq.wqe_cnt = qp->rq.max_gs = 0;
341         } else {
342                 u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg *
343                         sizeof(struct mlx4_wqe_data_seg);
344                 u32 wqe_size;
345
346                 /* HW requires >= 1 RQ entry with >= 1 gather entry */
347                 if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge ||
348                                 inl_recv_sz > max_inl_recv_sz))
349                         return -EINVAL;
350
351                 qp->rq.wqe_cnt   = roundup_pow_of_two(max(1U, cap->max_recv_wr));
352                 qp->rq.max_gs    = roundup_pow_of_two(max(1U, cap->max_recv_sge));
353                 wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg);
354                 qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz));
355         }
356
357         /* leave userspace return values as they were, so as not to break ABI */
358         if (is_user) {
359                 cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
360                 cap->max_recv_sge = qp->rq.max_gs;
361         } else {
362                 cap->max_recv_wr  = qp->rq.max_post =
363                         min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
364                 cap->max_recv_sge = min(qp->rq.max_gs,
365                                         min(dev->dev->caps.max_sq_sg,
366                                             dev->dev->caps.max_rq_sg));
367         }
368
369         return 0;
370 }
371
372 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
373                               enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
374 {
375         int s;
376
377         /* Sanity check SQ size before proceeding */
378         if (cap->max_send_wr  > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
379             cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
380             cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
381             sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
382                 return -EINVAL;
383
384         /*
385          * For MLX transport we need 2 extra S/G entries:
386          * one for the header and one for the checksum at the end
387          */
388         if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
389              type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
390             cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
391                 return -EINVAL;
392
393         s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
394                 cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
395                 send_wqe_overhead(type, qp->flags);
396
397         if (s > dev->dev->caps.max_sq_desc_sz)
398                 return -EINVAL;
399
400         qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
401
402         /*
403          * We need to leave 2 KB + 1 WR of headroom in the SQ to
404          * allow HW to prefetch.
405          */
406         qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
407         qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr +
408                                             qp->sq_spare_wqes);
409
410         qp->sq.max_gs =
411                 (min(dev->dev->caps.max_sq_desc_sz,
412                      (1 << qp->sq.wqe_shift)) -
413                  send_wqe_overhead(type, qp->flags)) /
414                 sizeof (struct mlx4_wqe_data_seg);
415
416         qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
417                 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
418         if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
419                 qp->rq.offset = 0;
420                 qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
421         } else {
422                 qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
423                 qp->sq.offset = 0;
424         }
425
426         cap->max_send_wr  = qp->sq.max_post =
427                 qp->sq.wqe_cnt - qp->sq_spare_wqes;
428         cap->max_send_sge = min(qp->sq.max_gs,
429                                 min(dev->dev->caps.max_sq_sg,
430                                     dev->dev->caps.max_rq_sg));
431         /* We don't support inline sends for kernel QPs (yet) */
432         cap->max_inline_data = 0;
433
434         return 0;
435 }
436
437 static int set_user_sq_size(struct mlx4_ib_dev *dev,
438                             struct mlx4_ib_qp *qp,
439                             struct mlx4_ib_create_qp *ucmd)
440 {
441         /* Sanity check SQ size before proceeding */
442         if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes       ||
443             ucmd->log_sq_stride >
444                 ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
445             ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
446                 return -EINVAL;
447
448         qp->sq.wqe_cnt   = 1 << ucmd->log_sq_bb_count;
449         qp->sq.wqe_shift = ucmd->log_sq_stride;
450
451         qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
452                 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
453
454         return 0;
455 }
456
457 static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
458 {
459         int i;
460
461         qp->sqp_proxy_rcv =
462                 kmalloc_array(qp->rq.wqe_cnt, sizeof(struct mlx4_ib_buf),
463                               GFP_KERNEL);
464         if (!qp->sqp_proxy_rcv)
465                 return -ENOMEM;
466         for (i = 0; i < qp->rq.wqe_cnt; i++) {
467                 qp->sqp_proxy_rcv[i].addr =
468                         kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
469                                 GFP_KERNEL);
470                 if (!qp->sqp_proxy_rcv[i].addr)
471                         goto err;
472                 qp->sqp_proxy_rcv[i].map =
473                         ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
474                                           sizeof (struct mlx4_ib_proxy_sqp_hdr),
475                                           DMA_FROM_DEVICE);
476                 if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) {
477                         kfree(qp->sqp_proxy_rcv[i].addr);
478                         goto err;
479                 }
480         }
481         return 0;
482
483 err:
484         while (i > 0) {
485                 --i;
486                 ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
487                                     sizeof (struct mlx4_ib_proxy_sqp_hdr),
488                                     DMA_FROM_DEVICE);
489                 kfree(qp->sqp_proxy_rcv[i].addr);
490         }
491         kfree(qp->sqp_proxy_rcv);
492         qp->sqp_proxy_rcv = NULL;
493         return -ENOMEM;
494 }
495
496 static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
497 {
498         int i;
499
500         for (i = 0; i < qp->rq.wqe_cnt; i++) {
501                 ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
502                                     sizeof (struct mlx4_ib_proxy_sqp_hdr),
503                                     DMA_FROM_DEVICE);
504                 kfree(qp->sqp_proxy_rcv[i].addr);
505         }
506         kfree(qp->sqp_proxy_rcv);
507 }
508
509 static int qp_has_rq(struct ib_qp_init_attr *attr)
510 {
511         if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
512                 return 0;
513
514         return !attr->srq;
515 }
516
517 static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
518 {
519         int i;
520         for (i = 0; i < dev->caps.num_ports; i++) {
521                 if (qpn == dev->caps.spec_qps[i].qp0_proxy)
522                         return !!dev->caps.spec_qps[i].qp0_qkey;
523         }
524         return 0;
525 }
526
527 static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev,
528                                     struct mlx4_ib_qp *qp)
529 {
530         mutex_lock(&dev->counters_table[qp->port - 1].mutex);
531         mlx4_counter_free(dev->dev, qp->counter_index->index);
532         list_del(&qp->counter_index->list);
533         mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
534
535         kfree(qp->counter_index);
536         qp->counter_index = NULL;
537 }
538
539 static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx,
540                       struct ib_qp_init_attr *init_attr,
541                       struct mlx4_ib_create_qp_rss *ucmd)
542 {
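        /*
         * base_qpn_tbl_sz packs the base WQN of the indirection table into
         * the low 24 bits and log2 of the table size into the bits above.
         */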
543         rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num |
544                 (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24);
545
546         if ((ucmd->rx_hash_function == MLX4_IB_RX_HASH_FUNC_TOEPLITZ) &&
547             (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) {
548                 memcpy(rss_ctx->rss_key, ucmd->rx_hash_key,
549                        MLX4_EN_RSS_KEY_SIZE);
550         } else {
551                 pr_debug("RX Hash function is not supported\n");
552                 return (-EOPNOTSUPP);
553         }
554
555         if (ucmd->rx_hash_fields_mask & ~(MLX4_IB_RX_HASH_SRC_IPV4      |
556                                           MLX4_IB_RX_HASH_DST_IPV4      |
557                                           MLX4_IB_RX_HASH_SRC_IPV6      |
558                                           MLX4_IB_RX_HASH_DST_IPV6      |
559                                           MLX4_IB_RX_HASH_SRC_PORT_TCP  |
560                                           MLX4_IB_RX_HASH_DST_PORT_TCP  |
561                                           MLX4_IB_RX_HASH_SRC_PORT_UDP  |
562                                           MLX4_IB_RX_HASH_DST_PORT_UDP  |
563                                           MLX4_IB_RX_HASH_INNER)) {
564                 pr_debug("RX Hash fields_mask has unsupported mask (0x%llx)\n",
565                          ucmd->rx_hash_fields_mask);
566                 return (-EOPNOTSUPP);
567         }
568
569         if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) &&
570             (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
571                 rss_ctx->flags = MLX4_RSS_IPV4;
572         } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) ||
573                    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) {
574                 pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n");
575                 return (-EOPNOTSUPP);
576         }
577
578         if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) &&
579             (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
580                 rss_ctx->flags |= MLX4_RSS_IPV6;
581         } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) ||
582                    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) {
583                 pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n");
584                 return (-EOPNOTSUPP);
585         }
586
587         if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) &&
588             (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
589                 if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) {
590                         pr_debug("RX Hash fields_mask for UDP is not supported\n");
591                         return (-EOPNOTSUPP);
592                 }
593
594                 if (rss_ctx->flags & MLX4_RSS_IPV4)
595                         rss_ctx->flags |= MLX4_RSS_UDP_IPV4;
596                 if (rss_ctx->flags & MLX4_RSS_IPV6)
597                         rss_ctx->flags |= MLX4_RSS_UDP_IPV6;
598                 if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
599                         pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n");
600                         return (-EOPNOTSUPP);
601                 }
602         } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) ||
603                    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) {
604                 pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n");
605                 return (-EOPNOTSUPP);
606         }
607
608         if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) &&
609             (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
610                 if (rss_ctx->flags & MLX4_RSS_IPV4)
611                         rss_ctx->flags |= MLX4_RSS_TCP_IPV4;
612                 if (rss_ctx->flags & MLX4_RSS_IPV6)
613                         rss_ctx->flags |= MLX4_RSS_TCP_IPV6;
614                 if (!(rss_ctx->flags & (MLX4_RSS_IPV6 | MLX4_RSS_IPV4))) {
615                         pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n");
616                         return (-EOPNOTSUPP);
617                 }
618         } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) ||
619                    (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) {
620                 pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n");
621                 return (-EOPNOTSUPP);
622         }
623
624         if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) {
625                 if (dev->dev->caps.tunnel_offload_mode ==
626                     MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
627                         /*
628                          * Hash according to the inner headers if they exist,
629                          * otherwise according to the outer headers.
630                          */
631                         rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY;
632                 } else {
633                         pr_debug("RSS Hash for inner headers isn't supported\n");
634                         return (-EOPNOTSUPP);
635                 }
636         }
637
638         return 0;
639 }
640
641 static int create_qp_rss(struct mlx4_ib_dev *dev,
642                          struct ib_qp_init_attr *init_attr,
643                          struct mlx4_ib_create_qp_rss *ucmd,
644                          struct mlx4_ib_qp *qp)
645 {
646         int qpn;
647         int err;
648
649         qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
650
651         err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage);
652         if (err)
653                 return err;
654
655         err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
656         if (err)
657                 goto err_qpn;
658
659         mutex_init(&qp->mutex);
660
661         INIT_LIST_HEAD(&qp->gid_list);
662         INIT_LIST_HEAD(&qp->steering_rules);
663
664         qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
665         qp->state = IB_QPS_RESET;
666
667         /* Set dummy send resources to be compatible with HV and PRM */
668         qp->sq_no_prefetch = 1;
669         qp->sq.wqe_cnt = 1;
670         qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
671         qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE;
672         qp->mtt = (to_mqp(
673                    (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt;
674
675         qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL);
676         if (!qp->rss_ctx) {
677                 err = -ENOMEM;
678                 goto err_qp_alloc;
679         }
680
681         err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd);
682         if (err)
683                 goto err;
684
685         return 0;
686
687 err:
688         kfree(qp->rss_ctx);
689
690 err_qp_alloc:
691         mlx4_qp_remove(dev->dev, &qp->mqp);
692         mlx4_qp_free(dev->dev, &qp->mqp);
693
694 err_qpn:
695         mlx4_qp_release_range(dev->dev, qpn, 1);
696         return err;
697 }
698
699 static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd,
700                                             struct ib_qp_init_attr *init_attr,
701                                             struct ib_udata *udata)
702 {
703         struct mlx4_ib_qp *qp;
704         struct mlx4_ib_create_qp_rss ucmd = {};
705         size_t required_cmd_sz;
706         int err;
707
708         if (!udata) {
709                 pr_debug("RSS QP with NULL udata\n");
710                 return ERR_PTR(-EINVAL);
711         }
712
713         if (udata->outlen)
714                 return ERR_PTR(-EOPNOTSUPP);
715
716         required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
717                                         sizeof(ucmd.reserved1);
718         if (udata->inlen < required_cmd_sz) {
719                 pr_debug("invalid inlen\n");
720                 return ERR_PTR(-EINVAL);
721         }
722
723         if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
724                 pr_debug("copy failed\n");
725                 return ERR_PTR(-EFAULT);
726         }
727
728         if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)))
729                 return ERR_PTR(-EOPNOTSUPP);
730
731         if (ucmd.comp_mask || ucmd.reserved1)
732                 return ERR_PTR(-EOPNOTSUPP);
733
734         if (udata->inlen > sizeof(ucmd) &&
735             !ib_is_udata_cleared(udata, sizeof(ucmd),
736                                  udata->inlen - sizeof(ucmd))) {
737                 pr_debug("inlen is not supported\n");
738                 return ERR_PTR(-EOPNOTSUPP);
739         }
740
741         if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
742                 pr_debug("RSS QP with unsupported QP type %d\n",
743                          init_attr->qp_type);
744                 return ERR_PTR(-EOPNOTSUPP);
745         }
746
747         if (init_attr->create_flags) {
748                 pr_debug("RSS QP doesn't support create flags\n");
749                 return ERR_PTR(-EOPNOTSUPP);
750         }
751
752         if (init_attr->send_cq || init_attr->cap.max_send_wr) {
753                 pr_debug("RSS QP with unsupported send attributes\n");
754                 return ERR_PTR(-EOPNOTSUPP);
755         }
756
757         qp = kzalloc(sizeof(*qp), GFP_KERNEL);
758         if (!qp)
759                 return ERR_PTR(-ENOMEM);
760
761         qp->pri.vid = 0xFFFF;
762         qp->alt.vid = 0xFFFF;
763
764         err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp);
765         if (err) {
766                 kfree(qp);
767                 return ERR_PTR(err);
768         }
769
770         qp->ibqp.qp_num = qp->mqp.qpn;
771
772         return &qp->ibqp;
773 }
774
775 /*
776  * This function allocates a WQN from a range that is consecutive and aligned
777  * to its size. If the range is full, it creates a new range and allocates a
778  * WQN from it. The new range is then used for subsequent allocations.
779  */
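/*
 * For example, with a range size of 8, eight consecutive WQs receive WQNs
 * from one reserved, size-aligned block; the ninth request (or any request
 * after a WQ from the block has been destroyed, see the dirty flag) makes
 * the driver reserve a fresh block.
 */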
780 static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context,
781                              struct mlx4_ib_qp *qp, int range_size, int *wqn)
782 {
783         struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
784         struct mlx4_wqn_range *range;
785         int err = 0;
786
787         mutex_lock(&context->wqn_ranges_mutex);
788
789         range = list_first_entry_or_null(&context->wqn_ranges_list,
790                                          struct mlx4_wqn_range, list);
791
792         if (!range || (range->refcount == range->size) || range->dirty) {
793                 range = kzalloc(sizeof(*range), GFP_KERNEL);
794                 if (!range) {
795                         err = -ENOMEM;
796                         goto out;
797                 }
798
799                 err = mlx4_qp_reserve_range(dev->dev, range_size,
800                                             range_size, &range->base_wqn, 0,
801                                             qp->mqp.usage);
802                 if (err) {
803                         kfree(range);
804                         goto out;
805                 }
806
807                 range->size = range_size;
808                 list_add(&range->list, &context->wqn_ranges_list);
809         } else if (range_size != 1) {
810                 /*
811                  * Requesting a new range (>1) while the last range is still
812                  * open is not valid.
813                  */
814                 err = -EINVAL;
815                 goto out;
816         }
817
818         qp->wqn_range = range;
819
820         *wqn = range->base_wqn + range->refcount;
821
822         range->refcount++;
823
824 out:
825         mutex_unlock(&context->wqn_ranges_mutex);
826
827         return err;
828 }
829
830 static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context,
831                                 struct mlx4_ib_qp *qp, bool dirty_release)
832 {
833         struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device);
834         struct mlx4_wqn_range *range;
835
836         mutex_lock(&context->wqn_ranges_mutex);
837
838         range = qp->wqn_range;
839
840         range->refcount--;
841         if (!range->refcount) {
842                 mlx4_qp_release_range(dev->dev, range->base_wqn,
843                                       range->size);
844                 list_del(&range->list);
845                 kfree(range);
846         } else if (dirty_release) {
847         /*
848          * A range in which one of the WQNs has been destroyed cannot be
849          * reused for further WQN allocations.
850          * The next created WQ will allocate a new range.
851          */
852                 range->dirty = 1;
853         }
854
855         mutex_unlock(&context->wqn_ranges_mutex);
856 }
857
858 static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
859                             enum mlx4_ib_source_type src,
860                             struct ib_qp_init_attr *init_attr,
861                             struct ib_udata *udata, int sqpn,
862                             struct mlx4_ib_qp **caller_qp)
863 {
864         int qpn;
865         int err;
866         struct mlx4_ib_sqp *sqp = NULL;
867         struct mlx4_ib_qp *qp;
868         struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
869                 udata, struct mlx4_ib_ucontext, ibucontext);
870         enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
871         struct mlx4_ib_cq *mcq;
872         unsigned long flags;
873         int range_size = 0;
874
875         /* When tunneling special qps, we use a plain UD qp */
876         if (sqpn) {
877                 if (mlx4_is_mfunc(dev->dev) &&
878                     (!mlx4_is_master(dev->dev) ||
879                      !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
880                         if (init_attr->qp_type == IB_QPT_GSI)
881                                 qp_type = MLX4_IB_QPT_PROXY_GSI;
882                         else {
883                                 if (mlx4_is_master(dev->dev) ||
884                                     qp0_enabled_vf(dev->dev, sqpn))
885                                         qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
886                                 else
887                                         qp_type = MLX4_IB_QPT_PROXY_SMI;
888                         }
889                 }
890                 qpn = sqpn;
891                 /* add extra sg entry for tunneling */
892                 init_attr->cap.max_recv_sge++;
893         } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
894                 struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
895                         container_of(init_attr,
896                                      struct mlx4_ib_qp_tunnel_init_attr, init_attr);
897                 if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
898                      tnl_init->proxy_qp_type != IB_QPT_GSI)   ||
899                     !mlx4_is_master(dev->dev))
900                         return -EINVAL;
901                 if (tnl_init->proxy_qp_type == IB_QPT_GSI)
902                         qp_type = MLX4_IB_QPT_TUN_GSI;
903                 else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
904                          mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
905                                              tnl_init->port))
906                         qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
907                 else
908                         qp_type = MLX4_IB_QPT_TUN_SMI;
909                 /* we are definitely in the PPF here, since we are creating
910                  * tunnel QPs. base_tunnel_sqpn is therefore valid. */
911                 qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
912                         + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
913                 sqpn = qpn;
914         }
915
916         if (!*caller_qp) {
917                 if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
918                     (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
919                                 MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
920                         sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL);
921                         if (!sqp)
922                                 return -ENOMEM;
923                         qp = &sqp->qp;
924                         qp->pri.vid = 0xFFFF;
925                         qp->alt.vid = 0xFFFF;
926                 } else {
927                         qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL);
928                         if (!qp)
929                                 return -ENOMEM;
930                         qp->pri.vid = 0xFFFF;
931                         qp->alt.vid = 0xFFFF;
932                 }
933         } else
934                 qp = *caller_qp;
935
936         qp->mlx4_ib_qp_type = qp_type;
937
938         mutex_init(&qp->mutex);
939         spin_lock_init(&qp->sq.lock);
940         spin_lock_init(&qp->rq.lock);
941         INIT_LIST_HEAD(&qp->gid_list);
942         INIT_LIST_HEAD(&qp->steering_rules);
943
944         qp->state        = IB_QPS_RESET;
945         if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
946                 qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
947
948
949         if (udata) {
950                 union {
951                         struct mlx4_ib_create_qp qp;
952                         struct mlx4_ib_create_wq wq;
953                 } ucmd;
954                 size_t copy_len;
955                 int shift;
956                 int n;
957
958                 copy_len = (src == MLX4_IB_QP_SRC) ?
959                            sizeof(struct mlx4_ib_create_qp) :
960                            min(sizeof(struct mlx4_ib_create_wq), udata->inlen);
961
962                 if (ib_copy_from_udata(&ucmd, udata, copy_len)) {
963                         err = -EFAULT;
964                         goto err;
965                 }
966
967                 if (src == MLX4_IB_RWQ_SRC) {
968                         if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] ||
969                             ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) {
970                                 pr_debug("user command isn't supported\n");
971                                 err = -EOPNOTSUPP;
972                                 goto err;
973                         }
974
975                         if (ucmd.wq.log_range_size >
976                             ilog2(dev->dev->caps.max_rss_tbl_sz)) {
977                                 pr_debug("WQN range size must be equal to or smaller than %d\n",
978                                          dev->dev->caps.max_rss_tbl_sz);
979                                 err = -EOPNOTSUPP;
980                                 goto err;
981                         }
982                         range_size = 1 << ucmd.wq.log_range_size;
983                 } else {
984                         qp->inl_recv_sz = ucmd.qp.inl_recv_sz;
985                 }
986
987                 if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
988                         if (!(dev->dev->caps.flags &
989                               MLX4_DEV_CAP_FLAG_FCS_KEEP)) {
990                                 pr_debug("scatter FCS is unsupported\n");
991                                 err = -EOPNOTSUPP;
992                                 goto err;
993                         }
994
995                         qp->flags |= MLX4_IB_QP_SCATTER_FCS;
996                 }
997
998                 err = set_rq_size(dev, &init_attr->cap, udata,
999                                   qp_has_rq(init_attr), qp, qp->inl_recv_sz);
1000                 if (err)
1001                         goto err;
1002
1003                 if (src == MLX4_IB_QP_SRC) {
1004                         qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch;
1005
1006                         err = set_user_sq_size(dev, qp,
1007                                                (struct mlx4_ib_create_qp *)
1008                                                &ucmd);
1009                         if (err)
1010                                 goto err;
1011                 } else {
1012                         qp->sq_no_prefetch = 1;
1013                         qp->sq.wqe_cnt = 1;
1014                         qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE;
1015                         /* The user-allocated buffer is expected to include
1016                          * at least this SQ size.
1017                          */
1018                         qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
1019                                 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
1020                 }
1021
1022                 qp->umem =
1023                         ib_umem_get(udata,
1024                                     (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr :
1025                                                               ucmd.wq.buf_addr,
1026                                     qp->buf_size, 0, 0);
1027                 if (IS_ERR(qp->umem)) {
1028                         err = PTR_ERR(qp->umem);
1029                         goto err;
1030                 }
1031
1032                 n = ib_umem_page_count(qp->umem);
1033                 shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
1034                 err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
1035
1036                 if (err)
1037                         goto err_buf;
1038
1039                 err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
1040                 if (err)
1041                         goto err_mtt;
1042
1043                 if (qp_has_rq(init_attr)) {
1044                         err = mlx4_ib_db_map_user(
1045                                 context, udata,
1046                                 (src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
1047                                                           ucmd.wq.db_addr,
1048                                 &qp->db);
1049                         if (err)
1050                                 goto err_mtt;
1051                 }
1052                 qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS;
1053         } else {
1054                 err = set_rq_size(dev, &init_attr->cap, udata,
1055                                   qp_has_rq(init_attr), qp, 0);
1056                 if (err)
1057                         goto err;
1058
1059                 qp->sq_no_prefetch = 0;
1060
1061                 if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
1062                         qp->flags |= MLX4_IB_QP_LSO;
1063
1064                 if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
1065                         if (dev->steering_support ==
1066                             MLX4_STEERING_MODE_DEVICE_MANAGED)
1067                                 qp->flags |= MLX4_IB_QP_NETIF;
1068                         else
1069                                 goto err;
1070                 }
1071
1072                 err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
1073                 if (err)
1074                         goto err;
1075
1076                 if (qp_has_rq(init_attr)) {
1077                         err = mlx4_db_alloc(dev->dev, &qp->db, 0);
1078                         if (err)
1079                                 goto err;
1080
1081                         *qp->db.db = 0;
1082                 }
1083
1084                 if (mlx4_buf_alloc(dev->dev, qp->buf_size,  PAGE_SIZE * 2,
1085                                    &qp->buf)) {
1086                         err = -ENOMEM;
1087                         goto err_db;
1088                 }
1089
1090                 err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
1091                                     &qp->mtt);
1092                 if (err)
1093                         goto err_buf;
1094
1095                 err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
1096                 if (err)
1097                         goto err_mtt;
1098
1099                 qp->sq.wrid = kvmalloc_array(qp->sq.wqe_cnt,
1100                                              sizeof(u64), GFP_KERNEL);
1101                 qp->rq.wrid = kvmalloc_array(qp->rq.wqe_cnt,
1102                                              sizeof(u64), GFP_KERNEL);
1103                 if (!qp->sq.wrid || !qp->rq.wrid) {
1104                         err = -ENOMEM;
1105                         goto err_wrid;
1106                 }
1107                 qp->mqp.usage = MLX4_RES_USAGE_DRIVER;
1108         }
1109
1110         if (sqpn) {
1111                 if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
1112                     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
1113                         if (alloc_proxy_bufs(pd->device, qp)) {
1114                                 err = -ENOMEM;
1115                                 goto err_wrid;
1116                         }
1117                 }
1118         } else if (src == MLX4_IB_RWQ_SRC) {
1119                 err = mlx4_ib_alloc_wqn(context, qp, range_size, &qpn);
1120                 if (err)
1121                         goto err_wrid;
1122         } else {
1123                 /* Raw packet QPNs may not have bits 6,7 set in their qp_num;
1124                  * otherwise, the WQE BlueFlame setup flow wrongly causes
1125                  * VLAN insertion. */
1126                 if (init_attr->qp_type == IB_QPT_RAW_PACKET)
1127                         err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
1128                                                     (init_attr->cap.max_send_wr ?
1129                                                      MLX4_RESERVE_ETH_BF_QP : 0) |
1130                                                     (init_attr->cap.max_recv_wr ?
1131                                                      MLX4_RESERVE_A0_QP : 0),
1132                                                     qp->mqp.usage);
1133                 else
1134                         if (qp->flags & MLX4_IB_QP_NETIF)
1135                                 err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
1136                         else
1137                                 err = mlx4_qp_reserve_range(dev->dev, 1, 1,
1138                                                             &qpn, 0, qp->mqp.usage);
1139                 if (err)
1140                         goto err_proxy;
1141         }
1142
1143         if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
1144                 qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1145
1146         err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
1147         if (err)
1148                 goto err_qpn;
1149
1150         if (init_attr->qp_type == IB_QPT_XRC_TGT)
1151                 qp->mqp.qpn |= (1 << 23);
1152
1153         /*
1154          * Hardware wants QPN written in big-endian order (after
1155          * shifting) for send doorbell.  Precompute this value to save
1156          * a little bit when posting sends.
1157          */
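        /*
         * For example, QPN 0x41 becomes 0x4100 after the shift; on a
         * little-endian host swab32() stores it with the byte layout of the
         * big-endian value 0x00004100 that the doorbell register expects.
         */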
1158         qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
1159
1160         qp->mqp.event = (src == MLX4_IB_QP_SRC) ? mlx4_ib_qp_event :
1161                                                   mlx4_ib_wq_event;
1162
1163         if (!*caller_qp)
1164                 *caller_qp = qp;
1165
1166         spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1167         mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
1168                          to_mcq(init_attr->recv_cq));
1169         /* Keep the QP on the device-wide list; the reset flow needs it
1170          * for further handling.
1171          */
1172         list_add_tail(&qp->qps_list, &dev->qp_list);
1173         /* Keep the QP on its CQs' lists as well; the reset flow needs
1174          * them for further handling.
1175          */
1176         mcq = to_mcq(init_attr->send_cq);
1177         list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
1178         mcq = to_mcq(init_attr->recv_cq);
1179         list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
1180         mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
1181                            to_mcq(init_attr->recv_cq));
1182         spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1183         return 0;
1184
1185 err_qpn:
1186         if (!sqpn) {
1187                 if (qp->flags & MLX4_IB_QP_NETIF)
1188                         mlx4_ib_steer_qp_free(dev, qpn, 1);
1189                 else if (src == MLX4_IB_RWQ_SRC)
1190                         mlx4_ib_release_wqn(context, qp, 0);
1191                 else
1192                         mlx4_qp_release_range(dev->dev, qpn, 1);
1193         }
1194 err_proxy:
1195         if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
1196                 free_proxy_bufs(pd->device, qp);
1197 err_wrid:
1198         if (udata) {
1199                 if (qp_has_rq(init_attr))
1200                         mlx4_ib_db_unmap_user(context, &qp->db);
1201         } else {
1202                 kvfree(qp->sq.wrid);
1203                 kvfree(qp->rq.wrid);
1204         }
1205
1206 err_mtt:
1207         mlx4_mtt_cleanup(dev->dev, &qp->mtt);
1208
1209 err_buf:
1210         if (qp->umem)
1211                 ib_umem_release(qp->umem);
1212         else
1213                 mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
1214
1215 err_db:
1216         if (!udata && qp_has_rq(init_attr))
1217                 mlx4_db_free(dev->dev, &qp->db);
1218
1219 err:
1220         if (!sqp && !*caller_qp)
1221                 kfree(qp);
1222         kfree(sqp);
1223
1224         return err;
1225 }
1226
1227 static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
1228 {
1229         switch (state) {
1230         case IB_QPS_RESET:      return MLX4_QP_STATE_RST;
1231         case IB_QPS_INIT:       return MLX4_QP_STATE_INIT;
1232         case IB_QPS_RTR:        return MLX4_QP_STATE_RTR;
1233         case IB_QPS_RTS:        return MLX4_QP_STATE_RTS;
1234         case IB_QPS_SQD:        return MLX4_QP_STATE_SQD;
1235         case IB_QPS_SQE:        return MLX4_QP_STATE_SQER;
1236         case IB_QPS_ERR:        return MLX4_QP_STATE_ERR;
1237         default:                return -1;
1238         }
1239 }
1240
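/*
 * Always take the two CQ locks in a fixed order (lowest CQN first) so that
 * concurrent callers working on the same pair of CQs cannot deadlock;
 * SINGLE_DEPTH_NESTING tells lockdep the nested cq->lock is intentional.
 */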
1241 static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
1242         __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
1243 {
1244         if (send_cq == recv_cq) {
1245                 spin_lock(&send_cq->lock);
1246                 __acquire(&recv_cq->lock);
1247         } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1248                 spin_lock(&send_cq->lock);
1249                 spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
1250         } else {
1251                 spin_lock(&recv_cq->lock);
1252                 spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
1253         }
1254 }
1255
1256 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
1257         __releases(&send_cq->lock) __releases(&recv_cq->lock)
1258 {
1259         if (send_cq == recv_cq) {
1260                 __release(&recv_cq->lock);
1261                 spin_unlock(&send_cq->lock);
1262         } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1263                 spin_unlock(&recv_cq->lock);
1264                 spin_unlock(&send_cq->lock);
1265         } else {
1266                 spin_unlock(&send_cq->lock);
1267                 spin_unlock(&recv_cq->lock);
1268         }
1269 }
1270
1271 static void del_gid_entries(struct mlx4_ib_qp *qp)
1272 {
1273         struct mlx4_ib_gid_entry *ge, *tmp;
1274
1275         list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
1276                 list_del(&ge->list);
1277                 kfree(ge);
1278         }
1279 }
1280
1281 static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
1282 {
1283         if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
1284                 return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
1285         else
1286                 return to_mpd(qp->ibqp.pd);
1287 }
1288
1289 static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src,
1290                     struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
1291 {
1292         switch (qp->ibqp.qp_type) {
1293         case IB_QPT_XRC_TGT:
1294                 *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
1295                 *recv_cq = *send_cq;
1296                 break;
1297         case IB_QPT_XRC_INI:
1298                 *send_cq = to_mcq(qp->ibqp.send_cq);
1299                 *recv_cq = *send_cq;
1300                 break;
1301         default:
1302                 *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) :
1303                                                      to_mcq(qp->ibwq.cq);
1304                 *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) :
1305                                                      *recv_cq;
1306                 break;
1307         }
1308 }
1309
1310 static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1311 {
1312         if (qp->state != IB_QPS_RESET) {
1313                 int i;
1314
1315                 for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size);
1316                      i++) {
1317                         struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i];
1318                         struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
1319
1320                         mutex_lock(&wq->mutex);
1321
1322                         wq->rss_usecnt--;
1323
1324                         mutex_unlock(&wq->mutex);
1325                 }
1326
1327                 if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
1328                                    MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
1329                         pr_warn("modify QP %06x to RESET failed.\n",
1330                                 qp->mqp.qpn);
1331         }
1332
1333         mlx4_qp_remove(dev->dev, &qp->mqp);
1334         mlx4_qp_free(dev->dev, &qp->mqp);
1335         mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
1336         del_gid_entries(qp);
1337         kfree(qp->rss_ctx);
1338 }
1339
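/*
 * Common teardown for kernel and user QPs: move the QP to RESET, release any
 * registered MAC/VLAN entries, unlink it from the CQ and reset-flow lists,
 * and free the hardware and memory resources it owns.
 */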
1340 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
1341                               enum mlx4_ib_source_type src, bool is_user)
1342 {
1343         struct mlx4_ib_cq *send_cq, *recv_cq;
1344         unsigned long flags;
1345
1346         if (qp->state != IB_QPS_RESET) {
1347                 if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
1348                                    MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
1349                         pr_warn("modify QP %06x to RESET failed.\n",
1350                                qp->mqp.qpn);
1351                 if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
1352                         mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
1353                         qp->pri.smac = 0;
1354                         qp->pri.smac_port = 0;
1355                 }
1356                 if (qp->alt.smac) {
1357                         mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
1358                         qp->alt.smac = 0;
1359                 }
1360                 if (qp->pri.vid < 0x1000) {
1361                         mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
1362                         qp->pri.vid = 0xFFFF;
1363                         qp->pri.candidate_vid = 0xFFFF;
1364                         qp->pri.update_vid = 0;
1365                 }
1366                 if (qp->alt.vid < 0x1000) {
1367                         mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
1368                         qp->alt.vid = 0xFFFF;
1369                         qp->alt.candidate_vid = 0xFFFF;
1370                         qp->alt.update_vid = 0;
1371                 }
1372         }
1373
1374         get_cqs(qp, src, &send_cq, &recv_cq);
1375
1376         spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1377         mlx4_ib_lock_cqs(send_cq, recv_cq);
1378
1379         /* del from lists under both locks above to protect reset flow paths */
1380         list_del(&qp->qps_list);
1381         list_del(&qp->cq_send_list);
1382         list_del(&qp->cq_recv_list);
1383         if (!is_user) {
1384                 __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
1385                                  qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
1386                 if (send_cq != recv_cq)
1387                         __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
1388         }
1389
1390         mlx4_qp_remove(dev->dev, &qp->mqp);
1391
1392         mlx4_ib_unlock_cqs(send_cq, recv_cq);
1393         spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1394
1395         mlx4_qp_free(dev->dev, &qp->mqp);
1396
1397         if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
1398                 if (qp->flags & MLX4_IB_QP_NETIF)
1399                         mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
1400                 else if (src == MLX4_IB_RWQ_SRC)
1401                         mlx4_ib_release_wqn(to_mucontext(
1402                                             qp->ibwq.uobject->context), qp, 1);
1403                 else
1404                         mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
1405         }
1406
1407         mlx4_mtt_cleanup(dev->dev, &qp->mtt);
1408
1409         if (is_user) {
1410                 if (qp->rq.wqe_cnt) {
1411                         struct mlx4_ib_ucontext *mcontext = !src ?
1412                                 to_mucontext(qp->ibqp.uobject->context) :
1413                                 to_mucontext(qp->ibwq.uobject->context);
1414                         mlx4_ib_db_unmap_user(mcontext, &qp->db);
1415                 }
1416                 ib_umem_release(qp->umem);
1417         } else {
1418                 kvfree(qp->sq.wrid);
1419                 kvfree(qp->rq.wrid);
1420                 if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
1421                     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
1422                         free_proxy_bufs(&dev->ib_dev, qp);
1423                 mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
1424                 if (qp->rq.wqe_cnt)
1425                         mlx4_db_free(dev->dev, &qp->db);
1426         }
1427
1428         del_gid_entries(qp);
1429 }
1430
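/*
 * Return the QP number to use for a special (SMI/GSI) QP: the real special
 * QP number on a native device (or on the PPF when creating an SR-IOV
 * special QP), otherwise the per-port proxy QP number.
 */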
1431 static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
1432 {
1433         /* Native or PPF */
1434         if (!mlx4_is_mfunc(dev->dev) ||
1435             (mlx4_is_master(dev->dev) &&
1436              attr->create_flags & MLX4_IB_SRIOV_SQP)) {
1437                 return  dev->dev->phys_caps.base_sqpn +
1438                         (attr->qp_type == IB_QPT_SMI ? 0 : 2) +
1439                         attr->port_num - 1;
1440         }
1441         /* PF or VF -- creating proxies */
1442         if (attr->qp_type == IB_QPT_SMI)
1443                 return dev->dev->caps.spec_qps[attr->port_num - 1].qp0_proxy;
1444         else
1445                 return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy;
1446 }
1447
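/*
 * Core QP creation path: validate the create flags, then dispatch on the QP
 * type to allocate and initialize the QP (RSS QPs are handled separately).
 */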
1448 static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
1449                                         struct ib_qp_init_attr *init_attr,
1450                                         struct ib_udata *udata)
1451 {
1452         struct mlx4_ib_qp *qp = NULL;
1453         int err;
1454         int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1455         u16 xrcdn = 0;
1456
1457         if (init_attr->rwq_ind_tbl)
1458                 return _mlx4_ib_create_qp_rss(pd, init_attr, udata);
1459
1460         /*
1461          * We only support LSO, vendor flag1, and multicast loopback blocking,
1462          * and only for kernel UD QPs.
1463          */
1464         if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
1465                                         MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
1466                                         MLX4_IB_SRIOV_TUNNEL_QP |
1467                                         MLX4_IB_SRIOV_SQP |
1468                                         MLX4_IB_QP_NETIF |
1469                                         MLX4_IB_QP_CREATE_ROCE_V2_GSI))
1470                 return ERR_PTR(-EINVAL);
1471
1472         if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
1473                 if (init_attr->qp_type != IB_QPT_UD)
1474                         return ERR_PTR(-EINVAL);
1475         }
1476
1477         if (init_attr->create_flags) {
1478                 if (udata && init_attr->create_flags & ~(sup_u_create_flags))
1479                         return ERR_PTR(-EINVAL);
1480
1481                 if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
1482                                                  MLX4_IB_QP_CREATE_ROCE_V2_GSI  |
1483                                                  MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) &&
1484                      init_attr->qp_type != IB_QPT_UD) ||
1485                     (init_attr->create_flags & MLX4_IB_SRIOV_SQP &&
1486                      init_attr->qp_type > IB_QPT_GSI) ||
1487                     (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
1488                      init_attr->qp_type != IB_QPT_GSI))
1489                         return ERR_PTR(-EINVAL);
1490         }
1491
1492         switch (init_attr->qp_type) {
1493         case IB_QPT_XRC_TGT:
1494                 pd = to_mxrcd(init_attr->xrcd)->pd;
1495                 xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
1496                 init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
1497                 /* fall through */
1498         case IB_QPT_XRC_INI:
1499                 if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
1500                         return ERR_PTR(-ENOSYS);
1501                 init_attr->recv_cq = init_attr->send_cq;
1502                 /* fall through */
1503         case IB_QPT_RC:
1504         case IB_QPT_UC:
1505         case IB_QPT_RAW_PACKET:
1506                 qp = kzalloc(sizeof(*qp), GFP_KERNEL);
1507                 if (!qp)
1508                         return ERR_PTR(-ENOMEM);
1509                 qp->pri.vid = 0xFFFF;
1510                 qp->alt.vid = 0xFFFF;
1511                 /* fall through */
1512         case IB_QPT_UD:
1513         {
1514                 err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC,
1515                                        init_attr, udata, 0, &qp);
1516                 if (err) {
1517                         kfree(qp);
1518                         return ERR_PTR(err);
1519                 }
1520
1521                 qp->ibqp.qp_num = qp->mqp.qpn;
1522                 qp->xrcdn = xrcdn;
1523
1524                 break;
1525         }
1526         case IB_QPT_SMI:
1527         case IB_QPT_GSI:
1528         {
1529                 int sqpn;
1530
1531                 /* Userspace is not allowed to create special QPs: */
1532                 if (udata)
1533                         return ERR_PTR(-EINVAL);
1534                 if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
1535                         int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev,
1536                                                         1, 1, &sqpn, 0,
1537                                                         MLX4_RES_USAGE_DRIVER);
1538
1539                         if (res)
1540                                 return ERR_PTR(res);
1541                 } else {
1542                         sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
1543                 }
1544
1545                 err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC,
1546                                        init_attr, udata, sqpn, &qp);
1547                 if (err)
1548                         return ERR_PTR(err);
1549
1550                 qp->port        = init_attr->port_num;
1551                 qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
1552                         init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1;
1553                 break;
1554         }
1555         default:
1556                 /* Don't support raw QPs */
1557                 return ERR_PTR(-EINVAL);
1558         }
1559
1560         return &qp->ibqp;
1561 }
1562
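/*
 * Verbs entry point for QP creation.  For a GSI QP on an Ethernet port that
 * supports RoCE v1/v2, also create a companion RoCE v2 GSI QP.
 */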
1563 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
1564                                 struct ib_qp_init_attr *init_attr,
1565                                 struct ib_udata *udata)
{
1566         struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
1567         struct ib_qp *ibqp;
1568         struct mlx4_ib_dev *dev = to_mdev(device);
1569
1570         ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
1571
1572         if (!IS_ERR(ibqp) &&
1573             (init_attr->qp_type == IB_QPT_GSI) &&
1574             !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
1575                 struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
1576                 int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
1577
1578                 if (is_eth &&
1579                     dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) {
1580                         init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI;
1581                         sqp->roce_v2_gsi = ib_create_qp(pd, init_attr);
1582
1583                         if (IS_ERR(sqp->roce_v2_gsi)) {
1584                                 pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
1585                                 sqp->roce_v2_gsi = NULL;
1586                         } else {
1587                                 sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
1588                                 sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
1589                         }
1590
1591                         init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
1592                 }
1593         }
1594         return ibqp;
1595 }
1596
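/*
 * Destroy a QP: close the port for QP0, drop the QP1 proxy pointer and any
 * counter, then release the QP's resources via the RSS or common path.
 */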
1597 static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
1598 {
1599         struct mlx4_ib_dev *dev = to_mdev(qp->device);
1600         struct mlx4_ib_qp *mqp = to_mqp(qp);
1601
1602         if (is_qp0(dev, mqp))
1603                 mlx4_CLOSE_PORT(dev->dev, mqp->port);
1604
1605         if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI &&
1606             dev->qp1_proxy[mqp->port - 1] == mqp) {
1607                 mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
1608                 dev->qp1_proxy[mqp->port - 1] = NULL;
1609                 mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
1610         }
1611
1612         if (mqp->counter_index)
1613                 mlx4_ib_free_qp_counter(dev, mqp);
1614
1615         if (qp->rwq_ind_tbl) {
1616                 destroy_qp_rss(dev, mqp);
1617         } else {
1618                 destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, qp->uobject);
1619         }
1620
1621         if (is_sqp(dev, mqp))
1622                 kfree(to_msqp(mqp));
1623         else
1624                 kfree(mqp);
1625
1626         return 0;
1627 }
1628
1629 int mlx4_ib_destroy_qp(struct ib_qp *qp)
1630 {
1631         struct mlx4_ib_qp *mqp = to_mqp(qp);
1632
1633         if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
1634                 struct mlx4_ib_sqp *sqp = to_msqp(mqp);
1635
1636                 if (sqp->roce_v2_gsi)
1637                         ib_destroy_qp(sqp->roce_v2_gsi);
1638         }
1639
1640         return _mlx4_ib_destroy_qp(qp);
1641 }
1642
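/*
 * Translate a driver QP type into the hardware service type used in the QP
 * context; returns -1 for combinations the device cannot support.
 */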
1643 static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
1644 {
1645         switch (type) {
1646         case MLX4_IB_QPT_RC:            return MLX4_QP_ST_RC;
1647         case MLX4_IB_QPT_UC:            return MLX4_QP_ST_UC;
1648         case MLX4_IB_QPT_UD:            return MLX4_QP_ST_UD;
1649         case MLX4_IB_QPT_XRC_INI:
1650         case MLX4_IB_QPT_XRC_TGT:       return MLX4_QP_ST_XRC;
1651         case MLX4_IB_QPT_SMI:
1652         case MLX4_IB_QPT_GSI:
1653         case MLX4_IB_QPT_RAW_PACKET:    return MLX4_QP_ST_MLX;
1654
1655         case MLX4_IB_QPT_PROXY_SMI_OWNER:
1656         case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ?
1657                                                 MLX4_QP_ST_MLX : -1);
1658         case MLX4_IB_QPT_PROXY_SMI:
1659         case MLX4_IB_QPT_TUN_SMI:
1660         case MLX4_IB_QPT_PROXY_GSI:
1661         case MLX4_IB_QPT_TUN_GSI:       return (mlx4_is_mfunc(dev->dev) ?
1662                                                 MLX4_QP_ST_UD : -1);
1663         default:                        return -1;
1664         }
1665 }
1666
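/*
 * Build the hardware RRE/RAE/RWE access bits from the requested (or current)
 * access flags; with a zero destination RD-atomic depth only remote writes
 * remain enabled.
 */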
1667 static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
1668                                    int attr_mask)
1669 {
1670         u8 dest_rd_atomic;
1671         u32 access_flags;
1672         u32 hw_access_flags = 0;
1673
1674         if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1675                 dest_rd_atomic = attr->max_dest_rd_atomic;
1676         else
1677                 dest_rd_atomic = qp->resp_depth;
1678
1679         if (attr_mask & IB_QP_ACCESS_FLAGS)
1680                 access_flags = attr->qp_access_flags;
1681         else
1682                 access_flags = qp->atomic_rd_en;
1683
1684         if (!dest_rd_atomic)
1685                 access_flags &= IB_ACCESS_REMOTE_WRITE;
1686
1687         if (access_flags & IB_ACCESS_REMOTE_READ)
1688                 hw_access_flags |= MLX4_QP_BIT_RRE;
1689         if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
1690                 hw_access_flags |= MLX4_QP_BIT_RAE;
1691         if (access_flags & IB_ACCESS_REMOTE_WRITE)
1692                 hw_access_flags |= MLX4_QP_BIT_RWE;
1693
1694         return cpu_to_be32(hw_access_flags);
1695 }
1696
1697 static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
1698                             int attr_mask)
1699 {
1700         if (attr_mask & IB_QP_PKEY_INDEX)
1701                 sqp->pkey_index = attr->pkey_index;
1702         if (attr_mask & IB_QP_QKEY)
1703                 sqp->qkey = attr->qkey;
1704         if (attr_mask & IB_QP_SQ_PSN)
1705                 sqp->send_psn = attr->sq_psn;
1706 }
1707
1708 static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
1709 {
1710         path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
1711 }
1712
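/*
 * Fill in an address path from an rdma_ah_attr: LID, static rate and GRH
 * fields, and for RoCE also the scheduling queue, VLAN and source MAC,
 * registering candidate VLAN/MAC entries to be committed once the modify-QP
 * succeeds.
 */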
1713 static int _mlx4_set_path(struct mlx4_ib_dev *dev,
1714                           const struct rdma_ah_attr *ah,
1715                           u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
1716                           struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
1717 {
1718         int vidx;
1719         int smac_index;
1720         int err;
1721
1722         path->grh_mylmc = rdma_ah_get_path_bits(ah) & 0x7f;
1723         path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah));
1724         if (rdma_ah_get_static_rate(ah)) {
1725                 path->static_rate = rdma_ah_get_static_rate(ah) +
1726                                     MLX4_STAT_RATE_OFFSET;
1727                 while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
1728                        !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
1729                         --path->static_rate;
1730         } else
1731                 path->static_rate = 0;
1732
1733         if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) {
1734                 const struct ib_global_route *grh = rdma_ah_read_grh(ah);
1735                 int real_sgid_index =
1736                         mlx4_ib_gid_index_to_real_index(dev, grh->sgid_attr);
1737
1738                 if (real_sgid_index < 0)
1739                         return real_sgid_index;
1740                 if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) {
1741                         pr_err("sgid_index (%u) too large. max is %d\n",
1742                                real_sgid_index, dev->dev->caps.gid_table_len[port] - 1);
1743                         return -1;
1744                 }
1745
1746                 path->grh_mylmc |= 1 << 7;
1747                 path->mgid_index = real_sgid_index;
1748                 path->hop_limit  = grh->hop_limit;
1749                 path->tclass_flowlabel =
1750                         cpu_to_be32((grh->traffic_class << 20) |
1751                                     (grh->flow_label));
1752                 memcpy(path->rgid, grh->dgid.raw, 16);
1753         }
1754
1755         if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) {
1756                 if (!(rdma_ah_get_ah_flags(ah) & IB_AH_GRH))
1757                         return -1;
1758
1759                 path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
1760                         ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 7) << 3);
1761
1762                 path->feup |= MLX4_FEUP_FORCE_ETH_UP;
1763                 if (vlan_tag < 0x1000) {
1764                         if (smac_info->vid < 0x1000) {
1765                                 /* both valid vlan ids */
1766                                 if (smac_info->vid != vlan_tag) {
1767                                         /* different VIDs.  unreg old and reg new */
1768                                         err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1769                                         if (err)
1770                                                 return err;
1771                                         smac_info->candidate_vid = vlan_tag;
1772                                         smac_info->candidate_vlan_index = vidx;
1773                                         smac_info->candidate_vlan_port = port;
1774                                         smac_info->update_vid = 1;
1775                                         path->vlan_index = vidx;
1776                                 } else {
1777                                         path->vlan_index = smac_info->vlan_index;
1778                                 }
1779                         } else {
1780                                 /* no current vlan tag in qp */
1781                                 err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1782                                 if (err)
1783                                         return err;
1784                                 smac_info->candidate_vid = vlan_tag;
1785                                 smac_info->candidate_vlan_index = vidx;
1786                                 smac_info->candidate_vlan_port = port;
1787                                 smac_info->update_vid = 1;
1788                                 path->vlan_index = vidx;
1789                         }
1790                         path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
1791                         path->fl = 1 << 6;
1792                 } else {
1793                         /* have current vlan tag. unregister it at modify-qp success */
1794                         if (smac_info->vid < 0x1000) {
1795                                 smac_info->candidate_vid = 0xFFFF;
1796                                 smac_info->update_vid = 1;
1797                         }
1798                 }
1799
1800                 /* get smac_index for RoCE use.
1801                  * If no smac was yet assigned, register one.
1802                  * If one was already assigned, but the new MAC differs,
1803                  * unregister the old one and register the new one.
1804                  */
1805                 if ((!smac_info->smac && !smac_info->smac_port) ||
1806                     smac_info->smac != smac) {
1807                         /* register candidate now, unreg if needed, after success */
1808                         smac_index = mlx4_register_mac(dev->dev, port, smac);
1809                         if (smac_index >= 0) {
1810                                 smac_info->candidate_smac_index = smac_index;
1811                                 smac_info->candidate_smac = smac;
1812                                 smac_info->candidate_smac_port = port;
1813                         } else {
1814                                 return -EINVAL;
1815                         }
1816                 } else {
1817                         smac_index = smac_info->smac_index;
1818                 }
1819                 memcpy(path->dmac, ah->roce.dmac, 6);
1820                 path->ackto = MLX4_IB_LINK_TYPE_ETH;
1821                 /* put MAC table smac index for IBoE */
1822                 path->grh_mylmc = (u8) (smac_index) | 0x80;
1823         } else {
1824                 path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
1825                         ((port - 1) << 6) | ((rdma_ah_get_sl(ah) & 0xf) << 2);
1826         }
1827
1828         return 0;
1829 }
1830
1831 static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
1832                          enum ib_qp_attr_mask qp_attr_mask,
1833                          struct mlx4_ib_qp *mqp,
1834                          struct mlx4_qp_path *path, u8 port,
1835                          u16 vlan_id, u8 *smac)
1836 {
1837         return _mlx4_set_path(dev, &qp->ah_attr,
1838                               mlx4_mac_to_u64(smac),
1839                               vlan_id,
1840                               path, &mqp->pri, port);
1841 }
1842
1843 static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
1844                              const struct ib_qp_attr *qp,
1845                              enum ib_qp_attr_mask qp_attr_mask,
1846                              struct mlx4_ib_qp *mqp,
1847                              struct mlx4_qp_path *path, u8 port)
1848 {
1849         return _mlx4_set_path(dev, &qp->alt_ah_attr,
1850                               0,
1851                               0xffff,
1852                               path, &mqp->alt, port);
1853 }
1854
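/* Add any multicast GID entries on the QP's gid_list that have not been added yet. */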
1855 static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1856 {
1857         struct mlx4_ib_gid_entry *ge, *tmp;
1858
1859         list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
1860                 if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
1861                         ge->added = 1;
1862                         ge->port = qp->port;
1863                 }
1864         }
1865 }
1866
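/*
 * For an Ethernet UD QP with no source MAC registered yet, register the
 * port's current MAC and record it as a candidate smac index in the QP
 * context.
 */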
1867 static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
1868                                     struct mlx4_ib_qp *qp,
1869                                     struct mlx4_qp_context *context)
1870 {
1871         u64 u64_mac;
1872         int smac_index;
1873
1874         u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
1875
1876         context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
1877         if (!qp->pri.smac && !qp->pri.smac_port) {
1878                 smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
1879                 if (smac_index >= 0) {
1880                         qp->pri.candidate_smac_index = smac_index;
1881                         qp->pri.candidate_smac = u64_mac;
1882                         qp->pri.candidate_smac_port = qp->port;
1883                         context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
1884                 } else {
1885                         return -ENOENT;
1886                 }
1887         }
1888         return 0;
1889 }
1890
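/*
 * Allocate a dedicated counter for loopback source checking when the QP
 * blocks multicast loopback on an Ethernet port and the device supports
 * the LB_SRC_CHK capability.
 */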
1891 static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
1892 {
1893         struct counter_index *new_counter_index;
1894         int err;
1895         u32 tmp_idx;
1896
1897         if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) !=
1898             IB_LINK_LAYER_ETHERNET ||
1899             !(qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) ||
1900             !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK))
1901                 return 0;
1902
1903         err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER);
1904         if (err)
1905                 return err;
1906
1907         new_counter_index = kmalloc(sizeof(*new_counter_index), GFP_KERNEL);
1908         if (!new_counter_index) {
1909                 mlx4_counter_free(dev->dev, tmp_idx);
1910                 return -ENOMEM;
1911         }
1912
1913         new_counter_index->index = tmp_idx;
1914         new_counter_index->allocated = 1;
1915         qp->counter_index = new_counter_index;
1916
1917         mutex_lock(&dev->counters_table[qp->port - 1].mutex);
1918         list_add_tail(&new_counter_index->list,
1919                       &dev->counters_table[qp->port - 1].counters_list);
1920         mutex_unlock(&dev->counters_table[qp->port - 1].mutex);
1921
1922         return 0;
1923 }
1924
1925 enum {
1926         MLX4_QPC_ROCE_MODE_1 = 0,
1927         MLX4_QPC_ROCE_MODE_2 = 2,
1928         MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff
1929 };
1930
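/* Map an ib_gid_type to the RoCE mode encoding used in the QP context. */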
1931 static u8 gid_type_to_qpc(enum ib_gid_type gid_type)
1932 {
1933         switch (gid_type) {
1934         case IB_GID_TYPE_ROCE:
1935                 return MLX4_QPC_ROCE_MODE_1;
1936         case IB_GID_TYPE_ROCE_UDP_ENCAP:
1937                 return MLX4_QPC_ROCE_MODE_2;
1938         default:
1939                 return MLX4_QPC_ROCE_MODE_UNDEFINED;
1940         }
1941 }
1942
1943 /*
1944  * Go over all of the RSS QP's children (WQs) and apply their HW state according
1945  * to their logical state, if the RSS QP is the first RSS QP associated with the WQ.
1946  */
1947 static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num,
1948                             struct ib_udata *udata)
1949 {
1950         int err = 0;
1951         int i;
1952
1953         for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
1954                 struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
1955                 struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
1956
1957                 mutex_lock(&wq->mutex);
1958
1959                 /* mlx4_ib restriction:
1960                  * A WQ is associated with a port according to the RSS QP
1961                  * it is associated with.
1962                  * If the WQ is already associated with a different port
1963                  * by another RSS QP, return a failure.
1964                  */
1965                 if ((wq->rss_usecnt > 0) && (wq->port != port_num)) {
1966                         err = -EINVAL;
1967                         mutex_unlock(&wq->mutex);
1968                         break;
1969                 }
1970                 wq->port = port_num;
1971                 if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) {
1972                         err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY, udata);
1973                         if (err) {
1974                                 mutex_unlock(&wq->mutex);
1975                                 break;
1976                         }
1977                 }
1978                 wq->rss_usecnt++;
1979
1980                 mutex_unlock(&wq->mutex);
1981         }
1982
1983         if (i && err) {
1984                 int j;
1985
1986                 for (j = (i - 1); j >= 0; j--) {
1987                         struct ib_wq *ibwq = ind_tbl->ind_tbl[j];
1988                         struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
1989
1990                         mutex_lock(&wq->mutex);
1991
1992                         if ((wq->rss_usecnt == 1) &&
1993                             (ibwq->state == IB_WQS_RDY))
1994                                 if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET,
1995                                                        udata))
1996                                         pr_warn("failed to reverse WQN=0x%06x\n",
1997                                                 ibwq->wq_num);
1998                         wq->rss_usecnt--;
1999
2000                         mutex_unlock(&wq->mutex);
2001                 }
2002         }
2003
2004         return err;
2005 }
2006
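/*
 * Drop this RSS QP's reference on each WQ; the last RSS QP using a WQ that
 * is logically in the RDY state also moves it back to RESET in hardware.
 */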
2007 static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl,
2008                                 struct ib_udata *udata)
2009 {
2010         int i;
2011
2012         for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) {
2013                 struct ib_wq *ibwq = ind_tbl->ind_tbl[i];
2014                 struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq);
2015
2016                 mutex_lock(&wq->mutex);
2017
2018                 if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY))
2019                         if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET, udata))
2020                                 pr_warn("failed to reverse WQN=0x%06x\n",
2021                                         ibwq->wq_num);
2022                 wq->rss_usecnt--;
2023
2024                 mutex_unlock(&wq->mutex);
2025         }
2026 }
2027
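/*
 * Write the RSS parameters (base QPN, flags and Toeplitz hash key) into the
 * RSS area of the QP context.
 */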
2028 static void fill_qp_rss_context(struct mlx4_qp_context *context,
2029                                 struct mlx4_ib_qp *qp)
2030 {
2031         struct mlx4_rss_context *rss_context;
2032
2033         rss_context = (void *)context + offsetof(struct mlx4_qp_context,
2034                         pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH;
2035
2036         rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz);
2037         rss_context->default_qpn =
2038                 cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff);
2039         if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6))
2040                 rss_context->base_qpn_udp = rss_context->default_qpn;
2041         rss_context->flags = qp->rss_ctx->flags;
2042         /* Currently only the Toeplitz hash function is supported */
2043         rss_context->hash_fn = MLX4_RSS_HASH_TOP;
2044
2045         memcpy(rss_context->rss_key, qp->rss_ctx->rss_key,
2046                MLX4_EN_RSS_KEY_SIZE);
2047 }
2048
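/*
 * Build a mlx4_qp_context for the requested state transition and post the
 * modify-QP command, handling both regular QPs and WQ (RWQ) sources.
 */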
2049 static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
2050                                const struct ib_qp_attr *attr, int attr_mask,
2051                                enum ib_qp_state cur_state,
2052                                enum ib_qp_state new_state,
2053                                struct ib_udata *udata)
2054 {
2055         struct ib_srq  *ibsrq;
2056         const struct ib_gid_attr *gid_attr = NULL;
2057         struct ib_rwq_ind_table *rwq_ind_tbl;
2058         enum ib_qp_type qp_type;
2059         struct mlx4_ib_dev *dev;
2060         struct mlx4_ib_qp *qp;
2061         struct mlx4_ib_pd *pd;
2062         struct mlx4_ib_cq *send_cq, *recv_cq;
2063         struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
2064                 udata, struct mlx4_ib_ucontext, ibucontext);
2065         struct mlx4_qp_context *context;
2066         enum mlx4_qp_optpar optpar = 0;
2067         int sqd_event;
2068         int steer_qp = 0;
2069         int err = -EINVAL;
2070         int counter_index;
2071
2072         if (src_type == MLX4_IB_RWQ_SRC) {
2073                 struct ib_wq *ibwq;
2074
2075                 ibwq        = (struct ib_wq *)src;
2076                 ibsrq       = NULL;
2077                 rwq_ind_tbl = NULL;
2078                 qp_type     = IB_QPT_RAW_PACKET;
2079                 qp          = to_mqp((struct ib_qp *)ibwq);
2080                 dev         = to_mdev(ibwq->device);
2081                 pd          = to_mpd(ibwq->pd);
2082         } else {
2083                 struct ib_qp *ibqp;
2084
2085                 ibqp        = (struct ib_qp *)src;
2086                 ibsrq       = ibqp->srq;
2087                 rwq_ind_tbl = ibqp->rwq_ind_tbl;
2088                 qp_type     = ibqp->qp_type;
2089                 qp          = to_mqp(ibqp);
2090                 dev         = to_mdev(ibqp->device);
2091                 pd          = get_pd(qp);
2092         }
2093
2094         /* APM is not supported under RoCE */
2095         if (attr_mask & IB_QP_ALT_PATH &&
2096             rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
2097             IB_LINK_LAYER_ETHERNET)
2098                 return -ENOTSUPP;
2099
2100         context = kzalloc(sizeof *context, GFP_KERNEL);
2101         if (!context)
2102                 return -ENOMEM;
2103
2104         context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
2105                                      (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
2106
2107         if (!(attr_mask & IB_QP_PATH_MIG_STATE))
2108                 context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
2109         else {
2110                 optpar |= MLX4_QP_OPTPAR_PM_STATE;
2111                 switch (attr->path_mig_state) {
2112                 case IB_MIG_MIGRATED:
2113                         context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
2114                         break;
2115                 case IB_MIG_REARM:
2116                         context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
2117                         break;
2118                 case IB_MIG_ARMED:
2119                         context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
2120                         break;
2121                 }
2122         }
2123
2124         if (qp->inl_recv_sz)
2125                 context->param3 |= cpu_to_be32(1 << 25);
2126
2127         if (qp->flags & MLX4_IB_QP_SCATTER_FCS)
2128                 context->param3 |= cpu_to_be32(1 << 29);
2129
2130         if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI)
2131                 context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
2132         else if (qp_type == IB_QPT_RAW_PACKET)
2133                 context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
2134         else if (qp_type == IB_QPT_UD) {
2135                 if (qp->flags & MLX4_IB_QP_LSO)
2136                         context->mtu_msgmax = (IB_MTU_4096 << 5) |
2137                                               ilog2(dev->dev->caps.max_gso_sz);
2138                 else
2139                         context->mtu_msgmax = (IB_MTU_4096 << 5) | 13;
2140         } else if (attr_mask & IB_QP_PATH_MTU) {
2141                 if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
2142                         pr_err("path MTU (%u) is invalid\n",
2143                                attr->path_mtu);
2144                         goto out;
2145                 }
2146                 context->mtu_msgmax = (attr->path_mtu << 5) |
2147                         ilog2(dev->dev->caps.max_msg_sz);
2148         }
2149
2150         if (!rwq_ind_tbl) { /* per the PRM, the RSS receive side must be left zeroed */
2151                 if (qp->rq.wqe_cnt)
2152                         context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
2153                 context->rq_size_stride |= qp->rq.wqe_shift - 4;
2154         }
2155
2156         if (qp->sq.wqe_cnt)
2157                 context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
2158         context->sq_size_stride |= qp->sq.wqe_shift - 4;
2159
2160         if (new_state == IB_QPS_RESET && qp->counter_index)
2161                 mlx4_ib_free_qp_counter(dev, qp);
2162
2163         if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
2164                 context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
2165                 context->xrcd = cpu_to_be32((u32) qp->xrcdn);
2166                 if (qp_type == IB_QPT_RAW_PACKET)
2167                         context->param3 |= cpu_to_be32(1 << 30);
2168         }
2169
2170         if (ucontext)
2171                 context->usr_page = cpu_to_be32(
2172                         mlx4_to_hw_uar_index(dev->dev, ucontext->uar.index));
2173         else
2174                 context->usr_page = cpu_to_be32(
2175                         mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index));
2176
2177         if (attr_mask & IB_QP_DEST_QPN)
2178                 context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
2179
2180         if (attr_mask & IB_QP_PORT) {
2181                 if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
2182                     !(attr_mask & IB_QP_AV)) {
2183                         mlx4_set_sched(&context->pri_path, attr->port_num);
2184                         optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
2185                 }
2186         }
2187
2188         if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
2189                 err = create_qp_lb_counter(dev, qp);
2190                 if (err)
2191                         goto out;
2192
2193                 counter_index =
2194                         dev->counters_table[qp->port - 1].default_counter;
2195                 if (qp->counter_index)
2196                         counter_index = qp->counter_index->index;
2197
2198                 if (counter_index != -1) {
2199                         context->pri_path.counter_index = counter_index;
2200                         optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
2201                         if (qp->counter_index) {
2202                                 context->pri_path.fl |=
2203                                         MLX4_FL_ETH_SRC_CHECK_MC_LB;
2204                                 context->pri_path.vlan_control |=
2205                                         MLX4_CTRL_ETH_SRC_CHECK_IF_COUNTER;
2206                         }
2207                 } else
2208                         context->pri_path.counter_index =
2209                                 MLX4_SINK_COUNTER_INDEX(dev->dev);
2210
2211                 if (qp->flags & MLX4_IB_QP_NETIF) {
2212                         mlx4_ib_steer_qp_reg(dev, qp, 1);
2213                         steer_qp = 1;
2214                 }
2215
2216                 if (qp_type == IB_QPT_GSI) {
2217                         enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ?
2218                                 IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE;
2219                         u8 qpc_roce_mode = gid_type_to_qpc(gid_type);
2220
2221                         context->rlkey_roce_mode |= (qpc_roce_mode << 6);
2222                 }
2223         }
2224
2225         if (attr_mask & IB_QP_PKEY_INDEX) {
2226                 if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
2227                         context->pri_path.disable_pkey_check = 0x40;
2228                 context->pri_path.pkey_index = attr->pkey_index;
2229                 optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
2230         }
2231
2232         if (attr_mask & IB_QP_AV) {
2233                 u8 port_num = mlx4_is_bonded(dev->dev) ? 1 :
2234                         attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2235                 u16 vlan = 0xffff;
2236                 u8 smac[ETH_ALEN];
2237                 int is_eth =
2238                         rdma_cap_eth_ah(&dev->ib_dev, port_num) &&
2239                         rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH;
2240
2241                 if (is_eth) {
2242                         gid_attr = attr->ah_attr.grh.sgid_attr;
2243                         vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev);
2244                         memcpy(smac, gid_attr->ndev->dev_addr, ETH_ALEN);
2245                 }
2246
2247                 if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
2248                                   port_num, vlan, smac))
2249                         goto out;
2250
2251                 optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
2252                            MLX4_QP_OPTPAR_SCHED_QUEUE);
2253
2254                 if (is_eth &&
2255                     (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) {
2256                         u8 qpc_roce_mode = gid_type_to_qpc(gid_attr->gid_type);
2257
2258                         if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) {
2259                                 err = -EINVAL;
2260                                 goto out;
2261                         }
2262                         context->rlkey_roce_mode |= (qpc_roce_mode << 6);
2263                 }
2264
2265         }
2266
2267         if (attr_mask & IB_QP_TIMEOUT) {
2268                 context->pri_path.ackto |= attr->timeout << 3;
2269                 optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
2270         }
2271
2272         if (attr_mask & IB_QP_ALT_PATH) {
2273                 if (attr->alt_port_num == 0 ||
2274                     attr->alt_port_num > dev->dev->caps.num_ports)
2275                         goto out;
2276
2277                 if (attr->alt_pkey_index >=
2278                     dev->dev->caps.pkey_table_len[attr->alt_port_num])
2279                         goto out;
2280
2281                 if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
2282                                       &context->alt_path,
2283                                       attr->alt_port_num))
2284                         goto out;
2285
2286                 context->alt_path.pkey_index = attr->alt_pkey_index;
2287                 context->alt_path.ackto = attr->alt_timeout << 3;
2288                 optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
2289         }
2290
2291         context->pd = cpu_to_be32(pd->pdn);
2292
2293         if (!rwq_ind_tbl) {
2294                 context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
2295                 get_cqs(qp, src_type, &send_cq, &recv_cq);
2296         } else { /* Set dummy CQs to be compatible with HV and PRM */
2297                 send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq);
2298                 recv_cq = send_cq;
2299         }
2300         context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
2301         context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
2302
2303         /* Set "fast registration enabled" for all kernel QPs */
2304         if (!ucontext)
2305                 context->params1 |= cpu_to_be32(1 << 11);
2306
2307         if (attr_mask & IB_QP_RNR_RETRY) {
2308                 context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
2309                 optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
2310         }
2311
2312         if (attr_mask & IB_QP_RETRY_CNT) {
2313                 context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
2314                 optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
2315         }
2316
2317         if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
2318                 if (attr->max_rd_atomic)
2319                         context->params1 |=
2320                                 cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
2321                 optpar |= MLX4_QP_OPTPAR_SRA_MAX;
2322         }
2323
2324         if (attr_mask & IB_QP_SQ_PSN)
2325                 context->next_send_psn = cpu_to_be32(attr->sq_psn);
2326
2327         if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
2328                 if (attr->max_dest_rd_atomic)
2329                         context->params2 |=
2330                                 cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
2331                 optpar |= MLX4_QP_OPTPAR_RRA_MAX;
2332         }
2333
2334         if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
2335                 context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
2336                 optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
2337         }
2338
2339         if (ibsrq)
2340                 context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
2341
2342         if (attr_mask & IB_QP_MIN_RNR_TIMER) {
2343                 context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
2344                 optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
2345         }
2346         if (attr_mask & IB_QP_RQ_PSN)
2347                 context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
2348
2349         /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
2350         if (attr_mask & IB_QP_QKEY) {
2351                 if (qp->mlx4_ib_qp_type &
2352                     (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
2353                         context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
2354                 else {
2355                         if (mlx4_is_mfunc(dev->dev) &&
2356                             !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
2357                             (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
2358                             MLX4_RESERVED_QKEY_BASE) {
2359                                 pr_err("Cannot use reserved QKEY 0x%x (range 0xffff0000..0xffffffff is reserved)\n",
2360                                        attr->qkey);
2362                                 err = -EINVAL;
2363                                 goto out;
2364                         }
2365                         context->qkey = cpu_to_be32(attr->qkey);
2366                 }
2367                 optpar |= MLX4_QP_OPTPAR_Q_KEY;
2368         }
2369
2370         if (ibsrq)
2371                 context->srqn = cpu_to_be32(1 << 24 |
2372                                             to_msrq(ibsrq)->msrq.srqn);
2373
2374         if (qp->rq.wqe_cnt &&
2375             cur_state == IB_QPS_RESET &&
2376             new_state == IB_QPS_INIT)
2377                 context->db_rec_addr = cpu_to_be64(qp->db.dma);
2378
2379         if (cur_state == IB_QPS_INIT &&
2380             new_state == IB_QPS_RTR  &&
2381             (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI ||
2382              qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) {
2383                 context->pri_path.sched_queue = (qp->port - 1) << 6;
2384                 if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
2385                     qp->mlx4_ib_qp_type &
2386                     (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
2387                         context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
2388                         if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
2389                                 context->pri_path.fl = 0x80;
2390                 } else {
2391                         if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
2392                                 context->pri_path.fl = 0x80;
2393                         context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
2394                 }
2395                 if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
2396                     IB_LINK_LAYER_ETHERNET) {
2397                         if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
2398                             qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
2399                                 context->pri_path.feup = 1 << 7; /* don't fsm */
2400                         /* handle smac_index */
2401                         if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
2402                             qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
2403                             qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
2404                                 err = handle_eth_ud_smac_index(dev, qp, context);
2405                                 if (err) {
2406                                         err = -EINVAL;
2407                                         goto out;
2408                                 }
2409                                 if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
2410                                         dev->qp1_proxy[qp->port - 1] = qp;
2411                         }
2412                 }
2413         }
2414
2415         if (qp_type == IB_QPT_RAW_PACKET) {
2416                 context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
2417                                         MLX4_IB_LINK_TYPE_ETH;
2418                 if (dev->dev->caps.tunnel_offload_mode ==  MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
2419                         /* set QP to receive both tunneled & non-tunneled packets */
2420                         if (!rwq_ind_tbl)
2421                                 context->srqn = cpu_to_be32(7 << 28);
2422                 }
2423         }
2424
2425         if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
2426                 int is_eth = rdma_port_get_link_layer(
2427                                 &dev->ib_dev, qp->port) ==
2428                                 IB_LINK_LAYER_ETHERNET;
2429                 if (is_eth) {
2430                         context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
2431                         optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
2432                 }
2433         }
2434
2435         if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD  &&
2436             attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
2437                 sqd_event = 1;
2438         else
2439                 sqd_event = 0;
2440
2441         if (!ucontext &&
2442             cur_state == IB_QPS_RESET &&
2443             new_state == IB_QPS_INIT)
2444                 context->rlkey_roce_mode |= (1 << 4);
2445
2446         /*
2447          * Before passing a kernel QP to the HW, make sure that the
2448          * ownership bits of the send queue are set and the SQ
2449          * headroom is stamped so that the hardware doesn't start
2450          * processing stale work requests.
2451          */
2452         if (!ucontext &&
2453             cur_state == IB_QPS_RESET &&
2454             new_state == IB_QPS_INIT) {
2455                 struct mlx4_wqe_ctrl_seg *ctrl;
2456                 int i;
2457
2458                 for (i = 0; i < qp->sq.wqe_cnt; ++i) {
2459                         ctrl = get_send_wqe(qp, i);
2460                         ctrl->owner_opcode = cpu_to_be32(1 << 31);
2461                         ctrl->qpn_vlan.fence_size =
2462                                 1 << (qp->sq.wqe_shift - 4);
2463                         stamp_send_wqe(qp, i);
2464                 }
2465         }
2466
2467         if (rwq_ind_tbl &&
2468             cur_state == IB_QPS_RESET &&
2469             new_state == IB_QPS_INIT) {
2470                 fill_qp_rss_context(context, qp);
2471                 context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
2472         }
2473
2474         err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
2475                              to_mlx4_state(new_state), context, optpar,
2476                              sqd_event, &qp->mqp);
2477         if (err)
2478                 goto out;
2479
2480         qp->state = new_state;
2481
2482         if (attr_mask & IB_QP_ACCESS_FLAGS)
2483                 qp->atomic_rd_en = attr->qp_access_flags;
2484         if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
2485                 qp->resp_depth = attr->max_dest_rd_atomic;
2486         if (attr_mask & IB_QP_PORT) {
2487                 qp->port = attr->port_num;
2488                 update_mcg_macs(dev, qp);
2489         }
2490         if (attr_mask & IB_QP_ALT_PATH)
2491                 qp->alt_port = attr->alt_port_num;
2492
2493         if (is_sqp(dev, qp))
2494                 store_sqp_attrs(to_msqp(qp), attr, attr_mask);
2495
2496         /*
2497          * If we moved QP0 to RTR, bring the IB link up; if we moved
2498          * QP0 to RESET or ERROR, bring the link back down.
2499          */
2500         if (is_qp0(dev, qp)) {
2501                 if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
2502                         if (mlx4_INIT_PORT(dev->dev, qp->port))
2503                                 pr_warn("INIT_PORT failed for port %d\n",
2504                                        qp->port);
2505
2506                 if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
2507                     (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
2508                         mlx4_CLOSE_PORT(dev->dev, qp->port);
2509         }
2510
2511         /*
2512          * If we moved a kernel QP to RESET, clean up all old CQ
2513          * entries and reinitialize the QP.
2514          */
2515         if (new_state == IB_QPS_RESET) {
2516                 if (!ucontext) {
2517                         mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
2518                                          ibsrq ? to_msrq(ibsrq) : NULL);
2519                         if (send_cq != recv_cq)
2520                                 mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
2521
2522                         qp->rq.head = 0;
2523                         qp->rq.tail = 0;
2524                         qp->sq.head = 0;
2525                         qp->sq.tail = 0;
2526                         qp->sq_next_wqe = 0;
2527                         if (qp->rq.wqe_cnt)
2528                                 *qp->db.db  = 0;
2529
2530                         if (qp->flags & MLX4_IB_QP_NETIF)
2531                                 mlx4_ib_steer_qp_reg(dev, qp, 0);
2532                 }
2533                 if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
2534                         mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2535                         qp->pri.smac = 0;
2536                         qp->pri.smac_port = 0;
2537                 }
2538                 if (qp->alt.smac) {
2539                         mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
2540                         qp->alt.smac = 0;
2541                 }
2542                 if (qp->pri.vid < 0x1000) {
2543                         mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
2544                         qp->pri.vid = 0xFFFF;
2545                         qp->pri.candidate_vid = 0xFFFF;
2546                         qp->pri.update_vid = 0;
2547                 }
2548
2549                 if (qp->alt.vid < 0x1000) {
2550                         mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
2551                         qp->alt.vid = 0xFFFF;
2552                         qp->alt.candidate_vid = 0xFFFF;
2553                         qp->alt.update_vid = 0;
2554                 }
2555         }
2556 out:
2557         if (err && qp->counter_index)
2558                 mlx4_ib_free_qp_counter(dev, qp);
2559         if (err && steer_qp)
2560                 mlx4_ib_steer_qp_reg(dev, qp, 0);
2561         kfree(context);
2562         if (qp->pri.candidate_smac ||
2563             (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
2564                 if (err) {
2565                         mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
2566                 } else {
2567                         if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
2568                                 mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2569                         qp->pri.smac = qp->pri.candidate_smac;
2570                         qp->pri.smac_index = qp->pri.candidate_smac_index;
2571                         qp->pri.smac_port = qp->pri.candidate_smac_port;
2572                 }
2573                 qp->pri.candidate_smac = 0;
2574                 qp->pri.candidate_smac_index = 0;
2575                 qp->pri.candidate_smac_port = 0;
2576         }
2577         if (qp->alt.candidate_smac) {
2578                 if (err) {
2579                         mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
2580                 } else {
2581                         if (qp->alt.smac)
2582                                 mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
2583                         qp->alt.smac = qp->alt.candidate_smac;
2584                         qp->alt.smac_index = qp->alt.candidate_smac_index;
2585                         qp->alt.smac_port = qp->alt.candidate_smac_port;
2586                 }
2587                 qp->alt.candidate_smac = 0;
2588                 qp->alt.candidate_smac_index = 0;
2589                 qp->alt.candidate_smac_port = 0;
2590         }
2591
2592         if (qp->pri.update_vid) {
2593                 if (err) {
2594                         if (qp->pri.candidate_vid < 0x1000)
2595                                 mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
2596                                                      qp->pri.candidate_vid);
2597                 } else {
2598                         if (qp->pri.vid < 0x1000)
2599                                 mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
2600                                                      qp->pri.vid);
2601                         qp->pri.vid = qp->pri.candidate_vid;
2602                         qp->pri.vlan_port = qp->pri.candidate_vlan_port;
2603                         qp->pri.vlan_index =  qp->pri.candidate_vlan_index;
2604                 }
2605                 qp->pri.candidate_vid = 0xFFFF;
2606                 qp->pri.update_vid = 0;
2607         }
2608
2609         if (qp->alt.update_vid) {
2610                 if (err) {
2611                         if (qp->alt.candidate_vid < 0x1000)
2612                                 mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
2613                                                      qp->alt.candidate_vid);
2614                 } else {
2615                         if (qp->alt.vid < 0x1000)
2616                                 mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
2617                                                      qp->alt.vid);
2618                         qp->alt.vid = qp->alt.candidate_vid;
2619                         qp->alt.vlan_port = qp->alt.candidate_vlan_port;
2620                         qp->alt.vlan_index =  qp->alt.candidate_vlan_index;
2621                 }
2622                 qp->alt.candidate_vid = 0xFFFF;
2623                 qp->alt.update_vid = 0;
2624         }
2625
2626         return err;
2627 }
2628
2629 enum {
2630         MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE       |
2631                                               IB_QP_PORT),
2632 };
2633
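/*
 * Common modify-QP path: validates the requested state transition and
 * attribute mask (with extra restrictions for RSS QPs and for bonded
 * ports), brings up the RSS receive WQs when moving to INIT, and then
 * calls __mlx4_ib_modify_qp() to program the QP context.
 */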
2634 static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2635                               int attr_mask, struct ib_udata *udata)
2636 {
2637         struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
2638         struct mlx4_ib_qp *qp = to_mqp(ibqp);
2639         enum ib_qp_state cur_state, new_state;
2640         int err = -EINVAL;
2641         mutex_lock(&qp->mutex);
2642
2643         cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
2644         new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
2645
2646         if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
2647                                 attr_mask)) {
2648                 pr_debug("qpn 0x%x: invalid attribute mask specified "
2649                          "for transition %d to %d. qp_type %d,"
2650                          " attr_mask 0x%x\n",
2651                          ibqp->qp_num, cur_state, new_state,
2652                          ibqp->qp_type, attr_mask);
2653                 goto out;
2654         }
2655
2656         if (ibqp->rwq_ind_tbl) {
2657                 if (!(((cur_state == IB_QPS_RESET) &&
2658                        (new_state == IB_QPS_INIT)) ||
2659                       ((cur_state == IB_QPS_INIT)  &&
2660                        (new_state == IB_QPS_RTR)))) {
2661                         pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n",
2662                                  ibqp->qp_num, cur_state, new_state);
2663
2664                         err = -EOPNOTSUPP;
2665                         goto out;
2666                 }
2667
2668                 if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) {
2669                         pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n",
2670                                  ibqp->qp_num, attr_mask, cur_state, new_state);
2671
2672                         err = -EOPNOTSUPP;
2673                         goto out;
2674                 }
2675         }
2676
2677         if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
2678                 if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
2679                         if ((ibqp->qp_type == IB_QPT_RC) ||
2680                             (ibqp->qp_type == IB_QPT_UD) ||
2681                             (ibqp->qp_type == IB_QPT_UC) ||
2682                             (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
2683                             (ibqp->qp_type == IB_QPT_XRC_INI)) {
2684                                 attr->port_num = mlx4_ib_bond_next_port(dev);
2685                         }
2686                 } else {
2687                         /* no sense in changing port_num
2688                          * when ports are bonded */
2689                         attr_mask &= ~IB_QP_PORT;
2690                 }
2691         }
2692
2693         if ((attr_mask & IB_QP_PORT) &&
2694             (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
2695                 pr_debug("qpn 0x%x: invalid port number (%d) specified "
2696                          "for transition %d to %d. qp_type %d\n",
2697                          ibqp->qp_num, attr->port_num, cur_state,
2698                          new_state, ibqp->qp_type);
2699                 goto out;
2700         }
2701
2702         if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
2703             (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
2704              IB_LINK_LAYER_ETHERNET))
2705                 goto out;
2706
2707         if (attr_mask & IB_QP_PKEY_INDEX) {
2708                 int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2709                 if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
2710                         pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
2711                                  "for transition %d to %d. qp_type %d\n",
2712                                  ibqp->qp_num, attr->pkey_index, cur_state,
2713                                  new_state, ibqp->qp_type);
2714                         goto out;
2715                 }
2716         }
2717
2718         if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
2719             attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
2720                 pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
2721                          "Transition %d to %d. qp_type %d\n",
2722                          ibqp->qp_num, attr->max_rd_atomic, cur_state,
2723                          new_state, ibqp->qp_type);
2724                 goto out;
2725         }
2726
2727         if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
2728             attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
2729                 pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
2730                          "Transition %d to %d. qp_type %d\n",
2731                          ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
2732                          new_state, ibqp->qp_type);
2733                 goto out;
2734         }
2735
2736         if (cur_state == new_state && cur_state == IB_QPS_RESET) {
2737                 err = 0;
2738                 goto out;
2739         }
2740
2741         if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) {
2742                 err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num,
2743                                        udata);
2744                 if (err)
2745                         goto out;
2746         }
2747
2748         err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask,
2749                                   cur_state, new_state, udata);
2750
2751         if (ibqp->rwq_ind_tbl && err)
2752                 bring_down_rss_rwqs(ibqp->rwq_ind_tbl, udata);
2753
2754         if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
2755                 attr->port_num = 1;
2756
2757 out:
2758         mutex_unlock(&qp->mutex);
2759         return err;
2760 }
2761
2762 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2763                       int attr_mask, struct ib_udata *udata)
2764 {
2765         struct mlx4_ib_qp *mqp = to_mqp(ibqp);
2766         int ret;
2767
2768         ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
2769
2770         if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
2771                 struct mlx4_ib_sqp *sqp = to_msqp(mqp);
2772                 int err = 0;
2773
2774                 if (sqp->roce_v2_gsi)
2775                         err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask);
2776                 if (err)
2777                         pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n",
2778                                err);
2779         }
2780         return ret;
2781 }
2782
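/*
 * Look up the QP0 qkey for a VF by matching the given QP number
 * against the per-port proxy/tunnel QP0 numbers in the device caps.
 */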
2783 static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
2784 {
2785         int i;
2786         for (i = 0; i < dev->caps.num_ports; i++) {
2787                 if (qpn == dev->caps.spec_qps[i].qp0_proxy ||
2788                     qpn == dev->caps.spec_qps[i].qp0_tunnel) {
2789                         *qkey = dev->caps.spec_qps[i].qp0_qkey;
2790                         return 0;
2791                 }
2792         }
2793         return -EINVAL;
2794 }
2795
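/*
 * Build the MLX and UD headers for a QP0 send on an SR-IOV proxy or
 * tunnel SMI-owner QP.  Loopback is forced, the destination QPN is
 * either the WR's remote QP (tunnel case) or the per-port tunnel QP0
 * (proxy case), and the packed header is written into the WQE as one
 * or two inline segments depending on 64-byte alignment.
 */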
2796 static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
2797                                   const struct ib_ud_wr *wr,
2798                                   void *wqe, unsigned *mlx_seg_len)
2799 {
2800         struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
2801         struct ib_device *ib_dev = &mdev->ib_dev;
2802         struct mlx4_wqe_mlx_seg *mlx = wqe;
2803         struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
2804         struct mlx4_ib_ah *ah = to_mah(wr->ah);
2805         u16 pkey;
2806         u32 qkey;
2807         int send_size;
2808         int header_size;
2809         int spc;
2810         int i;
2811
2812         if (wr->wr.opcode != IB_WR_SEND)
2813                 return -EINVAL;
2814
2815         send_size = 0;
2816
2817         for (i = 0; i < wr->wr.num_sge; ++i)
2818                 send_size += wr->wr.sg_list[i].length;
2819
2820         /* for proxy-qp0 sends, need to add in size of tunnel header */
2821         /* for tunnel-qp0 sends, tunnel header is already in s/g list */
2822         if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
2823                 send_size += sizeof (struct mlx4_ib_tunnel_header);
2824
2825         ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
2826
2827         if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
2828                 sqp->ud_header.lrh.service_level =
2829                         be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
2830                 sqp->ud_header.lrh.destination_lid =
2831                         cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2832                 sqp->ud_header.lrh.source_lid =
2833                         cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2834         }
2835
2836         mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
2837
2838         /* force loopback */
2839         mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
2840         mlx->rlid = sqp->ud_header.lrh.destination_lid;
2841
2842         sqp->ud_header.lrh.virtual_lane    = 0;
2843         sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
2844         ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
2845         sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
2846         if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
2847                 sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
2848         else
2849                 sqp->ud_header.bth.destination_qpn =
2850                         cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel);
2851
2852         sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
2853         if (mlx4_is_master(mdev->dev)) {
2854                 if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
2855                         return -EINVAL;
2856         } else {
2857                 if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
2858                         return -EINVAL;
2859         }
2860         sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
2861         sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
2862
2863         sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
2864         sqp->ud_header.immediate_present = 0;
2865
2866         header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
2867
2868         /*
2869          * Inline data segments may not cross a 64 byte boundary.  If
2870          * our UD header is bigger than the space available up to the
2871          * next 64 byte boundary in the WQE, use two inline data
2872          * segments to hold the UD header.
2873          */
2874         spc = MLX4_INLINE_ALIGN -
2875               ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
2876         if (header_size <= spc) {
2877                 inl->byte_count = cpu_to_be32(1 << 31 | header_size);
2878                 memcpy(inl + 1, sqp->header_buf, header_size);
2879                 i = 1;
2880         } else {
2881                 inl->byte_count = cpu_to_be32(1 << 31 | spc);
2882                 memcpy(inl + 1, sqp->header_buf, spc);
2883
2884                 inl = (void *) (inl + 1) + spc;
2885                 memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
2886                 /*
2887                  * Need a barrier here to make sure all the data is
2888                  * visible before the byte_count field is set.
2889                  * Otherwise the HCA prefetcher could grab the 64-byte
2890                  * chunk with this inline segment and get a valid (!=
2891                  * 0xffffffff) byte count but stale data, and end up
2892                  * generating a packet with bad headers.
2893                  *
2894                  * The first inline segment's byte_count field doesn't
2895                  * need a barrier, because it comes after a
2896                  * control/MLX segment and therefore is at an offset
2897                  * of 16 mod 64.
2898                  */
2899                 wmb();
2900                 inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
2901                 i = 2;
2902         }
2903
2904         *mlx_seg_len =
2905         ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
2906         return 0;
2907 }
2908
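/*
 * Map a service level to a virtual lane using the cached per-port
 * SL-to-VL table.  Each byte of the table holds two 4-bit VL entries:
 * odd SLs use the low nibble, even SLs the high nibble.  SLs above 15
 * map to the invalid VL 0xf.
 */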
2909 static u8 sl_to_vl(struct mlx4_ib_dev *dev, u8 sl, int port_num)
2910 {
2911         union sl2vl_tbl_to_u64 tmp_vltab;
2912         u8 vl;
2913
2914         if (sl > 15)
2915                 return 0xf;
2916         tmp_vltab.sl64 = atomic64_read(&dev->sl2vl[port_num - 1]);
2917         vl = tmp_vltab.sl8[sl >> 1];
2918         if (sl & 1)
2919                 vl &= 0x0f;
2920         else
2921                 vl >>= 4;
2922         return vl;
2923 }
2924
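/*
 * Read the GID and GID type at a given hardware table index from the
 * driver's per-port GID cache.  Returns -ENOENT if the entry is the
 * zero GID.
 */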
2925 static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num,
2926                                 int index, union ib_gid *gid,
2927                                 enum ib_gid_type *gid_type)
2928 {
2929         struct mlx4_ib_iboe *iboe = &ibdev->iboe;
2930         struct mlx4_port_gid_table *port_gid_table;
2931         unsigned long flags;
2932
2933         port_gid_table = &iboe->gids[port_num - 1];
2934         spin_lock_irqsave(&iboe->lock, flags);
2935         memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid));
2936         *gid_type = port_gid_table->gids[index].gid_type;
2937         spin_unlock_irqrestore(&iboe->lock, flags);
2938         if (rdma_is_zero_gid(gid))
2939                 return -ENOENT;
2940
2941         return 0;
2942 }
2943
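/*
 * Build the full UD header for a send on an MLX-type (QP0/QP1) QP:
 * LRH/GRH for IB ports, or Ethernet plus optional VLAN, IPv4/IPv6 and
 * UDP headers for RoCE v1/v2.  The packed header is placed in the WQE
 * as one or two inline segments so that no inline segment crosses a
 * 64-byte boundary.
 */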
2944 #define MLX4_ROCEV2_QP1_SPORT 0xC000
2945 static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr,
2946                             void *wqe, unsigned *mlx_seg_len)
2947 {
2948         struct ib_device *ib_dev = sqp->qp.ibqp.device;
2949         struct mlx4_ib_dev *ibdev = to_mdev(ib_dev);
2950         struct mlx4_wqe_mlx_seg *mlx = wqe;
2951         struct mlx4_wqe_ctrl_seg *ctrl = wqe;
2952         struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
2953         struct mlx4_ib_ah *ah = to_mah(wr->ah);
2954         union ib_gid sgid;
2955         u16 pkey;
2956         int send_size;
2957         int header_size;
2958         int spc;
2959         int i;
2960         int err = 0;
2961         u16 vlan = 0xffff;
2962         bool is_eth;
2963         bool is_vlan = false;
2964         bool is_grh;
2965         bool is_udp = false;
2966         int ip_version = 0;
2967
2968         send_size = 0;
2969         for (i = 0; i < wr->wr.num_sge; ++i)
2970                 send_size += wr->wr.sg_list[i].length;
2971
2972         is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
2973         is_grh = mlx4_ib_ah_grh_present(ah);
2974         if (is_eth) {
2975                 enum ib_gid_type gid_type;
2976                 if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
2977                         /* When multi-function is enabled, the ib_core gid
2978                          * indexes don't necessarily match the hw ones, so
2979                          * we must use our own cache */
2980                         err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
2981                                                            be32_to_cpu(ah->av.ib.port_pd) >> 24,
2982                                                            ah->av.ib.gid_index, &sgid.raw[0]);
2983                         if (err)
2984                                 return err;
2985                 } else  {
2986                         err = fill_gid_by_hw_index(ibdev, sqp->qp.port,
2987                                             ah->av.ib.gid_index,
2988                                             &sgid, &gid_type);
2989                         if (!err) {
2990                                 is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
2991                                 if (is_udp) {
2992                                         if (ipv6_addr_v4mapped((struct in6_addr *)&sgid))
2993                                                 ip_version = 4;
2994                                         else
2995                                                 ip_version = 6;
2996                                         is_grh = false;
2997                                 }
2998                         } else {
2999                                 return err;
3000                         }
3001                 }
3002                 if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
3003                         vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
3004                         is_vlan = 1;
3005                 }
3006         }
3007         err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
3008                           ip_version, is_udp, 0, &sqp->ud_header);
3009         if (err)
3010                 return err;
3011
3012         if (!is_eth) {
3013                 sqp->ud_header.lrh.service_level =
3014                         be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
3015                 sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
3016                 sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
3017         }
3018
3019         if (is_grh || (ip_version == 6)) {
3020                 sqp->ud_header.grh.traffic_class =
3021                         (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
3022                 sqp->ud_header.grh.flow_label    =
3023                         ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
3024                 sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
3025                 if (is_eth) {
3026                         memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
3027                 } else {
3028                         if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
3029                                 /* When multi-function is enabled, the ib_core gid
3030                                  * indexes don't necessarily match the hw ones, so
3031                                  * we must use our own cache
3032                                  */
3033                                 sqp->ud_header.grh.source_gid.global.subnet_prefix =
3034                                         cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov.
3035                                                                     demux[sqp->qp.port - 1].
3036                                                                     subnet_prefix)));
3037                                 sqp->ud_header.grh.source_gid.global.interface_id =
3038                                         to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
3039                                                        guid_cache[ah->av.ib.gid_index];
3040                         } else {
3041                                 sqp->ud_header.grh.source_gid =
3042                                         ah->ibah.sgid_attr->gid;
3043                         }
3044                 }
3045                 memcpy(sqp->ud_header.grh.destination_gid.raw,
3046                        ah->av.ib.dgid, 16);
3047         }
3048
3049         if (ip_version == 4) {
3050                 sqp->ud_header.ip4.tos =
3051                         (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
3052                 sqp->ud_header.ip4.id = 0;
3053                 sqp->ud_header.ip4.frag_off = htons(IP_DF);
3054                 sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit;
3055
3056                 memcpy(&sqp->ud_header.ip4.saddr,
3057                        sgid.raw + 12, 4);
3058                 memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4);
3059                 sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header);
3060         }
3061
3062         if (is_udp) {
3063                 sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT);
3064                 sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT);
3065                 sqp->ud_header.udp.csum = 0;
3066         }
3067
3068         mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
3069
3070         if (!is_eth) {
3071                 mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
3072                                           (sqp->ud_header.lrh.destination_lid ==
3073                                            IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
3074                                           (sqp->ud_header.lrh.service_level << 8));
3075                 if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
3076                         mlx->flags |= cpu_to_be32(0x1); /* force loopback */
3077                 mlx->rlid = sqp->ud_header.lrh.destination_lid;
3078         }
3079
3080         switch (wr->wr.opcode) {
3081         case IB_WR_SEND:
3082                 sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
3083                 sqp->ud_header.immediate_present = 0;
3084                 break;
3085         case IB_WR_SEND_WITH_IMM:
3086                 sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
3087                 sqp->ud_header.immediate_present = 1;
3088                 sqp->ud_header.immediate_data    = wr->wr.ex.imm_data;
3089                 break;
3090         default:
3091                 return -EINVAL;
3092         }
3093
3094         if (is_eth) {
3095                 struct in6_addr in6;
3096                 u16 ether_type;
3097                 u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
3098
3099                 ether_type = (!is_udp) ? ETH_P_IBOE:
3100                         (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6);
3101
3102                 mlx->sched_prio = cpu_to_be16(pcp);
3103
3104                 ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac);
3105                 memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
3106                 memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
3107                 memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
3108                 memcpy(&in6, sgid.raw, sizeof(in6));
3109
3110
3111                 if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
3112                         mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
3113                 if (!is_vlan) {
3114                         sqp->ud_header.eth.type = cpu_to_be16(ether_type);
3115                 } else {
3116                         sqp->ud_header.vlan.type = cpu_to_be16(ether_type);
3117                         sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
3118                 }
3119         } else {
3120                 sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 :
3121                                                         sl_to_vl(to_mdev(ib_dev),
3122                                                                  sqp->ud_header.lrh.service_level,
3123                                                                  sqp->qp.port);
3124                 if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
3125                         return -EINVAL;
3126                 if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
3127                         sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
3128         }
3129         sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
3130         if (!sqp->qp.ibqp.qp_num)
3131                 ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
3132         else
3133                 ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index, &pkey);
3134         sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
3135         sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
3136         sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
3137         sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ?
3138                                                sqp->qkey : wr->remote_qkey);
3139         sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
3140
3141         header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
3142
3143         if (0) {
3144                 pr_err("built UD header of size %d:\n", header_size);
3145                 for (i = 0; i < header_size / 4; ++i) {
3146                         if (i % 8 == 0)
3147                                 pr_err("  [%02x] ", i * 4);
3148                         pr_cont(" %08x",
3149                                 be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
3150                         if ((i + 1) % 8 == 0)
3151                                 pr_cont("\n");
3152                 }
3153                 pr_err("\n");
3154         }
3155
3156         /*
3157          * Inline data segments may not cross a 64 byte boundary.  If
3158          * our UD header is bigger than the space available up to the
3159          * next 64 byte boundary in the WQE, use two inline data
3160          * segments to hold the UD header.
3161          */
3162         spc = MLX4_INLINE_ALIGN -
3163                 ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
3164         if (header_size <= spc) {
3165                 inl->byte_count = cpu_to_be32(1 << 31 | header_size);
3166                 memcpy(inl + 1, sqp->header_buf, header_size);
3167                 i = 1;
3168         } else {
3169                 inl->byte_count = cpu_to_be32(1 << 31 | spc);
3170                 memcpy(inl + 1, sqp->header_buf, spc);
3171
3172                 inl = (void *) (inl + 1) + spc;
3173                 memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
3174                 /*
3175                  * Need a barrier here to make sure all the data is
3176                  * visible before the byte_count field is set.
3177                  * Otherwise the HCA prefetcher could grab the 64-byte
3178                  * chunk with this inline segment and get a valid (!=
3179                  * 0xffffffff) byte count but stale data, and end up
3180                  * generating a packet with bad headers.
3181                  *
3182                  * The first inline segment's byte_count field doesn't
3183                  * need a barrier, because it comes after a
3184                  * control/MLX segment and therefore is at an offset
3185                  * of 16 mod 64.
3186                  */
3187                 wmb();
3188                 inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
3189                 i = 2;
3190         }
3191
3192         *mlx_seg_len =
3193                 ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
3194         return 0;
3195 }
3196
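/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * The fast path checks without locking; on apparent overflow, head and
 * tail are re-read under the CQ lock to observe completions that a
 * concurrent poller may have just processed.
 */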
3197 static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
3198 {
3199         unsigned cur;
3200         struct mlx4_ib_cq *cq;
3201
3202         cur = wq->head - wq->tail;
3203         if (likely(cur + nreq < wq->max_post))
3204                 return 0;
3205
3206         cq = to_mcq(ib_cq);
3207         spin_lock(&cq->lock);
3208         cur = wq->head - wq->tail;
3209         spin_unlock(&cq->lock);
3210
3211         return cur + nreq >= wq->max_post;
3212 }
3213
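/*
 * Translate ib_access_flags into the permission bits of a fast
 * register / bind WQE segment; local read permission is always set.
 */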
3214 static __be32 convert_access(int acc)
3215 {
3216         return (acc & IB_ACCESS_REMOTE_ATOMIC ?
3217                 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC)       : 0) |
3218                (acc & IB_ACCESS_REMOTE_WRITE  ?
3219                 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
3220                (acc & IB_ACCESS_REMOTE_READ   ?
3221                 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ)  : 0) |
3222                (acc & IB_ACCESS_LOCAL_WRITE   ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE)  : 0) |
3223                 cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
3224 }
3225
3226 static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg,
3227                         const struct ib_reg_wr *wr)
3228 {
3229         struct mlx4_ib_mr *mr = to_mmr(wr->mr);
3230
3231         fseg->flags             = convert_access(wr->access);
3232         fseg->mem_key           = cpu_to_be32(wr->key);
3233         fseg->buf_list          = cpu_to_be64(mr->page_map);
3234         fseg->start_addr        = cpu_to_be64(mr->ibmr.iova);
3235         fseg->reg_len           = cpu_to_be64(mr->ibmr.length);
3236         fseg->offset            = 0; /* XXX -- is this just for ZBVA? */
3237         fseg->page_size         = cpu_to_be32(ilog2(mr->ibmr.page_size));
3238         fseg->reserved[0]       = 0;
3239         fseg->reserved[1]       = 0;
3240 }
3241
3242 static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
3243 {
3244         memset(iseg, 0, sizeof(*iseg));
3245         iseg->mem_key = cpu_to_be32(rkey);
3246 }
3247
3248 static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
3249                                           u64 remote_addr, u32 rkey)
3250 {
3251         rseg->raddr    = cpu_to_be64(remote_addr);
3252         rseg->rkey     = cpu_to_be32(rkey);
3253         rseg->reserved = 0;
3254 }
3255
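/*
 * Fill the atomic segment: compare-and-swap uses the swap/compare
 * values as given, masked fetch-and-add places the add value and mask
 * in the swap_add/compare fields, and plain fetch-and-add carries only
 * the add value.
 */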
3256 static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
3257                            const struct ib_atomic_wr *wr)
3258 {
3259         if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
3260                 aseg->swap_add = cpu_to_be64(wr->swap);
3261                 aseg->compare  = cpu_to_be64(wr->compare_add);
3262         } else if (wr->wr.opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
3263                 aseg->swap_add = cpu_to_be64(wr->compare_add);
3264                 aseg->compare  = cpu_to_be64(wr->compare_add_mask);
3265         } else {
3266                 aseg->swap_add = cpu_to_be64(wr->compare_add);
3267                 aseg->compare  = 0;
3268         }
3269
3270 }
3271
3272 static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
3273                                   const struct ib_atomic_wr *wr)
3274 {
3275         aseg->swap_add          = cpu_to_be64(wr->swap);
3276         aseg->swap_add_mask     = cpu_to_be64(wr->swap_mask);
3277         aseg->compare           = cpu_to_be64(wr->compare_add);
3278         aseg->compare_mask      = cpu_to_be64(wr->compare_add_mask);
3279 }
3280
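/*
 * Copy the address vector from the AH into the datagram segment along
 * with the remote QPN, qkey, and (for Ethernet) VLAN and MAC.
 */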
3281 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
3282                              const struct ib_ud_wr *wr)
3283 {
3284         memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av));
3285         dseg->dqpn = cpu_to_be32(wr->remote_qpn);
3286         dseg->qkey = cpu_to_be32(wr->remote_qkey);
3287         dseg->vlan = to_mah(wr->ah)->av.eth.vlan;
3288         memcpy(dseg->mac, to_mah(wr->ah)->av.eth.mac, 6);
3289 }
3290
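/*
 * Build a datagram segment that tunnels a special-QP MAD from a slave
 * to the master: loopback is forced, no GRH is used, and the
 * destination is the per-port QP0 or QP1 tunnel QP.  The qkey is taken
 * from the QP context, which is set by the master.
 */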
3291 static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
3292                                     struct mlx4_wqe_datagram_seg *dseg,
3293                                     const struct ib_ud_wr *wr,
3294                                     enum mlx4_ib_qp_type qpt)
3295 {
3296         union mlx4_ext_av *av = &to_mah(wr->ah)->av;
3297         struct mlx4_av sqp_av = {0};
3298         int port = *((u8 *) &av->ib.port_pd) & 0x3;
3299
3300         /* force loopback */
3301         sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
3302         sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
3303         sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
3304                         cpu_to_be32(0xf0000000);
3305
3306         memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
3307         if (qpt == MLX4_IB_QPT_PROXY_GSI)
3308                 dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp1_tunnel);
3309         else
3310                 dseg->dqpn = cpu_to_be32(dev->dev->caps.spec_qps[port - 1].qp0_tunnel);
3311         /* Use QKEY from the QP context, which is set by master */
3312         dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
3313 }
3314
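/*
 * Write the mlx4_ib_tunnel_header (AV, remote QPN, pkey index, qkey,
 * MAC and VLAN) into the WQE as inline data, split across two inline
 * segments if it would otherwise cross a 64-byte boundary.
 */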
3315 static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe,
3316                                 unsigned *mlx_seg_len)
3317 {
3318         struct mlx4_wqe_inline_seg *inl = wqe;
3319         struct mlx4_ib_tunnel_header hdr;
3320         struct mlx4_ib_ah *ah = to_mah(wr->ah);
3321         int spc;
3322         int i;
3323
3324         memcpy(&hdr.av, &ah->av, sizeof hdr.av);
3325         hdr.remote_qpn = cpu_to_be32(wr->remote_qpn);
3326         hdr.pkey_index = cpu_to_be16(wr->pkey_index);
3327         hdr.qkey = cpu_to_be32(wr->remote_qkey);
3328         memcpy(hdr.mac, ah->av.eth.mac, 6);
3329         hdr.vlan = ah->av.eth.vlan;
3330
3331         spc = MLX4_INLINE_ALIGN -
3332                 ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
3333         if (sizeof (hdr) <= spc) {
3334                 memcpy(inl + 1, &hdr, sizeof (hdr));
3335                 wmb();
3336                 inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
3337                 i = 1;
3338         } else {
3339                 memcpy(inl + 1, &hdr, spc);
3340                 wmb();
3341                 inl->byte_count = cpu_to_be32(1 << 31 | spc);
3342
3343                 inl = (void *) (inl + 1) + spc;
3344                 memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
3345                 wmb();
3346                 inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
3347                 i = 2;
3348         }
3349
3350         *mlx_seg_len =
3351                 ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
3352 }
3353
3354 static void set_mlx_icrc_seg(void *dseg)
3355 {
3356         u32 *t = dseg;
3357         struct mlx4_wqe_inline_seg *iseg = dseg;
3358
3359         t[1] = 0;
3360
3361         /*
3362          * Need a barrier here before writing the byte_count field to
3363          * make sure that all the data is visible before the
3364          * byte_count field is set.  Otherwise, if the segment begins
3365          * a new cacheline, the HCA prefetcher could grab the 64-byte
3366          * chunk and get a valid (!= 0xffffffff) byte count but
3367          * stale data, and end up sending the wrong data.
3368          */
3369         wmb();
3370
3371         iseg->byte_count = cpu_to_be32((1 << 31) | 4);
3372 }
3373
3374 static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
3375 {
3376         dseg->lkey       = cpu_to_be32(sg->lkey);
3377         dseg->addr       = cpu_to_be64(sg->addr);
3378
3379         /*
3380          * Need a barrier here before writing the byte_count field to
3381          * make sure that all the data is visible before the
3382          * byte_count field is set.  Otherwise, if the segment begins
3383          * a new cacheline, the HCA prefetcher could grab the 64-byte
3384          * chunk and get a valid (!= 0xffffffff) byte count but
3385          * stale data, and end up sending the wrong data.
3386          */
3387         wmb();
3388
3389         dseg->byte_count = cpu_to_be32(sg->length);
3390 }
3391
3392 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
3393 {
3394         dseg->byte_count = cpu_to_be32(sg->length);
3395         dseg->lkey       = cpu_to_be32(sg->lkey);
3396         dseg->addr       = cpu_to_be64(sg->addr);
3397 }
3398
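/*
 * Copy the LSO header into the WQE and report its 16-byte-aligned
 * size.  A header larger than a cache line sets the blh bit that is
 * later folded into the owner/opcode word.  The request is rejected if
 * the QP was not created with LSO support and the header would eat
 * into the scatter/gather space.
 */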
3399 static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe,
3400                          const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp,
3401                          unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh)
3402 {
3403         unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16);
3404
3405         if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
3406                 *blh = cpu_to_be32(1 << 6);
3407
3408         if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
3409                      wr->wr.num_sge > qp->sq.max_gs - (halign >> 4)))
3410                 return -EINVAL;
3411
3412         memcpy(wqe->header, wr->header, wr->hlen);
3413
3414         *lso_hdr_sz  = cpu_to_be32(wr->mss << 16 | wr->hlen);
3415         *lso_seg_len = halign;
3416         return 0;
3417 }
3418
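/*
 * Immediate/extended header data for the control segment: immediate
 * data for *_WITH_IMM opcodes, the rkey to invalidate for
 * SEND_WITH_INV, zero otherwise.
 */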
3419 static __be32 send_ieth(const struct ib_send_wr *wr)
3420 {
3421         switch (wr->opcode) {
3422         case IB_WR_SEND_WITH_IMM:
3423         case IB_WR_RDMA_WRITE_WITH_IMM:
3424                 return wr->ex.imm_data;
3425
3426         case IB_WR_SEND_WITH_INV:
3427                 return cpu_to_be32(wr->ex.invalidate_rkey);
3428
3429         default:
3430                 return 0;
3431         }
3432 }
3433
3434 static void add_zero_len_inline(void *wqe)
3435 {
3436         struct mlx4_wqe_inline_seg *inl = wqe;
3437         memset(wqe, 0, 16);
3438         inl->byte_count = cpu_to_be32(1 << 31);
3439 }
3440
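/*
 * Post a chain of send work requests.  For each WR this builds the
 * control segment, any transport-specific segments (RDMA/atomic,
 * datagram, tunnel or MLX headers, LSO) and the data segments (written
 * in reverse order), then sets the ownership bit and rings the send
 * doorbell once for the whole chain.  GSI sends are redirected to the
 * RoCE v2 GSI QP when the destination GID is of RoCE v2 type.
 */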
3441 static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
3442                               const struct ib_send_wr **bad_wr, bool drain)
3443 {
3444         struct mlx4_ib_qp *qp = to_mqp(ibqp);
3445         void *wqe;
3446         struct mlx4_wqe_ctrl_seg *ctrl;
3447         struct mlx4_wqe_data_seg *dseg;
3448         unsigned long flags;
3449         int nreq;
3450         int err = 0;
3451         unsigned ind;
3452         int uninitialized_var(size);
3453         unsigned uninitialized_var(seglen);
3454         __be32 dummy;
3455         __be32 *lso_wqe;
3456         __be32 uninitialized_var(lso_hdr_sz);
3457         __be32 blh;
3458         int i;
3459         struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
3460
3461         if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
3462                 struct mlx4_ib_sqp *sqp = to_msqp(qp);
3463
3464                 if (sqp->roce_v2_gsi) {
3465                         struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
3466                         enum ib_gid_type gid_type;
3467                         union ib_gid gid;
3468
3469                         if (!fill_gid_by_hw_index(mdev, sqp->qp.port,
3470                                            ah->av.ib.gid_index,
3471                                            &gid, &gid_type))
3472                                 qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
3473                                                 to_mqp(sqp->roce_v2_gsi) : qp;
3474                         else
3475                                 pr_err("Failed to get gid at index %d. RoCEv2 will not work properly\n",
3476                                        ah->av.ib.gid_index);
3477                 }
3478         }
3479
3480         spin_lock_irqsave(&qp->sq.lock, flags);
3481         if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
3482             !drain) {
3483                 err = -EIO;
3484                 *bad_wr = wr;
3485                 nreq = 0;
3486                 goto out;
3487         }
3488
3489         ind = qp->sq_next_wqe;
3490
3491         for (nreq = 0; wr; ++nreq, wr = wr->next) {
3492                 lso_wqe = &dummy;
3493                 blh = 0;
3494
3495                 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
3496                         err = -ENOMEM;
3497                         *bad_wr = wr;
3498                         goto out;
3499                 }
3500
3501                 if (unlikely(wr->num_sge > qp->sq.max_gs)) {
3502                         err = -EINVAL;
3503                         *bad_wr = wr;
3504                         goto out;
3505                 }
3506
3507                 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
3508                 qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
3509
3510                 ctrl->srcrb_flags =
3511                         (wr->send_flags & IB_SEND_SIGNALED ?
3512                          cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
3513                         (wr->send_flags & IB_SEND_SOLICITED ?
3514                          cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
3515                         ((wr->send_flags & IB_SEND_IP_CSUM) ?
3516                          cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
3517                                      MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
3518                         qp->sq_signal_bits;
3519
3520                 ctrl->imm = send_ieth(wr);
3521
3522                 wqe += sizeof *ctrl;
3523                 size = sizeof *ctrl / 16;
3524
3525                 switch (qp->mlx4_ib_qp_type) {
3526                 case MLX4_IB_QPT_RC:
3527                 case MLX4_IB_QPT_UC:
3528                         switch (wr->opcode) {
3529                         case IB_WR_ATOMIC_CMP_AND_SWP:
3530                         case IB_WR_ATOMIC_FETCH_AND_ADD:
3531                         case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
3532                                 set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
3533                                               atomic_wr(wr)->rkey);
3534                                 wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3535
3536                                 set_atomic_seg(wqe, atomic_wr(wr));
3537                                 wqe  += sizeof (struct mlx4_wqe_atomic_seg);
3538
3539                                 size += (sizeof (struct mlx4_wqe_raddr_seg) +
3540                                          sizeof (struct mlx4_wqe_atomic_seg)) / 16;
3541
3542                                 break;
3543
3544                         case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
3545                                 set_raddr_seg(wqe, atomic_wr(wr)->remote_addr,
3546                                               atomic_wr(wr)->rkey);
3547                                 wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3548
3549                                 set_masked_atomic_seg(wqe, atomic_wr(wr));
3550                                 wqe  += sizeof (struct mlx4_wqe_masked_atomic_seg);
3551
3552                                 size += (sizeof (struct mlx4_wqe_raddr_seg) +
3553                                          sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
3554
3555                                 break;
3556
3557                         case IB_WR_RDMA_READ:
3558                         case IB_WR_RDMA_WRITE:
3559                         case IB_WR_RDMA_WRITE_WITH_IMM:
3560                                 set_raddr_seg(wqe, rdma_wr(wr)->remote_addr,
3561                                               rdma_wr(wr)->rkey);
3562                                 wqe  += sizeof (struct mlx4_wqe_raddr_seg);
3563                                 size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
3564                                 break;
3565
3566                         case IB_WR_LOCAL_INV:
3567                                 ctrl->srcrb_flags |=
3568                                         cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3569                                 set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
3570                                 wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
3571                                 size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
3572                                 break;
3573
3574                         case IB_WR_REG_MR:
3575                                 ctrl->srcrb_flags |=
3576                                         cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3577                                 set_reg_seg(wqe, reg_wr(wr));
3578                                 wqe  += sizeof(struct mlx4_wqe_fmr_seg);
3579                                 size += sizeof(struct mlx4_wqe_fmr_seg) / 16;
3580                                 break;
3581
3582                         default:
3583                                 /* No extra segments required for sends */
3584                                 break;
3585                         }
3586                         break;
3587
3588                 case MLX4_IB_QPT_TUN_SMI_OWNER:
3589                         err =  build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
3590                                         ctrl, &seglen);
3591                         if (unlikely(err)) {
3592                                 *bad_wr = wr;
3593                                 goto out;
3594                         }
3595                         wqe  += seglen;
3596                         size += seglen / 16;
3597                         break;
3598                 case MLX4_IB_QPT_TUN_SMI:
3599                 case MLX4_IB_QPT_TUN_GSI:
3600                         /* this is a UD qp used in MAD responses to slaves. */
3601                         set_datagram_seg(wqe, ud_wr(wr));
3602                         /* set the forced-loopback bit in the data seg av */
3603                         *(__be32 *) wqe |= cpu_to_be32(0x80000000);
3604                         wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3605                         size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3606                         break;
3607                 case MLX4_IB_QPT_UD:
3608                         set_datagram_seg(wqe, ud_wr(wr));
3609                         wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3610                         size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3611
3612                         if (wr->opcode == IB_WR_LSO) {
3613                                 err = build_lso_seg(wqe, ud_wr(wr), qp, &seglen,
3614                                                 &lso_hdr_sz, &blh);
3615                                 if (unlikely(err)) {
3616                                         *bad_wr = wr;
3617                                         goto out;
3618                                 }
3619                                 lso_wqe = (__be32 *) wqe;
3620                                 wqe  += seglen;
3621                                 size += seglen / 16;
3622                         }
3623                         break;
3624
3625                 case MLX4_IB_QPT_PROXY_SMI_OWNER:
3626                         err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
3627                                         ctrl, &seglen);
3628                         if (unlikely(err)) {
3629                                 *bad_wr = wr;
3630                                 goto out;
3631                         }
3632                         wqe  += seglen;
3633                         size += seglen / 16;
3634                         /* to start tunnel header on a cache-line boundary */
3635                         add_zero_len_inline(wqe);
3636                         wqe += 16;
3637                         size++;
3638                         build_tunnel_header(ud_wr(wr), wqe, &seglen);
3639                         wqe  += seglen;
3640                         size += seglen / 16;
3641                         break;
3642                 case MLX4_IB_QPT_PROXY_SMI:
3643                 case MLX4_IB_QPT_PROXY_GSI:
3644                         /* If we are tunneling special qps, this is a UD qp.
3645                          * In this case we first add a UD segment targeting
3646                          * the tunnel qp, and then add a header with address
3647                          * information */
3648                         set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe,
3649                                                 ud_wr(wr),
3650                                                 qp->mlx4_ib_qp_type);
3651                         wqe  += sizeof (struct mlx4_wqe_datagram_seg);
3652                         size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
3653                         build_tunnel_header(ud_wr(wr), wqe, &seglen);
3654                         wqe  += seglen;
3655                         size += seglen / 16;
3656                         break;
3657
3658                 case MLX4_IB_QPT_SMI:
3659                 case MLX4_IB_QPT_GSI:
3660                         err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl,
3661                                         &seglen);
3662                         if (unlikely(err)) {
3663                                 *bad_wr = wr;
3664                                 goto out;
3665                         }
3666                         wqe  += seglen;
3667                         size += seglen / 16;
3668                         break;
3669
3670                 default:
3671                         break;
3672                 }
3673
3674                 /*
3675                  * Write data segments in reverse order, so as to
3676                  * overwrite cacheline stamp last within each
3677                  * cacheline.  This avoids issues with WQE
3678                  * prefetching.
3679                  */
3680
3681                 dseg = wqe;
3682                 dseg += wr->num_sge - 1;
3683                 size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
3684
3685                 /* Add one more inline data segment for ICRC for MLX sends */
3686                 if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
3687                              qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
3688                              qp->mlx4_ib_qp_type &
3689                              (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
3690                         set_mlx_icrc_seg(dseg + 1);
3691                         size += sizeof (struct mlx4_wqe_data_seg) / 16;
3692                 }
3693
3694                 for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
3695                         set_data_seg(dseg, wr->sg_list + i);
3696
3697                 /*
3698                  * Possibly overwrite stamping in cacheline with LSO
3699                  * segment only after making sure all data segments
3700                  * are written.
3701                  */
3702                 wmb();
3703                 *lso_wqe = lso_hdr_sz;
3704
3705                 ctrl->qpn_vlan.fence_size = (wr->send_flags & IB_SEND_FENCE ?
3706                                              MLX4_WQE_CTRL_FENCE : 0) | size;
3707
3708                 /*
3709                  * Make sure descriptor is fully written before
3710                  * setting ownership bit (because HW can start
3711                  * executing as soon as we do).
3712                  */
3713                 wmb();
3714
3715                 if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
3716                         *bad_wr = wr;
3717                         err = -EINVAL;
3718                         goto out;
3719                 }
3720
3721                 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
3722                         (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
3723
3724                 /*
3725                  * We can improve latency by not stamping the last
3726                  * send queue WQE until after ringing the doorbell, so
3727                  * only stamp here if there are still more WQEs to post.
3728                  */
3729                 if (wr->next)
3730                         stamp_send_wqe(qp, ind + qp->sq_spare_wqes);
3731                 ind++;
3732         }
3733
3734 out:
3735         if (likely(nreq)) {
3736                 qp->sq.head += nreq;
3737
3738                 /*
3739                  * Make sure that descriptors are written before
3740                  * doorbell record.
3741                  */
3742                 wmb();
3743
3744                 writel_relaxed(qp->doorbell_qpn,
3745                         to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
3746
3747                 /*
3748                  * Make sure doorbells don't leak out of SQ spinlock
3749                  * and reach the HCA out of order.
3750                  */
3751                 mmiowb();
3752
3753                 stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1);
3754
3755                 qp->sq_next_wqe = ind;
3756         }
3757
3758         spin_unlock_irqrestore(&qp->sq.lock, flags);
3759
3760         return err;
3761 }
3762
3763 int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
3764                       const struct ib_send_wr **bad_wr)
3765 {
3766         return _mlx4_ib_post_send(ibqp, wr, bad_wr, false);
3767 }
3768
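/*
 * Post a chain of receive work requests.  For proxy QPs the first
 * scatter entry is reserved as a landing buffer for the tunnel header;
 * the remaining entries are filled from the WR's sg_list, terminated
 * with an invalid-lkey entry when shorter than max_gs, and the receive
 * doorbell record is updated once at the end.
 */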
3769 static int _mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
3770                               const struct ib_recv_wr **bad_wr, bool drain)
3771 {
3772         struct mlx4_ib_qp *qp = to_mqp(ibqp);
3773         struct mlx4_wqe_data_seg *scat;
3774         unsigned long flags;
3775         int err = 0;
3776         int nreq;
3777         int ind;
3778         int max_gs;
3779         int i;
3780         struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
3781
3782         max_gs = qp->rq.max_gs;
3783         spin_lock_irqsave(&qp->rq.lock, flags);
3784
3785         if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR &&
3786             !drain) {
3787                 err = -EIO;
3788                 *bad_wr = wr;
3789                 nreq = 0;
3790                 goto out;
3791         }
3792
3793         ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
3794
3795         for (nreq = 0; wr; ++nreq, wr = wr->next) {
3796                 if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
3797                         err = -ENOMEM;
3798                         *bad_wr = wr;
3799                         goto out;
3800                 }
3801
3802                 if (unlikely(wr->num_sge > qp->rq.max_gs)) {
3803                         err = -EINVAL;
3804                         *bad_wr = wr;
3805                         goto out;
3806                 }
3807
3808                 scat = get_recv_wqe(qp, ind);
3809
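                     /*
                      * For the SR-IOV proxy special QPs, point the first
                      * scatter entry at the driver-owned sqp_proxy_rcv
                      * buffer so the HW lands the tunnel header there;
                      * this consumes one of the WQE's scatter entries
                      * (max_gs is reduced accordingly).
                      */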
3810                 if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
3811                     MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
3812                         ib_dma_sync_single_for_device(ibqp->device,
3813                                                       qp->sqp_proxy_rcv[ind].map,
3814                                                       sizeof (struct mlx4_ib_proxy_sqp_hdr),
3815                                                       DMA_FROM_DEVICE);
3816                         scat->byte_count =
3817                                 cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
3818                         /* use dma lkey from upper layer entry */
3819                         scat->lkey = cpu_to_be32(wr->sg_list->lkey);
3820                         scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
3821                         scat++;
3822                         max_gs--;
3823                 }
3824
3825                 for (i = 0; i < wr->num_sge; ++i)
3826                         __set_data_seg(scat + i, wr->sg_list + i);
3827
3828                 if (i < max_gs) {
3829                         scat[i].byte_count = 0;
3830                         scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
3831                         scat[i].addr       = 0;
3832                 }
3833
3834                 qp->rq.wrid[ind] = wr->wr_id;
3835
3836                 ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
3837         }
3838
3839 out:
3840         if (likely(nreq)) {
3841                 qp->rq.head += nreq;
3842
3843                 /*
3844                  * Make sure that descriptors are written before
3845                  * doorbell record.
3846                  */
3847                 wmb();
3848
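                     /*
                      * The RQ doorbell is a record in host memory: storing
                      * the new head (low 16 bits) tells the HW how many
                      * receive WQEs have been made available.
                      */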
3849                 *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
3850         }
3851
3852         spin_unlock_irqrestore(&qp->rq.lock, flags);
3853
3854         return err;
3855 }
3856
3857 int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
3858                       const struct ib_recv_wr **bad_wr)
3859 {
3860         return _mlx4_ib_post_recv(ibqp, wr, bad_wr, false);
3861 }
3862
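     /*
      * Translation helpers used by mlx4_ib_query_qp() below: convert the
      * firmware encodings of QP state, path-migration state and remote
      * access flags back into their IB counterparts.
      */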
3863 static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
3864 {
3865         switch (mlx4_state) {
3866         case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
3867         case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
3868         case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
3869         case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
3870         case MLX4_QP_STATE_SQ_DRAINING:
3871         case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
3872         case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
3873         case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
3874         default:                     return -1;
3875         }
3876 }
3877
3878 static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
3879 {
3880         switch (mlx4_mig_state) {
3881         case MLX4_QP_PM_ARMED:          return IB_MIG_ARMED;
3882         case MLX4_QP_PM_REARM:          return IB_MIG_REARM;
3883         case MLX4_QP_PM_MIGRATED:       return IB_MIG_MIGRATED;
3884         default: return -1;
3885         }
3886 }
3887
3888 static int to_ib_qp_access_flags(int mlx4_flags)
3889 {
3890         int ib_flags = 0;
3891
3892         if (mlx4_flags & MLX4_QP_BIT_RRE)
3893                 ib_flags |= IB_ACCESS_REMOTE_READ;
3894         if (mlx4_flags & MLX4_QP_BIT_RWE)
3895                 ib_flags |= IB_ACCESS_REMOTE_WRITE;
3896         if (mlx4_flags & MLX4_QP_BIT_RAE)
3897                 ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
3898
3899         return ib_flags;
3900 }
3901
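     /*
      * Rebuild an rdma_ah_attr from the firmware path structure: bit 6 of
      * sched_queue selects the port, the SL is packed differently for RoCE
      * and IB, and bit 7 of grh_mylmc indicates that a GRH is present.
      */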
3902 static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev,
3903                             struct rdma_ah_attr *ah_attr,
3904                             struct mlx4_qp_path *path)
3905 {
3906         struct mlx4_dev *dev = ibdev->dev;
3907         u8 port_num = path->sched_queue & 0x40 ? 2 : 1;
3908
3909         memset(ah_attr, 0, sizeof(*ah_attr));
3910         if (port_num == 0 || port_num > dev->caps.num_ports)
3911                 return;
3912         ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num);
3913
3914         if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE)
3915                 rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) |
3916                                ((path->sched_queue & 4) << 1));
3917         else
3918                 rdma_ah_set_sl(ah_attr, (path->sched_queue >> 2) & 0xf);
3919         rdma_ah_set_port_num(ah_attr, port_num);
3920
3921         rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid));
3922         rdma_ah_set_path_bits(ah_attr, path->grh_mylmc & 0x7f);
3923         rdma_ah_set_static_rate(ah_attr,
3924                                 path->static_rate ? path->static_rate - 5 : 0);
3925         if (path->grh_mylmc & (1 << 7)) {
3926                 rdma_ah_set_grh(ah_attr, NULL,
3927                                 be32_to_cpu(path->tclass_flowlabel) & 0xfffff,
3928                                 path->mgid_index,
3929                                 path->hop_limit,
3930                                 (be32_to_cpu(path->tclass_flowlabel)
3931                                  >> 20) & 0xff);
3932                 rdma_ah_set_dgid_raw(ah_attr, path->rgid);
3933         }
3934 }
3935
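     /*
      * Query a QP's current attributes.  A QP in the RESET state is
      * answered purely from software state; otherwise the context is read
      * from the firmware with mlx4_qp_query() and translated field by
      * field into the IB attribute structures.
      */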
3936 int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
3937                      struct ib_qp_init_attr *qp_init_attr)
3938 {
3939         struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
3940         struct mlx4_ib_qp *qp = to_mqp(ibqp);
3941         struct mlx4_qp_context context;
3942         int mlx4_state;
3943         int err = 0;
3944
3945         if (ibqp->rwq_ind_tbl)
3946                 return -EOPNOTSUPP;
3947
3948         mutex_lock(&qp->mutex);
3949
3950         if (qp->state == IB_QPS_RESET) {
3951                 qp_attr->qp_state = IB_QPS_RESET;
3952                 goto done;
3953         }
3954
3955         err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
3956         if (err) {
3957                 err = -EINVAL;
3958                 goto out;
3959         }
3960
3961         mlx4_state = be32_to_cpu(context.flags) >> 28;
3962
3963         qp->state                    = to_ib_qp_state(mlx4_state);
3964         qp_attr->qp_state            = qp->state;
3965         qp_attr->path_mtu            = context.mtu_msgmax >> 5;
3966         qp_attr->path_mig_state      =
3967                 to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
3968         qp_attr->qkey                = be32_to_cpu(context.qkey);
3969         qp_attr->rq_psn              = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
3970         qp_attr->sq_psn              = be32_to_cpu(context.next_send_psn) & 0xffffff;
3971         qp_attr->dest_qp_num         = be32_to_cpu(context.remote_qpn) & 0xffffff;
3972         qp_attr->qp_access_flags     =
3973                 to_ib_qp_access_flags(be32_to_cpu(context.params2));
3974
3975         if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
3976                 to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
3977                 to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
3978                 qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
3979                 qp_attr->alt_port_num   =
3980                         rdma_ah_get_port_num(&qp_attr->alt_ah_attr);
3981         }
3982
3983         qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
3984         if (qp_attr->qp_state == IB_QPS_INIT)
3985                 qp_attr->port_num = qp->port;
3986         else
3987                 qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
3988
3989         /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
3990         qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
3991
3992         qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
3993
3994         qp_attr->max_dest_rd_atomic =
3995                 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
3996         qp_attr->min_rnr_timer      =
3997                 (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
3998         qp_attr->timeout            = context.pri_path.ackto >> 3;
3999         qp_attr->retry_cnt          = (be32_to_cpu(context.params1) >> 16) & 0x7;
4000         qp_attr->rnr_retry          = (be32_to_cpu(context.params1) >> 13) & 0x7;
4001         qp_attr->alt_timeout        = context.alt_path.ackto >> 3;
4002
4003 done:
4004         qp_attr->cur_qp_state        = qp_attr->qp_state;
4005         qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
4006         qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
4007
4008         if (!ibqp->uobject) {
4009                 qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
4010                 qp_attr->cap.max_send_sge = qp->sq.max_gs;
4011         } else {
4012                 qp_attr->cap.max_send_wr  = 0;
4013                 qp_attr->cap.max_send_sge = 0;
4014         }
4015
4016         /*
4017          * We don't support inline sends for kernel QPs (yet), and we
4018          * don't know what userspace's value should be.
4019          */
4020         qp_attr->cap.max_inline_data = 0;
4021
4022         qp_init_attr->cap            = qp_attr->cap;
4023
4024         qp_init_attr->create_flags = 0;
4025         if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
4026                 qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
4027
4028         if (qp->flags & MLX4_IB_QP_LSO)
4029                 qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
4030
4031         if (qp->flags & MLX4_IB_QP_NETIF)
4032                 qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
4033
4034         qp_init_attr->sq_sig_type =
4035                 qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
4036                 IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
4037
4038 out:
4039         mutex_unlock(&qp->mutex);
4040         return err;
4041 }
4042
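     /*
      * A receive WQ is implemented on top of a RAW_PACKET QP created with
      * the MLX4_IB_RWQ_SRC source type; only its receive side is used, and
      * the caller's CQ is also passed in as a dummy send CQ.
      */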
4043 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
4044                                 struct ib_wq_init_attr *init_attr,
4045                                 struct ib_udata *udata)
4046 {
4047         struct mlx4_ib_dev *dev;
4048         struct ib_qp_init_attr ib_qp_init_attr;
4049         struct mlx4_ib_qp *qp;
4050         struct mlx4_ib_create_wq ucmd;
4051         int err, required_cmd_sz;
4052
4053         if (!udata)
4054                 return ERR_PTR(-EINVAL);
4055
4056         required_cmd_sz = offsetof(typeof(ucmd), comp_mask) +
4057                           sizeof(ucmd.comp_mask);
4058         if (udata->inlen < required_cmd_sz) {
4059                 pr_debug("invalid inlen\n");
4060                 return ERR_PTR(-EINVAL);
4061         }
4062
4063         if (udata->inlen > sizeof(ucmd) &&
4064             !ib_is_udata_cleared(udata, sizeof(ucmd),
4065                                  udata->inlen - sizeof(ucmd))) {
4066                 pr_debug("inlen is not supported\n");
4067                 return ERR_PTR(-EOPNOTSUPP);
4068         }
4069
4070         if (udata->outlen)
4071                 return ERR_PTR(-EOPNOTSUPP);
4072
4073         dev = to_mdev(pd->device);
4074
4075         if (init_attr->wq_type != IB_WQT_RQ) {
4076                 pr_debug("unsupported wq type %d\n", init_attr->wq_type);
4077                 return ERR_PTR(-EOPNOTSUPP);
4078         }
4079
4080         if (init_attr->create_flags & ~IB_WQ_FLAGS_SCATTER_FCS) {
4081                 pr_debug("unsupported create_flags %u\n",
4082                          init_attr->create_flags);
4083                 return ERR_PTR(-EOPNOTSUPP);
4084         }
4085
4086         qp = kzalloc(sizeof(*qp), GFP_KERNEL);
4087         if (!qp)
4088                 return ERR_PTR(-ENOMEM);
4089
4090         qp->pri.vid = 0xFFFF;
4091         qp->alt.vid = 0xFFFF;
4092
4093         memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr));
4094         ib_qp_init_attr.qp_context = init_attr->wq_context;
4095         ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET;
4096         ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr;
4097         ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge;
4098         ib_qp_init_attr.recv_cq = init_attr->cq;
4099         ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */
4100
4101         if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS)
4102                 ib_qp_init_attr.create_flags |= IB_QP_CREATE_SCATTER_FCS;
4103
4104         err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr,
4105                                udata, 0, &qp);
4106         if (err) {
4107                 kfree(qp);
4108                 return ERR_PTR(err);
4109         }
4110
4111         qp->ibwq.event_handler = init_attr->event_handler;
4112         qp->ibwq.wq_num = qp->mqp.qpn;
4113         qp->ibwq.state = IB_WQS_RESET;
4114
4115         return &qp->ibwq;
4116 }
4117
4118 static int ib_wq2qp_state(enum ib_wq_state state)
4119 {
4120         switch (state) {
4121         case IB_WQS_RESET:
4122                 return IB_QPS_RESET;
4123         case IB_WQS_RDY:
4124                 return IB_QPS_RTR;
4125         default:
4126                 return IB_QPS_ERR;
4127         }
4128 }
4129
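     /*
      * Bringing a WQ to RDY takes two transitions on the underlying HW QP:
      * RESET->INIT (which applies the port) followed by INIT->RTR.  If the
      * second step fails, the QP is moved back to RESET so the HW state and
      * the recorded software state stay consistent.
      */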
4130 static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state,
4131                               struct ib_udata *udata)
4132 {
4133         struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4134         enum ib_qp_state qp_cur_state;
4135         enum ib_qp_state qp_new_state;
4136         int attr_mask;
4137         int err;
4138
4139         /* ib_qp.state represents the WQ HW state while ib_wq.state represents
4140          * the WQ logical state.
4141          */
4142         qp_cur_state = qp->state;
4143         qp_new_state = ib_wq2qp_state(new_state);
4144
4145         if (ib_wq2qp_state(new_state) == qp_cur_state)
4146                 return 0;
4147
4148         if (new_state == IB_WQS_RDY) {
4149                 struct ib_qp_attr attr = {};
4150
4151                 attr.port_num = qp->port;
4152                 attr_mask = IB_QP_PORT;
4153
4154                 err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr,
4155                                           attr_mask, IB_QPS_RESET, IB_QPS_INIT,
4156                                           udata);
4157                 if (err) {
4158                         pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n",
4159                                  ibwq->wq_num);
4160                         return err;
4161                 }
4162
4163                 qp_cur_state = IB_QPS_INIT;
4164         }
4165
4166         attr_mask = 0;
4167         err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask,
4168                                   qp_cur_state, qp_new_state, udata);
4169
4170         if (err && (qp_cur_state == IB_QPS_INIT)) {
4171                 qp_new_state = IB_QPS_RESET;
4172                 if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL,
4173                                         attr_mask, IB_QPS_INIT, IB_QPS_RESET,
4174                                         udata)) {
4175                         pr_warn("WQN=0x%06x failed to revert HW resources\n",
4176                                 ibwq->wq_num);
4177                         qp_new_state = IB_QPS_INIT;
4178                 }
4179         }
4180
4181         qp->state = qp_new_state;
4182
4183         return err;
4184 }
4185
4186 int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
4187                       u32 wq_attr_mask, struct ib_udata *udata)
4188 {
4189         struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4190         struct mlx4_ib_modify_wq ucmd = {};
4191         size_t required_cmd_sz;
4192         enum ib_wq_state cur_state, new_state;
4193         int err = 0;
4194
4195         required_cmd_sz = offsetof(typeof(ucmd), reserved) +
4196                                    sizeof(ucmd.reserved);
4197         if (udata->inlen < required_cmd_sz)
4198                 return -EINVAL;
4199
4200         if (udata->inlen > sizeof(ucmd) &&
4201             !ib_is_udata_cleared(udata, sizeof(ucmd),
4202                                  udata->inlen - sizeof(ucmd)))
4203                 return -EOPNOTSUPP;
4204
4205         if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)))
4206                 return -EFAULT;
4207
4208         if (ucmd.comp_mask || ucmd.reserved)
4209                 return -EOPNOTSUPP;
4210
4211         if (wq_attr_mask & IB_WQ_FLAGS)
4212                 return -EOPNOTSUPP;
4213
4214         cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state :
4215                                                      ibwq->state;
4216         new_state = wq_attr_mask & IB_WQ_STATE ? wq_attr->wq_state : cur_state;
4217
4218         if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR ||
4219             new_state < IB_WQS_RESET || new_state > IB_WQS_ERR)
4220                 return -EINVAL;
4221
4222         if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR))
4223                 return -EINVAL;
4224
4225         if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET))
4226                 return -EINVAL;
4227
4228         /* Protect against the parent RSS QP, which may also modify the WQ
4229          * state.
4230          */
4231         mutex_lock(&qp->mutex);
4232
4233         /* The HW state can be updated only if an RSS QP has already been
4234          * associated with this WQ, so that its port can be applied to the WQ.
4235          */
4236         if (qp->rss_usecnt)
4237                 err = _mlx4_ib_modify_wq(ibwq, new_state, udata);
4238
4239         if (!err)
4240                 ibwq->state = new_state;
4241
4242         mutex_unlock(&qp->mutex);
4243
4244         return err;
4245 }
4246
4247 int mlx4_ib_destroy_wq(struct ib_wq *ibwq)
4248 {
4249         struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
4250         struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
4251
4252         if (qp->counter_index)
4253                 mlx4_ib_free_qp_counter(dev, qp);
4254
4255         destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1);
4256
4257         kfree(qp);
4258
4259         return 0;
4260 }
4261
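     /*
      * No firmware command is issued here: the function only validates the
      * indirection table layout (size within the device limit, base WQN a
      * multiple of the table size, member WQNs consecutive) and allocates
      * the table object.
      */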
4262 struct ib_rwq_ind_table
4263 *mlx4_ib_create_rwq_ind_table(struct ib_device *device,
4264                               struct ib_rwq_ind_table_init_attr *init_attr,
4265                               struct ib_udata *udata)
4266 {
4267         struct ib_rwq_ind_table *rwq_ind_table;
4268         struct mlx4_ib_create_rwq_ind_tbl_resp resp = {};
4269         unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size;
4270         unsigned int base_wqn;
4271         size_t min_resp_len;
4272         int i;
4273         int err;
4274
4275         if (udata->inlen > 0 &&
4276             !ib_is_udata_cleared(udata, 0,
4277                                  udata->inlen))
4278                 return ERR_PTR(-EOPNOTSUPP);
4279
4280         min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
4281         if (udata->outlen && udata->outlen < min_resp_len)
4282                 return ERR_PTR(-EINVAL);
4283
4284         if (ind_tbl_size >
4285             device->attrs.rss_caps.max_rwq_indirection_table_size) {
4286                 pr_debug("ind_tbl_size = %d is bigger than supported = %d\n",
4287                          ind_tbl_size,
4288                          device->attrs.rss_caps.max_rwq_indirection_table_size);
4289                 return ERR_PTR(-EINVAL);
4290         }
4291
4292         base_wqn = init_attr->ind_tbl[0]->wq_num;
4293
4294         if (base_wqn % ind_tbl_size) {
4295                 pr_debug("WQN=0x%x isn't aligned with indirection table size\n",
4296                          base_wqn);
4297                 return ERR_PTR(-EINVAL);
4298         }
4299
4300         for (i = 1; i < ind_tbl_size; i++) {
4301                 if (++base_wqn != init_attr->ind_tbl[i]->wq_num) {
4302                         pr_debug("indirection table's WQNs aren't consecutive\n");
4303                         return ERR_PTR(-EINVAL);
4304                 }
4305         }
4306
4307         rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL);
4308         if (!rwq_ind_table)
4309                 return ERR_PTR(-ENOMEM);
4310
4311         if (udata->outlen) {
4312                 resp.response_length = offsetof(typeof(resp), response_length) +
4313                                         sizeof(resp.response_length);
4314                 err = ib_copy_to_udata(udata, &resp, resp.response_length);
4315                 if (err)
4316                         goto err;
4317         }
4318
4319         return rwq_ind_table;
4320
4321 err:
4322         kfree(rwq_ind_table);
4323         return ERR_PTR(err);
4324 }
4325
4326 int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
4327 {
4328         kfree(ib_rwq_ind_tbl);
4329         return 0;
4330 }
4331
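     /*
      * Queue draining: the QP is moved to the error state and a single
      * marker WR is posted with its own completion handler.  Since
      * completions on a queue are delivered in order, once the marker
      * completes every WR posted before it has been flushed as well.
      */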
4332 struct mlx4_ib_drain_cqe {
4333         struct ib_cqe cqe;
4334         struct completion done;
4335 };
4336
4337 static void mlx4_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc)
4338 {
4339         struct mlx4_ib_drain_cqe *cqe = container_of(wc->wr_cqe,
4340                                                      struct mlx4_ib_drain_cqe,
4341                                                      cqe);
4342
4343         complete(&cqe->done);
4344 }
4345
4346 /* This function returns only once the drain WR has completed */
4347 static void handle_drain_completion(struct ib_cq *cq,
4348                                     struct mlx4_ib_drain_cqe *sdrain,
4349                                     struct mlx4_ib_dev *dev)
4350 {
4351         struct mlx4_dev *mdev = dev->dev;
4352
4353         if (cq->poll_ctx == IB_POLL_DIRECT) {
4354                 while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0)
4355                         ib_process_cq_direct(cq, -1);
4356                 return;
4357         }
4358
4359         if (mdev->persist->state == MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4360                 struct mlx4_ib_cq *mcq = to_mcq(cq);
4361                 bool triggered = false;
4362                 unsigned long flags;
4363
4364                 spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
4365                 /* Make sure that the CQ handler won't run if it hasn't run yet */
4366                 if (!mcq->mcq.reset_notify_added)
4367                         mcq->mcq.reset_notify_added = 1;
4368                 else
4369                         triggered = true;
4370                 spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
4371
4372                 if (triggered) {
4373                         /* Wait for any scheduled/running task to finish */
4374                         switch (cq->poll_ctx) {
4375                         case IB_POLL_SOFTIRQ:
4376                                 irq_poll_disable(&cq->iop);
4377                                 irq_poll_enable(&cq->iop);
4378                                 break;
4379                         case IB_POLL_WORKQUEUE:
4380                                 cancel_work_sync(&cq->work);
4381                                 break;
4382                         default:
4383                                 WARN_ON_ONCE(1);
4384                         }
4385                 }
4386
4387                 /* Run the CQ handler - this makes sure that the drain WR will
4388                  * be processed if it wasn't processed yet.
4389                  */
4390                 mcq->mcq.comp(&mcq->mcq);
4391         }
4392
4393         wait_for_completion(&sdrain->done);
4394 }
4395
4396 void mlx4_ib_drain_sq(struct ib_qp *qp)
4397 {
4398         struct ib_cq *cq = qp->send_cq;
4399         struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
4400         struct mlx4_ib_drain_cqe sdrain;
4401         const struct ib_send_wr *bad_swr;
4402         struct ib_rdma_wr swr = {
4403                 .wr = {
4404                         .next = NULL,
4405                         { .wr_cqe       = &sdrain.cqe, },
4406                         .opcode = IB_WR_RDMA_WRITE,
4407                 },
4408         };
4409         int ret;
4410         struct mlx4_ib_dev *dev = to_mdev(qp->device);
4411         struct mlx4_dev *mdev = dev->dev;
4412
4413         ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
4414         if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4415                 WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
4416                 return;
4417         }
4418
4419         sdrain.cqe.done = mlx4_ib_drain_qp_done;
4420         init_completion(&sdrain.done);
4421
4422         ret = _mlx4_ib_post_send(qp, &swr.wr, &bad_swr, true);
4423         if (ret) {
4424                 WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
4425                 return;
4426         }
4427
4428         handle_drain_completion(cq, &sdrain, dev);
4429 }
4430
4431 void mlx4_ib_drain_rq(struct ib_qp *qp)
4432 {
4433         struct ib_cq *cq = qp->recv_cq;
4434         struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
4435         struct mlx4_ib_drain_cqe rdrain;
4436         struct ib_recv_wr rwr = {};
4437         const struct ib_recv_wr *bad_rwr;
4438         int ret;
4439         struct mlx4_ib_dev *dev = to_mdev(qp->device);
4440         struct mlx4_dev *mdev = dev->dev;
4441
4442         ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
4443         if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) {
4444                 WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
4445                 return;
4446         }
4447
4448         rwr.wr_cqe = &rdrain.cqe;
4449         rdrain.cqe.done = mlx4_ib_drain_qp_done;
4450         init_completion(&rdrain.done);
4451
4452         ret = _mlx4_ib_post_recv(qp, &rwr, &bad_rwr, true);
4453         if (ret) {
4454                 WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
4455                 return;
4456         }
4457
4458         handle_drain_completion(cq, &rdrain, dev);
4459 }