[MLX4] fixed bug: send WQE buffer initialization was skipped in case of NDI provider...
hw/mlx4/user/hca/qp.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"
#include "mlx4_debug.h"

#if defined(EVENT_TRACING)
#include "qp.tmh"
#endif

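/*
 * Map an IBAL work request type (plus the IB_SEND_OPT_IMMEDIATE flag)
 * to the corresponding mlx4 WQE opcode.  Unknown types are returned
 * as MLX4_OPCODE_INVALID so the caller can reject the request.
 */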
static enum mlx4_opcode_type __to_opcode(ib_send_wr_t *wr)
{

	enum mlx4_opcode_type opcode = MLX4_OPCODE_INVALID;

	switch (wr->wr_type) {
	case WR_SEND:
		opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ?
					MLX4_OPCODE_SEND_IMM : MLX4_OPCODE_SEND;
		break;
	case WR_RDMA_WRITE:
		opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ?
					MLX4_OPCODE_RDMA_WRITE_IMM : MLX4_OPCODE_RDMA_WRITE;
		break;
	case WR_RDMA_READ:
		opcode = MLX4_OPCODE_RDMA_READ;
		break;
	case WR_COMPARE_SWAP:
		opcode = MLX4_OPCODE_ATOMIC_CS;
		break;
	case WR_FETCH_ADD:
		opcode = MLX4_OPCODE_ATOMIC_FA;
		break;
	default:
		opcode = MLX4_OPCODE_INVALID;
		break;
	}

	return opcode;
}

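/*
 * Return the address of WQE number n in the receive or send queue.
 * WQEs are laid out contiguously in the QP buffer, each one
 * (1 << wqe_shift) bytes in size.
 */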
static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;

	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
		wqe[i] = 0xffffffff;
}

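/* Reset the producer/consumer indices of both work queues. */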
void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head	 = 0;
	qp->sq.tail	 = 0;
	qp->rq.head	 = 0;
	qp->rq.tail	 = 0;
}

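/*
 * Set the ownership bit of every send WQE and stamp it so that a
 * stale or prefetched WQE is never mistaken for a valid one before
 * it has actually been posted.
 */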
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htonl((uint32_t)1 << 31);

		stamp_send_wqe(qp, i);
	}
}

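/*
 * Check whether posting nreq more WQEs would overflow the work queue.
 * The first check is done without any lock; if it indicates overflow,
 * the head/tail distance is re-read under the CQ lock so that a tail
 * update made by the completion-polling path is observed before the
 * overflow is reported.
 */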
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	int cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

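/*
 * Helpers that build individual WQE segments: remote address,
 * atomic operands and the UD datagram (address vector) segment.
 */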
static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = cl_hton64(remote_addr);
	rseg->rkey     = rkey;
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, ib_send_wr_t *wr)
{
	if (wr->wr_type == WR_COMPARE_SWAP) {
		aseg->swap_add = wr->remote_ops.atomic2;
		aseg->compare  = wr->remote_ops.atomic1;
	} else {
		aseg->swap_add = wr->remote_ops.atomic1;
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, ib_send_wr_t *wr)
{
	memcpy(dseg->av, &to_mah((struct ibv_ah *)wr->dgrm.ud.h_av)->av, sizeof (struct mlx4_av));
	dseg->dqpn = wr->dgrm.ud.remote_qp;
	dseg->qkey = wr->dgrm.ud.remote_qkey;
}

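/*
 * __set_data_seg() fills a scatter entry for receive WQEs;
 * set_data_seg() is used for send WQEs and must write the byte count
 * last, behind a memory barrier (see the comment below).
 */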
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, ib_local_ds_t *ds)
{
	dseg->byte_count = cl_hton32(ds->length);
	dseg->lkey       = cl_hton32(ds->lkey);
	dseg->addr       = cl_hton64(ds->vaddr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, ib_local_ds_t *ds)
{
	dseg->lkey       = cl_hton32(ds->lkey);
	dseg->addr       = cl_hton64(ds->vaddr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	wmb();

	dseg->byte_count = cl_hton32(ds->length);
}

/*
 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
 * implementations may use move-string-buffer assembler instructions,
 * which do not guarantee order of copying.
 */
static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
{
#ifdef _WIN64
	uint64_t *d = (uint64_t *)dst;
	uint64_t *s = (uint64_t *)src;

	while (bytecnt > 0) {
		*d++ = *s++;
		*d++ = *s++;
		bytecnt -= 2 * sizeof (uint64_t);
	}
#else
	while (bytecnt > 0) {
		*dst++ = *src++;
		*dst++ = *src++;
		bytecnt -= 2 * sizeof (unsigned long);
	}
#endif
}

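/*
 * Post a chain of send work requests.  For each request, build the
 * control segment, any opcode-specific segments (remote address,
 * atomic, datagram) and the data or inline segments, then hand the
 * WQE to hardware by writing the ownership/opcode word.  The doorbell
 * is rung once after the loop; a single small inline WQE may instead
 * be written directly to the BlueFlame page.
 */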
ib_api_status_t
mlx4_post_send(
	IN		const	void* FUNC_PTR64		h_qp,
	IN			ib_send_wr_t*	const		p_wr,
		OUT		ib_send_wr_t**			bad_wr)
{
	struct ibv_qp *ibqp = (struct ibv_qp *)/*Ptr64ToPtr(*/h_qp/*)*/;
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_context *ctx;
	uint8_t *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
	enum mlx4_opcode_type opcode;
	int ind;
	int nreq;
	int inl = 0;
	ib_api_status_t status = IB_SUCCESS;
	ib_send_wr_t *wr = p_wr;
	int size = 0;
	uint32_t i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */
	if (ibqp->state == IBV_QPS_RESET) {
		status = IB_INVALID_QP_STATE;
		if (bad_wr)
			*bad_wr = wr;
		goto err_state;
	}

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
			status = IB_INSUFFICIENT_RESOURCES;
			if (bad_wr)
				*bad_wr = wr;
			goto out;
		}

		if (wr->num_ds > (uint32_t)qp->sq.max_gs) {
			status = IB_INVALID_MAX_SGE;
			if (bad_wr)
				*bad_wr = wr;
			goto out;
		}

		opcode = __to_opcode(wr);
		if (opcode == MLX4_OPCODE_INVALID) {
			status = IB_INVALID_WR_TYPE;
			if (bad_wr)
				*bad_wr = wr;
			goto out;
		}

		wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		ctrl = (struct mlx4_wqe_ctrl_seg *)wqe;
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->xrcrb_flags =
			(wr->send_opt & IB_SEND_OPT_SIGNALED ?
			 cl_hton32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_opt & IB_SEND_OPT_SOLICITED ?
			 cl_hton32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (opcode == MLX4_OPCODE_SEND_IMM ||
		    opcode == MLX4_OPCODE_RDMA_WRITE_IMM)
			ctrl->imm = wr->immediate_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
#ifdef XRC_SUPPORT
		case IBV_QPT_XRC:
			// TODO: why is the following line commented out?
			//ctrl->xrcrb_flags |= cl_hton32(wr->xrc_remote_srq_num << 8);
			/* fall thru */
#endif
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (opcode) {
			case MLX4_OPCODE_ATOMIC_CS:
			case MLX4_OPCODE_ATOMIC_FA:
				set_raddr_seg((struct mlx4_wqe_raddr_seg *)wqe, wr->remote_ops.vaddr,
								wr->remote_ops.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg((struct mlx4_wqe_atomic_seg *)wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case MLX4_OPCODE_RDMA_READ:
				inl = 1;
				/* fall through */
			case MLX4_OPCODE_RDMA_WRITE:
			case MLX4_OPCODE_RDMA_WRITE_IMM:
				set_raddr_seg((struct mlx4_wqe_raddr_seg *)wqe, wr->remote_ops.vaddr,
								wr->remote_ops.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg((struct mlx4_wqe_datagram_seg *)wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
			break;

		default:
			break;
		}

		if (wr->send_opt & IB_SEND_OPT_INLINE && wr->num_ds) {
			struct mlx4_wqe_inline_seg *seg;
			uint8_t *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = (struct mlx4_wqe_inline_seg *)wqe;
			wqe += sizeof *seg;
			off = (int)(((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1));
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_ds; ++i) {
				addr = (uint8_t *)(uintptr_t)wr->ds_array[i].vaddr;
				len  = wr->ds_array[i].length;
				inl += len;

				if ((uint32_t)inl > (uint32_t)qp->max_inline_data) {
					inl = 0;
					status = IB_INVALID_PARAMETER;
					if (bad_wr)
						*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					wmb(); /* see comment below */
					seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = (struct mlx4_wqe_inline_seg *)wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				wmb();
				seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof * seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = (struct mlx4_wqe_data_seg *)wqe;

			for (i = wr->num_ds; i > 0; --i)
				set_data_seg(seg + i - 1, wr->ds_array + i - 1);

			size += wr->num_ds * (sizeof *seg / 16);
		}

		ctrl->fence_size = (uint8_t)((wr->send_opt & IB_SEND_OPT_FENCE ?
									MLX4_WQE_CTRL_FENCE : 0) | size);

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		wmb();

		ctrl->owner_opcode = htonl(opcode) |
			(ind & qp->sq.wqe_cnt ? htonl((uint32_t)1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->p_next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;

		MLX4_PRINT( TRACE_LEVEL_INFORMATION, MLX4_DBG_QP, ("qpn %#x, wr_id %#I64x, ix %d, solicited %d\n",
			qp->ibv_qp.qp_num, wr->wr_id, ind - 1, wr->send_opt & IB_SEND_OPT_SOLICITED));
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size < ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htonl((qp->sq.head & 0xffff) << 8);
		*(uint32_t *) ctrl->reserved |= qp->doorbell_qpn;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		wmb();

		++qp->sq.head;

		pthread_spin_lock(&ctx->bf_lock);

		mlx4_bf_copy((unsigned long *) (ctx->bf_page + ctx->bf_offset),
						(unsigned long *) ctrl, align(size * 16, 64));

		wc_wmb();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

err_state:
	pthread_spin_unlock(&qp->sq.lock);

	return status;
}

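/*
 * Post a chain of receive work requests.  Each receive WQE is a plain
 * scatter list; when fewer than max_gs entries are used, the list is
 * terminated with an entry carrying the invalid lkey.  The doorbell
 * record is updated once after all requests have been written.
 */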
ib_api_status_t
mlx4_post_recv(
	IN		const	void* FUNC_PTR64		h_qp,
	IN			ib_recv_wr_t*	const		p_wr,
		OUT		ib_recv_wr_t**			bad_wr)
{
	struct mlx4_qp *qp = to_mqp((struct ibv_qp *)/*Ptr64ToPtr(*/h_qp/*)*/);
	struct mlx4_wqe_data_seg *scat;
	ib_api_status_t status = IB_SUCCESS;
	ib_recv_wr_t *wr = p_wr;
	int nreq;
	int ind;
	uint32_t i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */
	if (qp->ibv_qp.state == IBV_QPS_RESET) {
		status = IB_INVALID_QP_STATE;
		if (bad_wr)
			*bad_wr = wr;
		goto err_state;
	}

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
			status = IB_INSUFFICIENT_RESOURCES;
			if (bad_wr)
				*bad_wr = wr;
			goto out;
		}

		if (wr->num_ds > (uint32_t)qp->rq.max_gs) {
			status = IB_INVALID_MAX_SGE;
			if (bad_wr)
				*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_ds; ++i)
			__set_data_seg(scat + i, wr->ds_array + i);

		if (i < (uint32_t)qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htonl(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);

		MLX4_PRINT( TRACE_LEVEL_INFORMATION, MLX4_DBG_QP, ("qpn %#x, wr_id %#I64x, ix %d, \n",
			qp->ibv_qp.qp_num, wr->wr_id, ind - 1));
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		wmb();

		*qp->db = htonl(qp->rq.head & 0xffff);
	}

err_state:
	pthread_spin_unlock(&qp->rq.lock);

	return status;
}

static int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (int)(data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

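/*
 * Compute the send WQE size required by the requested capabilities
 * and round it up to a power of two (at least 64 bytes), recording
 * the result in qp->sq.wqe_shift.
 */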
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	unsigned max_sq_sge;

	max_sq_sge	 = align(cap->max_inline_data +
				 num_inline_segs(cap->max_inline_data, type) *
				 sizeof (struct mlx4_wqe_inline_seg),
				 sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

#ifdef XRC_SUPPORT
	case IBV_QPT_XRC:
#endif
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

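/*
 * Allocate the work request ID arrays and the buffer that holds both
 * work queues, then zero the buffer and initialize send WQE
 * ownership.  The queue with the larger WQE stride is placed first in
 * the buffer.
 */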
int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type, struct mlx4_qp *qp)
{
	UNREFERENCED_PARAMETER(type);

	qp->rq.max_gs	 = cap->max_recv_sge;

	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
	if (!qp->sq.wrid)
		return -1;

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
		(1 << qp->rq.wqe_shift) < qp->rq.max_gs * (int) sizeof (struct mlx4_wqe_data_seg);
		qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (mlx4_alloc_buf(&qp->buf, qp->buf_size, pd->context->page_size)) {
		free(qp->sq.wrid);
		if (qp->rq.wqe_cnt)
			free(qp->rq.wrid);
		return -1;
	}

	memset(qp->buf.buf, 0, qp->buf_size);
	mlx4_qp_init_sq_ownership(qp);

	return 0;
}

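/*
 * Derive the reported send queue capabilities (max SGEs, max
 * outstanding WRs and max inline data) from the WQE size chosen by
 * mlx4_calc_sq_wqe_size().
 */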
void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;
	struct mlx4_context *ctx = to_mctx(qp->ibv_qp.context);

	wqe_size = (1 << qp->sq.wqe_shift) - (int) sizeof (struct mlx4_wqe_ctrl_seg);

	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
	case IBV_QPT_RC:
#ifdef XRC_SUPPORT
	case IBV_QPT_XRC:
#endif
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge    = min(ctx->max_sge, qp->sq.max_gs);
	qp->sq.max_post      = min(ctx->max_qp_wr,
				   qp->sq.wqe_cnt - qp->sq_spare_wqes);
	cap->max_send_wr     = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data  = wqe_size -
		(int) sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

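/*
 * The QP table maps a QP number to its mlx4_qp so that completion
 * processing can locate the QP for a CQE.  It is a two-level table:
 * the top level is indexed by the upper bits of the (masked) QPN and
 * allocated on demand, with a reference count per entry; the second
 * level is indexed by the low bits.
 */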
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
	int ret = 0;

	pthread_mutex_lock(&ctx->qp_table_mutex);

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table) {
			ret = -1;
			goto out;
		}
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;

out:
	pthread_mutex_unlock(&ctx->qp_table_mutex);
	return ret;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	pthread_mutex_lock(&ctx->qp_table_mutex);

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;

	pthread_mutex_unlock(&ctx->qp_table_mutex);
}