33 commit 8e6b03bb781ee403e2aa3de9b9576ef42d919ce8
34 commit c0aa89f0b295dd0c20b2ff2b1d2eca10cdc84f4b
35 Author: Michael S. Tsirkin <mst@dev.mellanox.co.il>
36 Date: Thu Aug 30 15:51:40 2007 +0300
38 IB/mlx4: shrinking WQE
40 ConnectX supports shrinking WQEs, so that a single WR can include
41 multiple units of wqe_shift. This way, WRs can differ in size and
42 do not have to be a power of 2 in size, saving memory and speeding up
43 send WR posting. Unfortunately, if we do this, the wqe_index field in
44 the CQE can't be used to look up the WR ID anymore, so we do this only
45 if selective signalling is off.
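
A minimal sketch (plain C, outside the driver) of what this means for the send
queue index: each WR advances the index by a variable number of wqe_shift-sized
basic blocks, mirroring the DIV_ROUND_UP(size * 16, 1 << wqe_shift) step in the
patch below. The helper name and the WR sizes are made up for illustration.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Basic blocks (of 1 << wqe_shift bytes) consumed by one WR of 'bytes' bytes. */
static unsigned wqe_units(unsigned bytes, unsigned wqe_shift)
{
	return DIV_ROUND_UP(bytes, 1u << wqe_shift);
}

int main(void)
{
	unsigned wqe_shift = 6;			/* 64-byte basic blocks */
	unsigned wr_bytes[] = { 64, 192, 128 };	/* hypothetical WR sizes */
	unsigned ind = 0;

	for (unsigned i = 0; i < 3; ++i) {
		printf("WR %u starts at unit %u and uses %u unit(s)\n",
		       i, ind, wqe_units(wr_bytes[i], wqe_shift));
		ind += wqe_units(wr_bytes[i], wqe_shift);
	}

	/*
	 * Because 'ind' no longer advances by exactly one unit per WR, the
	 * wqe_index reported in a CQE cannot be used to index wrid[] directly,
	 * hence shrinking is only done when selective signalling is off.
	 */
	return 0;
}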
47 Further, on 32-bit platforms, we can't use vmap to make
48 the QP buffer virtually contiguous. Thus we have to use
49 constant-sized WRs to make sure a WR is always fully within
50 a single page-sized chunk.
52 Finally, we use a WR with the NOP opcode to avoid wrap-around
53 in the middle of a WR. We set the NoErrorCompletion (NEC) bit to avoid
54 getting completions with error for NOP WRs. Since NEC is only supported
55 starting with firmware 2.2.232, we use constant-sized WRs
56 for older firmware. And since MLX QPs only support SEND, we use
57 constant-sized WRs for them as well.
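
For the firmware cutoff, the patch packs the version into a single 64-bit value
and compares it against 2.2.232. A small sketch of that check, using the same
packing as the mlx4_fw_ver() helper added below; the sample device version is
invented for illustration.

#include <stdio.h>
#include <stdint.h>

static uint64_t mlx4_fw_ver(uint64_t major, uint64_t minor, uint64_t subminor)
{
	return (major << 32) | (minor << 16) | subminor;
}

#define MLX4_FW_VER_WQE_CTRL_NEC	mlx4_fw_ver(2, 2, 232)

int main(void)
{
	uint64_t fw_ver = mlx4_fw_ver(2, 3, 0);	/* hypothetical device firmware */

	if (fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC)
		printf("NEC supported: NOP padding WQEs can be used\n");
	else
		printf("NEC not supported: fall back to constant-sized WRs\n");
	return 0;
}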
59 When stamping during NOP posting, do the stamping only after setting
60 the NOP WQE's valid bit.
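
A hedged ordering sketch (userspace C, with simplified stand-in structures
rather than the real mlx4 descriptors): the NOP descriptor body is written
first, a write barrier is issued, the ownership/valid bit is set, and only then
is the spare area stamped; __sync_synchronize() stands in for the kernel's wmb().

#include <stdint.h>

struct fake_ctrl_seg {
	uint32_t srcrb_flags;
	uint32_t fence_size;
	volatile uint32_t owner_opcode;	/* valid/ownership bit lives here */
};

/* Stamp the first word of every 64-byte chunk, as stamp_send_wqe() does. */
static void stamp_spare(uint32_t *wqe, unsigned words)
{
	for (unsigned i = 0; i < words; i += 16)
		wqe[i] = 0xffffffff;
}

/* Write the descriptor, publish the valid bit, and only then stamp. */
static void post_nop_sketch(struct fake_ctrl_seg *ctrl, uint32_t *spare,
			    unsigned words, uint32_t owner_opcode)
{
	ctrl->srcrb_flags = 0;
	ctrl->fence_size = 1;		/* descriptor body first */
	__sync_synchronize();		/* stand-in for wmb() */
	ctrl->owner_opcode = owner_opcode;
	stamp_spare(spare, words);	/* stamping happens after the valid bit */
}

int main(void)
{
	struct fake_ctrl_seg ctrl;
	uint32_t spare[32] = { 0 };

	post_nop_sketch(&ctrl, spare, 32, 1u << 31);
	return 0;
}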
62 Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
63 Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
65 Index: ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/cq.c
66 ===================================================================
67 --- ofed_kernel-2.6.16_sles10.orig/drivers/infiniband/hw/mlx4/cq.c 2008-01-22 13:19:40.000000000 +0200
68 +++ ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/cq.c 2008-01-22 13:20:13.000000000 +0200
69 @@ -353,6 +353,12 @@ static int mlx4_ib_poll_one(struct mlx4_
70 is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
71 MLX4_CQE_OPCODE_ERROR;
73 + if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
75 + printk(KERN_WARNING "Completion for NOP opcode detected!\n");
79 if ((be32_to_cpu(cqe->my_qpn) & (1 << 23)) && !is_send) {
81 * We do not have to take the XRC SRQ table lock here,
82 @@ -391,8 +397,10 @@ static int mlx4_ib_poll_one(struct mlx4_
86 - wqe_ctr = be16_to_cpu(cqe->wqe_index);
87 - wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
88 + if (!(*cur_qp)->sq_signal_bits) {
89 + wqe_ctr = be16_to_cpu(cqe->wqe_index);
90 + wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
92 wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
94 } else if (is_xrc_recv) {
95 Index: ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/mlx4_ib.h
96 ===================================================================
97 --- ofed_kernel-2.6.16_sles10.orig/drivers/infiniband/hw/mlx4/mlx4_ib.h 2008-01-22 13:19:40.000000000 +0200
98 +++ ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/mlx4_ib.h 2008-01-22 13:20:13.000000000 +0200
99 @@ -136,6 +136,8 @@ struct mlx4_ib_qp {
102 __be32 sq_signal_bits;
103 + unsigned sq_next_wqe;
104 + int sq_max_wqes_per_wr;
106 struct mlx4_ib_wq sq;
108 Index: ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/qp.c
109 ===================================================================
110 --- ofed_kernel-2.6.16_sles10.orig/drivers/infiniband/hw/mlx4/qp.c 2008-01-22 13:19:40.000000000 +0200
111 +++ ofed_kernel-2.6.16_sles10/drivers/infiniband/hw/mlx4/qp.c 2008-01-22 13:31:45.000000000 +0200
116 +#include <linux/log2.h>
117 #include <rdma/ib_cache.h>
118 #include <rdma/ib_pack.h>
120 @@ -97,7 +98,7 @@ static int is_qp0(struct mlx4_ib_dev *de
122 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
124 - if (qp->buf.nbufs == 1)
125 + if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
126 return qp->buf.u.direct.buf + offset;
128 return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
129 @@ -116,16 +117,88 @@ static void *get_send_wqe(struct mlx4_ib
132 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
133 - * first four bytes of every 64 byte chunk with 0xffffffff, except for
134 - * the very first chunk of the WQE.
135 + * first four bytes of every 64 byte chunk with
 136 + * 0x7FFFFFFF | (invalid_ownership_value << 31).
 138 + * When the max WR size is less than or equal to the WQE size,
 139 + * as an optimization, we can stamp the WQE with 0xffffffff
 140 + * and skip the very first chunk of the WQE.
142 -static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
143 +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
145 - u32 *wqe = get_send_wqe(qp, n);
153 + s = roundup(size, 1 << qp->sq.wqe_shift);
154 + if (qp->sq_max_wqes_per_wr > 1) {
155 + for (i = 0; i < s; i += 64) {
156 + ind = (i >> qp->sq.wqe_shift) + n;
157 + stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
158 + cpu_to_be32(0xffffffff);
159 + buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
160 + wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
164 + buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
165 + for (i = 64; i < s; i += 64) {
172 +static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
174 + struct mlx4_wqe_ctrl_seg *ctrl;
175 + struct mlx4_wqe_inline_seg *inl;
179 + ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
180 + s = sizeof(struct mlx4_wqe_ctrl_seg);
182 + if (qp->ibqp.qp_type == IB_QPT_UD) {
183 + struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
184 + struct mlx4_av *av = (struct mlx4_av *)dgram->av;
185 + memset(dgram, 0, sizeof *dgram);
186 + av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
187 + s += sizeof(struct mlx4_wqe_datagram_seg);
190 + /* Pad the remainder of the WQE with an inline data segment. */
193 + inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
195 + ctrl->srcrb_flags = 0;
196 + ctrl->fence_size = size / 16;
198 + * Make sure descriptor is fully written before
199 + * setting ownership bit (because HW can start
200 + * executing as soon as we do).
204 - for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
205 - wqe[i] = 0xffffffff;
206 + ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
207 + (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
209 + stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
 212 +/* Post NOP WQE to prevent wrap-around in the middle of a WR */
213 +static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
215 + unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
216 + if (unlikely(s < qp->sq_max_wqes_per_wr)) {
217 + post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
223 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
224 @@ -258,6 +331,7 @@ static int set_kernel_sq_size(struct mlx
226 struct ib_qp_cap *cap = &init_attr->cap;
227 enum ib_qp_type type = init_attr->qp_type;
231 /* Sanity check SQ size before proceeding */
232 @@ -281,22 +355,69 @@ static int set_kernel_sq_size(struct mlx
236 - qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
237 - sizeof (struct mlx4_wqe_data_seg) +
239 - cap->max_inline_data +
240 - sizeof (struct mlx4_wqe_inline_seg)) +
241 - send_wqe_overhead(type)));
242 - qp->sq.wqe_shift = max(MLX4_IB_SQ_MIN_WQE_SHIFT, qp->sq.wqe_shift);
243 - qp->sq.max_gs = ((1 << qp->sq.wqe_shift) -reserve - send_wqe_overhead(type)) /
244 - sizeof (struct mlx4_wqe_data_seg);
245 + s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg) + reserve,
246 + cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
247 + send_wqe_overhead(type);
250 - * We need to leave 2 KB + 1 WQE of headroom in the SQ to
251 - * allow HW to prefetch.
 252 + * Hermon supports shrinking WQEs, so that a single WR can include
 253 + * multiple units of wqe_shift. This way, WRs can differ in size and
 254 + * do not have to be a power of 2 in size, saving memory and speeding up
 255 + * send WR posting. Unfortunately, if we do this, the wqe_index field in
 256 + * the CQE can't be used to look up the WR ID anymore, so we do this only
 257 + * if selective signalling is off.
259 + * Further, on 32-bit platforms, we can't use vmap to make
 260 + * the QP buffer virtually contiguous. Thus we have to use
261 + * constant-sized WRs to make sure a WR is always fully within
262 + * a single page-sized chunk.
 264 + * Finally, we use the NOP opcode to avoid wrap-around in the middle of a WR.
 265 + * We set the NEC bit to avoid getting completions with error for NOP WRs.
266 + * Since NEC is only supported starting with firmware 2.2.232,
267 + * we use constant-sized WRs for older firmware.
 269 + * And since MLX QPs only support SEND, we use constant-sized WRs in this
272 + * We look for the smallest value of wqe_shift such that the resulting
273 + * number of wqes does not exceed device capabilities.
 275 + * We set WQE size to at least 64 bytes; this way, stamping invalidates each WQE.
277 - qp->sq_spare_wqes = MLX4_IB_SQ_HEADROOM(qp->sq.wqe_shift);
278 - qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
279 + if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
280 + qp->sq_signal_bits && BITS_PER_LONG == 64 &&
281 + type != IB_QPT_SMI && type != IB_QPT_GSI)
282 + qp->sq.wqe_shift = ilog2(64);
284 + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
287 + if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
290 + qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
293 + * We need to leave 2 KB + 1 WR of headroom in the SQ to
294 + * allow HW to prefetch.
296 + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
297 + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
298 + qp->sq_max_wqes_per_wr +
299 + qp->sq_spare_wqes);
301 + if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
304 + if (qp->sq_max_wqes_per_wr <= 1)
307 + ++qp->sq.wqe_shift;
310 + qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) - reserve -
311 + send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
313 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
314 (qp->sq.wqe_cnt << qp->sq.wqe_shift);
315 @@ -309,8 +430,7 @@ static int set_kernel_sq_size(struct mlx
318 cap->max_send_wr = qp->sq.max_post =
319 - min(qp->sq.wqe_cnt - qp->sq_spare_wqes,
320 - dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE);
321 + (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
 322 cap->max_send_sge = min(qp->sq.max_gs,
323 min(dev->dev->caps.max_sq_sg,
324 dev->dev->caps.max_rq_sg));
325 @@ -360,6 +480,12 @@ static int create_qp_common(struct mlx4_
329 + qp->sq_next_wqe = 0;
331 + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
332 + qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
334 + qp->sq_signal_bits = 0;
336 err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
337 !!init_attr->srq || !!init_attr->xrc_domain , qp);
338 @@ -454,11 +580,6 @@ static int create_qp_common(struct mlx4_
340 qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
342 - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
343 - qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
345 - qp->sq_signal_bits = 0;
347 qp->mqp.event = mlx4_ib_qp_event;
350 @@ -969,7 +1090,7 @@ static int __mlx4_ib_modify_qp(struct ib
351 ctrl = get_send_wqe(qp, i);
352 ctrl->owner_opcode = cpu_to_be32(1 << 31);
354 - stamp_send_wqe(qp, i);
355 + stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
359 @@ -1022,6 +1143,7 @@ static int __mlx4_ib_modify_qp(struct ib
363 + qp->sq_next_wqe = 0;
364 if (!ibqp->srq && ibqp->qp_type != IB_QPT_XRC)
367 @@ -1356,13 +1478,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
374 + int uninitialized_var(stamp);
375 + int uninitialized_var(size);
378 spin_lock_irqsave(&qp->sq.lock, flags);
381 + ind = qp->sq_next_wqe;
383 for (nreq = 0; wr; ++nreq, wr = wr->next) {
384 if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
385 @@ -1378,7 +1501,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
388 ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
389 - qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
390 + qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
393 (wr->send_flags & IB_SEND_SIGNALED ?
394 @@ -1511,16 +1634,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
395 ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
396 (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
398 + stamp = ind + qp->sq_spare_wqes;
399 + ind += DIV_ROUND_UP(size * 16, 1 << qp->sq.wqe_shift);
402 * We can improve latency by not stamping the last
403 * send queue WQE until after ringing the doorbell, so
404 * only stamp here if there are still more WQEs to post.
 406 + * Same optimization applies to padding with a NOP WQE
 407 + * in case of WQE shrinking (used to prevent wrap-around
 408 + * in the middle of a WR).
411 - stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
412 - (qp->sq.wqe_cnt - 1));
414 + stamp_send_wqe(qp, stamp, size * 16);
415 + ind = pad_wraparound(qp, ind);
422 @@ -1542,8 +1672,10 @@ out:
426 - stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
427 - (qp->sq.wqe_cnt - 1));
428 + stamp_send_wqe(qp, stamp, size * 16);
430 + ind = pad_wraparound(qp, ind);
431 + qp->sq_next_wqe = ind;
434 spin_unlock_irqrestore(&qp->sq.lock, flags);
435 Index: ofed_kernel-2.6.16_sles10/drivers/net/mlx4/alloc.c
436 ===================================================================
437 --- ofed_kernel-2.6.16_sles10.orig/drivers/net/mlx4/alloc.c 2008-01-22 13:19:40.000000000 +0200
438 +++ ofed_kernel-2.6.16_sles10/drivers/net/mlx4/alloc.c 2008-01-22 13:20:13.000000000 +0200
439 @@ -152,6 +152,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev,
441 memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
444 + if (BITS_PER_LONG == 64) {
445 + struct page **pages;
446 + pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
449 + for (i = 0; i < buf->nbufs; ++i)
450 + pages[i] = virt_to_page(buf->u.page_list[i].buf);
451 + buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
453 + if (!buf->u.direct.buf)
459 @@ -171,6 +184,9 @@ void mlx4_buf_free(struct mlx4_dev *dev,
460 dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
463 + if (BITS_PER_LONG == 64)
464 + vunmap(buf->u.direct.buf);
466 for (i = 0; i < buf->nbufs; ++i)
467 if (buf->u.page_list[i].buf)
468 dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
469 Index: ofed_kernel-2.6.16_sles10/include/linux/mlx4/device.h
470 ===================================================================
471 --- ofed_kernel-2.6.16_sles10.orig/include/linux/mlx4/device.h 2008-01-22 13:19:40.000000000 +0200
472 +++ ofed_kernel-2.6.16_sles10/include/linux/mlx4/device.h 2008-01-22 13:20:13.000000000 +0200
473 @@ -134,6 +134,11 @@ enum {
474 MLX4_STAT_RATE_OFFSET = 5
477 +static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
479 + return (major << 32) | (minor << 16) | subminor;
485 @@ -193,7 +198,7 @@ struct mlx4_buf_list {
491 struct mlx4_buf_list direct;
492 struct mlx4_buf_list *page_list;
494 Index: ofed_kernel-2.6.16_sles10/include/linux/mlx4/qp.h
495 ===================================================================
496 --- ofed_kernel-2.6.16_sles10.orig/include/linux/mlx4/qp.h 2008-01-22 13:19:40.000000000 +0200
497 +++ ofed_kernel-2.6.16_sles10/include/linux/mlx4/qp.h 2008-01-22 13:20:13.000000000 +0200
498 @@ -155,7 +155,11 @@ struct mlx4_qp_context {
502 +/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
503 +#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
506 + MLX4_WQE_CTRL_NEC = 1 << 29,
507 MLX4_WQE_CTRL_FENCE = 1 << 6,
508 MLX4_WQE_CTRL_CQ_UPDATE = 3 << 2,
509 MLX4_WQE_CTRL_SOLICITED = 1 << 1,