cq: allow polling with an array of wc's
author: shefty <shefty@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Wed, 23 Jul 2008 20:20:52 +0000 (20:20 +0000)
committer: shefty <shefty@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Wed, 23 Jul 2008 20:20:52 +0000 (20:20 +0000)
Allow polling for a list of work completions using an array of
completion structures, rather than a linked list.  This avoids needing
to walk the list to link the structures together before calling poll,
which is a fast path operation.

A new completion structure is added that provides the qp_context
associated with a completion.  This avoids changes to the existing
ULPs, while taking advantage of the underlying UVP capabilities.
Providing the qp_context is useful when dealing with SRQ.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
git-svn-id: svn://openib.tc.cornell.edu/gen1/trunk@1437 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86

hw/mlx4/user/hca/cq.c
hw/mlx4/user/hca/mlx4.c
hw/mlx4/user/hca/mlx4.h
hw/mthca/user/mlnx_ual_osbypass.c
hw/mthca/user/mlnx_uvp.h
hw/mthca/user/mlnx_uvp_cq.c
hw/mthca/user/mlnx_uvp_verbs.h
inc/user/iba/ib_uvp.h

index 7c8b9f8..a95e183 100644 (file)
@@ -344,8 +344,31 @@ static int mlx4_poll_one(struct mlx4_cq *cq, struct mlx4_qp **cur_qp, ib_wc_t *w
        return CQ_OK;\r
 }\r
 \r
+int mlx4_poll_cq_array(const void* h_cq,\r
+                       const int num_entries, uvp_wc_t* const wc)\r
+{\r
+       struct mlx4_cq *cq = to_mcq((struct ibv_cq *)/*Ptr64ToPtr(*/ h_cq /*)*/);\r
+       struct mlx4_qp *qp;\r
+       int ne;\r
+       int err = CQ_EMPTY;\r
+\r
+       pthread_spin_lock(&cq->lock);\r
+       for (ne = 0; ne < num_entries; ne++) {\r
+               err = mlx4_poll_one(cq, &qp, (ib_wc_t *) &wc[ne]);\r
+               if (err != CQ_OK)\r
+                       break;\r
+               wc[ne].qp_context = qp->ibv_qp.qp_context;\r
+       }\r
+\r
+       if (ne)\r
+               update_cons_index(cq);\r
+       pthread_spin_unlock(&cq->lock);\r
+\r
+       return (err == CQ_OK || err == CQ_EMPTY) ? ne : err;\r
+}\r
+\r
 ib_api_status_t\r
-mlx4_poll_cq(\r
+mlx4_poll_cq_list(\r
        IN              const   void*                                           h_cq,\r
        IN      OUT                     ib_wc_t**       const                   pp_free_wclist,\r
                OUT                     ib_wc_t**       const                   pp_done_wclist)\r
index bb5c71e..1e9c203 100644 (file)
@@ -289,14 +289,15 @@ uvp_get_interface (
        /*\r
         * OS bypass (send, receive, poll/notify cq)\r
         */\r
-       p_uvp->post_send        = mlx4_post_send;\r
-       p_uvp->post_recv        = mlx4_post_recv;\r
+       p_uvp->post_send                = mlx4_post_send;\r
+       p_uvp->post_recv                = mlx4_post_recv;\r
        p_uvp->post_srq_recv    = mlx4_post_srq_recv;\r
-       p_uvp->poll_cq          = mlx4_poll_cq;\r
-       p_uvp->rearm_cq         = mlx4_arm_cq;\r
-       p_uvp->rearm_n_cq       = NULL; /* __enable_ncomp_cq_notify: Not implemented */;\r
-       p_uvp->peek_cq          = NULL; /* __peek_cq: Not implemented */\r
-       p_uvp->bind_mw          = NULL; /* __bind_mw: Not implemented */\r
+       p_uvp->poll_cq                  = mlx4_poll_cq_list;\r
+       p_uvp->poll_cq_array    = mlx4_poll_cq_array;\r
+       p_uvp->rearm_cq                 = mlx4_arm_cq;\r
+       p_uvp->rearm_n_cq               = NULL;\r
+       p_uvp->peek_cq                  = NULL;\r
+       p_uvp->bind_mw                  = NULL;\r
 \r
 #ifdef XRC_SUPPORT\r
        /*\r
index 9ba5dcc..e7f70bb 100644 (file)
@@ -303,7 +303,9 @@ void mlx4_free_buf(struct mlx4_buf *buf);
 uint32_t *mlx4_alloc_db(struct mlx4_context *context, enum mlx4_db_type type);\r
 void mlx4_free_db(struct mlx4_context *context, enum mlx4_db_type type, uint32_t *db);\r
 \r
-ib_api_status_t mlx4_poll_cq(const void* h_cq,\r
+int mlx4_poll_cq_array(const void* h_cq,\r
+                       const int num_entries, uvp_wc_t* const wc);\r
+ib_api_status_t mlx4_poll_cq_list(const void* h_cq,\r
                        ib_wc_t** const pp_free_wclist,\r
                        ib_wc_t** const pp_done_wclist);\r
 ib_api_status_t mlx4_arm_cq(const      void* h_cq, const       boolean_t solicited);\r
index 37262f7..02a5cad 100644 (file)
@@ -177,6 +177,25 @@ err_invalid_params:
 }\r
 \r
 \r
+static int\r
+__poll_cq_array (\r
+       IN              const   void*                                           h_cq,\r
+       IN              const   int                                                     num_entries,\r
+       IN      OUT                     uvp_wc_t*       const                   wc )\r
+{\r
+       int ne;\r
+       struct mthca_cq *cq = (struct mthca_cq *) h_cq;\r
+\r
+       UVP_ENTER(UVP_DBG_CQ);\r
+       CL_ASSERT (cq);\r
+\r
+       ne = cq->ibv_cq.context->ops.poll_cq(&cq->ibv_cq, num_entries, wc);\r
+\r
+       UVP_EXIT(UVP_DBG_CQ);\r
+       return ne;\r
+}\r
+\r
+\r
 static ib_api_status_t\r
 __enable_cq_notify (\r
        IN              const   void*                                           h_cq,\r
@@ -220,28 +239,16 @@ void
 mlnx_get_osbypass_interface (\r
     IN OUT     uvp_interface_t         *p_uvp )\r
 {\r
-\r
     CL_ASSERT(p_uvp);\r
 \r
-    /*\r
-     * Work Request Processing Verbs\r
-     * Should the types be same as Verbs?\r
-     */\r
     p_uvp->post_send = __post_send;\r
     p_uvp->post_recv = __post_recv;\r
     p_uvp->post_srq_recv = __post_srq_recv;\r
-\r
-    /*\r
-     * Completion Processing and \r
-     * Completion Notification Request Verbs.\r
-     * Should the types be same as Verbs?\r
-     */\r
     p_uvp->poll_cq  = __poll_cq;\r
     p_uvp->rearm_cq = __enable_cq_notify;\r
-    p_uvp->rearm_n_cq = NULL; /* __enable_ncomp_cq_notify: Not implemented */;\r
-    p_uvp->peek_cq  = NULL; /* __peek_cq: Not implemented */\r
-\r
-    /* Memory window bind */\r
-    p_uvp->bind_mw = NULL; /* __bind_mw: Not implemented */\r
+    p_uvp->rearm_n_cq = NULL;\r
+    p_uvp->peek_cq  = NULL;\r
+    p_uvp->bind_mw = NULL;\r
+       p_uvp->poll_cq_array = __poll_cq_array;\r
 }\r
 \r
index 8c6181c..cc7cb05 100644 (file)
@@ -278,7 +278,7 @@ struct ibv_cq *mthca_create_cq_pre(struct ibv_context *context, int *cqe,
 struct ibv_cq *mthca_create_cq_post(struct ibv_context *context, \r
                                 struct ibv_create_cq_resp *resp);\r
 int mthca_destroy_cq(struct ibv_cq *cq);\r
-int mthca_poll_cq(struct ibv_cq *cq, int ne, struct _ib_wc *wc);\r
+int mthca_poll_cq(struct ibv_cq *cq, int ne, struct _uvp_wc *wc);\r
 int mthca_poll_cq_list(struct ibv_cq *ibcq, \r
        struct _ib_wc** const pp_free_wclist,\r
        struct _ib_wc** const pp_done_wclist );\r
index 3d592c0..bfc0add 100644 (file)
-/*
- * Copyright (c) 2005 Topspin Communications.  All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * $Id$
- */
-
-#include <mt_l2w.h>
-#include <opcode.h>
-#include "mlnx_uvp.h"
-#include "mlnx_uvp_doorbell.h"
-
-#if defined(EVENT_TRACING)
-#include "mlnx_uvp_cq.tmh"
-#endif
-
-
-enum {
-       MTHCA_CQ_DOORBELL       = 0x20
-};
-
-enum {
-       CQ_OK           =  0,
-       CQ_EMPTY        = -1,
-       CQ_POLL_ERR     = -2
-};
-
-#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)
-#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)
-#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)
-#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)
-#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
-
-#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)
-#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)
-#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
-
-enum {
-       MTHCA_CQ_ENTRY_OWNER_SW     = 0x00,
-       MTHCA_CQ_ENTRY_OWNER_HW     = 0x80,
-       MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
-};
-
-enum {
-       SYNDROME_LOCAL_LENGTH_ERR        = 0x01,
-       SYNDROME_LOCAL_QP_OP_ERR         = 0x02,
-       SYNDROME_LOCAL_EEC_OP_ERR        = 0x03,
-       SYNDROME_LOCAL_PROT_ERR          = 0x04,
-       SYNDROME_WR_FLUSH_ERR            = 0x05,
-       SYNDROME_MW_BIND_ERR             = 0x06,
-       SYNDROME_BAD_RESP_ERR            = 0x10,
-       SYNDROME_LOCAL_ACCESS_ERR        = 0x11,
-       SYNDROME_REMOTE_INVAL_REQ_ERR    = 0x12,
-       SYNDROME_REMOTE_ACCESS_ERR       = 0x13,
-       SYNDROME_REMOTE_OP_ERR           = 0x14,
-       SYNDROME_RETRY_EXC_ERR           = 0x15,
-       SYNDROME_RNR_RETRY_EXC_ERR       = 0x16,
-       SYNDROME_LOCAL_RDD_VIOL_ERR      = 0x20,
-       SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
-       SYNDROME_REMOTE_ABORTED_ERR      = 0x22,
-       SYNDROME_INVAL_EECN_ERR          = 0x23,
-       SYNDROME_INVAL_EEC_STATE_ERR     = 0x24
-};
-
-struct mthca_cqe {
-       uint32_t        my_qpn;
-       uint32_t        my_ee;
-       uint32_t        rqpn;
-       uint16_t        sl_g_mlpath;
-       uint16_t        rlid;
-       uint32_t        imm_etype_pkey_eec;
-       uint32_t        byte_cnt;
-       uint32_t        wqe;
-       uint8_t         opcode;
-       uint8_t         is_send;
-       uint8_t         reserved;
-       uint8_t         owner;
-};
-
-struct mthca_err_cqe {
-       uint32_t        my_qpn;
-       uint32_t        reserved1[3];
-       uint8_t         syndrome;
-       uint8_t         vendor_err;
-       uint16_t        db_cnt;
-       uint32_t        reserved2;
-       uint32_t        wqe;
-       uint8_t         opcode;
-       uint8_t         reserved3[2];
-       uint8_t         owner;
-};
-
-static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
-{
-       return (struct mthca_cqe *)((uint8_t*)cq->buf + entry * MTHCA_CQ_ENTRY_SIZE);
-}
-
-static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)
-{
-       struct mthca_cqe *cqe = get_cqe(cq, i);
-       return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
-}
-
-static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
-{
-       return cqe_sw(cq, cq->cons_index & cq->ibv_cq.cqe);
-}
-
-static inline void set_cqe_hw(struct mthca_cqe *cqe)
-{
-       cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
-}
-
-/*
- * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
- * should be correct before calling update_cons_index().
- */
-static inline void update_cons_index(struct mthca_cq *cq, int incr)
-{
-       uint32_t doorbell[2];
-
-       if (mthca_is_memfree(cq->ibv_cq.context)) {
-               *cq->set_ci_db = cl_hton32(cq->cons_index);
-               mb();
-       } else {
-               doorbell[0] = cl_hton32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);
-               doorbell[1] = cl_hton32(incr - 1);
-
-               mthca_write64(doorbell, to_mctx(cq->ibv_cq.context), MTHCA_CQ_DOORBELL);
-       }
-}
-
-
-static void dump_cqe(uint32_t print_lvl, void *cqe_ptr)
-{
-       uint32_t *cqe = cqe_ptr;
-       int i;
-       (void) cqe;     /* avoid warning if mthca_dbg compiled away... */
-
-       UVP_PRINT(print_lvl,UVP_DBG_CQ,("CQE content \n"));
-       UVP_PRINT(print_lvl,UVP_DBG_CQ,(" [%2x] %08x %08x %08x %08x \n",0
-               , cl_ntoh32(cqe[0]), cl_ntoh32(cqe[1]), cl_ntoh32(cqe[2]), cl_ntoh32(cqe[3])));
-       UVP_PRINT(print_lvl,UVP_DBG_CQ,(" [%2x] %08x %08x %08x %08x\n",16
-               , cl_ntoh32(cqe[4]), cl_ntoh32(cqe[5]), cl_ntoh32(cqe[6]), cl_ntoh32(cqe[7])));
-       
-}
-
-static int handle_error_cqe(struct mthca_cq *cq,
-                           struct mthca_qp *qp, int wqe_index, int is_send,
-                           struct mthca_err_cqe *cqe,
-                           struct _ib_wc *entry, int *free_cqe)
-{
-       int err;
-       int dbd;
-       uint32_t new_wqe;
-
-       if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
-               UVP_PRINT(TRACE_LEVEL_ERROR , UVP_DBG_CQ,("local QP operation err "
-                      "(QPN %06x, WQE @ %08x, CQN %06x, index %d, vendor_err %d)\n",
-                      cl_ntoh32(cqe->my_qpn), cl_ntoh32(cqe->wqe),
-                      cq->cqn, cq->cons_index, cqe->vendor_err));
-               dump_cqe(TRACE_LEVEL_VERBOSE, cqe);
-       }
-
-       /*
-        * For completions in error, only work request ID, status, vendor error
-        * (and freed resource count for RD) have to be set.
-        */
-       switch (cqe->syndrome) {
-       case SYNDROME_LOCAL_LENGTH_ERR:
-               entry->status = IB_WCS_LOCAL_LEN_ERR;
-               break;
-       case SYNDROME_LOCAL_QP_OP_ERR:
-               entry->status = IB_WCS_LOCAL_OP_ERR;
-               break;
-       case SYNDROME_LOCAL_PROT_ERR:
-               entry->status = IB_WCS_LOCAL_PROTECTION_ERR;
-               break;
-       case SYNDROME_WR_FLUSH_ERR:
-               entry->status = IB_WCS_WR_FLUSHED_ERR;
-               break;
-       case SYNDROME_MW_BIND_ERR:
-               entry->status = IB_WCS_MEM_WINDOW_BIND_ERR;
-               break;
-       case SYNDROME_BAD_RESP_ERR:
-               entry->status = IB_WCS_BAD_RESP_ERR;
-               break;
-       case SYNDROME_LOCAL_ACCESS_ERR:
-               entry->status = IB_WCS_LOCAL_ACCESS_ERR;
-               break;
-       case SYNDROME_REMOTE_INVAL_REQ_ERR:
-               entry->status = IB_WCS_REM_INVALID_REQ_ERR;
-               break;
-       case SYNDROME_REMOTE_ACCESS_ERR:
-               entry->status = IB_WCS_REM_ACCESS_ERR;
-               break;
-       case SYNDROME_REMOTE_OP_ERR:
-               entry->status = IB_WCS_REM_OP_ERR;
-               break;
-       case SYNDROME_RETRY_EXC_ERR:
-               entry->status = IB_WCS_TIMEOUT_RETRY_ERR;
-               break;
-       case SYNDROME_RNR_RETRY_EXC_ERR:
-               entry->status = IB_WCS_RNR_RETRY_ERR;
-               break;
-       case SYNDROME_LOCAL_EEC_OP_ERR:
-       case SYNDROME_LOCAL_RDD_VIOL_ERR:
-       case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
-       case SYNDROME_REMOTE_ABORTED_ERR:
-       case SYNDROME_INVAL_EECN_ERR:
-       case SYNDROME_INVAL_EEC_STATE_ERR:
-       default:
-               entry->status = IB_WCS_GENERAL_ERR;
-               break;
-       }
-
-       entry->vendor_specific = cqe->vendor_err;
-       
-       /*
-        * Mem-free HCAs always generate one CQE per WQE, even in the
-        * error case, so we don't have to check the doorbell count, etc.
-        */
-       if (mthca_is_memfree(cq->ibv_cq.context))
-               return 0;
-
-       err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);
-       if (err)
-               return err;
-
-       /*
-        * If we're at the end of the WQE chain, or we've used up our
-        * doorbell count, free the CQE.  Otherwise just update it for
-        * the next poll operation.
-        * 
-        * This doesn't apply to mem-free HCAs, which never use the
-        * doorbell count field.  In that case we always free the CQE.
-        */
-       if (mthca_is_memfree(cq->ibv_cq.context) ||
-           !(new_wqe & cl_hton32(0x3f)) || (!cqe->db_cnt && dbd))
-               return 0;
-
-       cqe->db_cnt   = cl_hton16(cl_ntoh16(cqe->db_cnt) - dbd);
-       cqe->wqe      = new_wqe;
-       cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
-
-       *free_cqe = 0;
-
-       return 0;
-}
-
-static inline int mthca_poll_one(struct mthca_cq *cq,
-                                struct mthca_qp **cur_qp,
-                                int *freed,
-                                struct _ib_wc *entry)
-{
-       struct mthca_wq *wq;
-       struct mthca_cqe *cqe;
-       uint32_t qpn;
-       int wqe_index;
-       int is_error;
-       int is_send;
-       int free_cqe = 1;
-       int err = 0;
-
-       UVP_ENTER(UVP_DBG_CQ);
-       
-       cqe = next_cqe_sw(cq);
-       if (!cqe)
-               return -EAGAIN;
-
-       /*
-        * Make sure we read CQ entry contents after we've checked the
-        * ownership bit.
-        */
-       rmb();
-
-       { // debug print
-               UVP_PRINT(TRACE_LEVEL_VERBOSE,UVP_DBG_CQ,("%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
-                         cq->cqn, cq->cons_index, cl_ntoh32(cqe->my_qpn),
-                         cl_ntoh32(cqe->wqe)));
-               dump_cqe(TRACE_LEVEL_VERBOSE,cqe);
-       }
-       
-       qpn = cl_ntoh32(cqe->my_qpn);
-
-       is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
-               MTHCA_ERROR_CQE_OPCODE_MASK;
-       is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
-
-       if (!*cur_qp || cl_ntoh32(cqe->my_qpn) != (*cur_qp)->ibv_qp.qp_num) {
-               /*
-                * We do not have to take the QP table lock here,
-                * because CQs will be locked while QPs are removed
-                * from the table.
-                */
-               *cur_qp = mthca_find_qp(to_mctx(cq->ibv_cq.context), cl_ntoh32(cqe->my_qpn));
-               if (!*cur_qp) {
-                       UVP_PRINT(TRACE_LEVEL_WARNING,UVP_DBG_CQ, ("CQ entry for unknown QP %06x\n",
-                                  cl_ntoh32(cqe->my_qpn) & 0xffffff));
-                       err = -EINVAL;
-                       goto out;
-               }
-       }
-
-       if (is_send) {
-               wq = &(*cur_qp)->sq;
-               wqe_index = ((cl_ntoh32(cqe->wqe) - (*cur_qp)->send_wqe_offset) >> wq->wqe_shift);
-               entry->wr_id = (*cur_qp)->wrid[wqe_index + (*cur_qp)->rq.max];
-       } else if ((*cur_qp)->ibv_qp.srq) {
-               struct mthca_srq * srq = to_msrq((*cur_qp)->ibv_qp.srq);
-               uint32_t wqe = cl_hton32(cqe->wqe);
-               wq = NULL;
-               wqe_index = wqe >> srq->wqe_shift;
-               entry->wr_id = srq->wrid[wqe_index];
-               mthca_free_srq_wqe(srq, wqe_index);
-       } else {
-               wq = &(*cur_qp)->rq;
-               wqe_index = cl_ntoh32(cqe->wqe) >> wq->wqe_shift;
-               entry->wr_id = (*cur_qp)->wrid[wqe_index];
-       }
-
-       if (wq) {
-               if ((int)wq->last_comp < wqe_index)
-                       wq->tail += wqe_index - wq->last_comp;
-               else
-                       wq->tail += wqe_index + wq->max - wq->last_comp;
-
-               wq->last_comp = wqe_index;
-       }
-
-       if (is_send) {
-               entry->recv.ud.recv_opt = 0;
-               switch (cqe->opcode) {
-               case MTHCA_OPCODE_RDMA_WRITE:
-                       entry->wc_type    = IB_WC_RDMA_WRITE;
-                       break;
-               case MTHCA_OPCODE_RDMA_WRITE_IMM:
-                       entry->wc_type    = IB_WC_RDMA_WRITE;
-                       entry->recv.ud.recv_opt |= IB_RECV_OPT_IMMEDIATE;
-                       break;
-               case MTHCA_OPCODE_SEND:
-                       entry->wc_type    = IB_WC_SEND;
-                       break;
-               case MTHCA_OPCODE_SEND_IMM:
-                       entry->wc_type    = IB_WC_SEND;
-                       entry->recv.ud.recv_opt |= IB_RECV_OPT_IMMEDIATE;
-                       break;
-               case MTHCA_OPCODE_RDMA_READ:
-                       entry->wc_type    = IB_WC_RDMA_READ;
-                       entry->length  = cl_ntoh32(cqe->byte_cnt);
-                       break;
-               case MTHCA_OPCODE_ATOMIC_CS:
-                       entry->wc_type    = IB_WC_COMPARE_SWAP;
-                       entry->length  = MTHCA_BYTES_PER_ATOMIC_COMPL;
-                       break;
-               case MTHCA_OPCODE_ATOMIC_FA:
-                       entry->wc_type    = IB_WC_FETCH_ADD;
-                       entry->length  = MTHCA_BYTES_PER_ATOMIC_COMPL;
-                       break;
-               case MTHCA_OPCODE_BIND_MW:
-                       entry->wc_type    = IB_WC_MW_BIND;
-                       break;
-               default:
-                       /* assume it's a send completion */
-                       entry->wc_type    = IB_WC_SEND;
-                       break;
-               }
-       } else {
-               entry->length = cl_ntoh32(cqe->byte_cnt);
-               switch (cqe->opcode & 0x1f) {
-               case IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE:
-               case IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
-                       entry->recv.ud.recv_opt  = IB_RECV_OPT_IMMEDIATE;
-                       entry->recv.ud.immediate_data = cqe->imm_etype_pkey_eec;
-                       entry->wc_type = IB_WC_RECV;
-                       break;
-               case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
-               case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
-                       entry->recv.ud.recv_opt  = IB_RECV_OPT_IMMEDIATE;
-                       entry->recv.ud.immediate_data = cqe->imm_etype_pkey_eec;
-                       entry->wc_type = IB_WC_RECV;
-                       break;
-               default:
-                       entry->recv.ud.recv_opt  = 0;
-                       entry->wc_type = IB_WC_RECV;
-                       break;
-               }
-               entry->recv.ud.remote_lid = cqe->rlid;
-               entry->recv.ud.remote_qp = cqe->rqpn & 0xffffff00;
-               entry->recv.ud.pkey_index     = (uint16_t)(cl_ntoh32(cqe->imm_etype_pkey_eec) >> 16);
-               entry->recv.ud.remote_sl           = cl_ntoh16(cqe->sl_g_mlpath) >> 12;
-               entry->recv.ud.path_bits = cl_ntoh16(cqe->sl_g_mlpath) & 0x7f;
-               entry->recv.ud.recv_opt      |= cl_ntoh16(cqe->sl_g_mlpath) & 0x80 ?
-                       IB_RECV_OPT_GRH_VALID : 0;
-       }
-
-
-       if (is_error) {
-               err = handle_error_cqe(cq, *cur_qp, wqe_index, is_send,
-                                      (struct mthca_err_cqe *) cqe,
-                                      entry, &free_cqe);
-       }
-       else
-               entry->status = IB_WCS_SUCCESS;
-
-out:
-       if (likely(free_cqe)) {
-               set_cqe_hw(cqe);
-               ++(*freed);
-               ++cq->cons_index;
-       }
-
-       UVP_EXIT(UVP_DBG_CQ);
-       return err;
-}
-
-int mthca_poll_cq(struct ibv_cq *ibcq, int num_entries, struct _ib_wc *entry)
-{
-       struct mthca_cq *cq = to_mcq(ibcq);
-       struct mthca_qp *qp = NULL;
-       int err = CQ_OK;
-       int freed = 0;
-       int npolled;
-       
-       cl_spinlock_acquire(&cq->lock);
-
-       for (npolled = 0; npolled < num_entries; ++npolled) {
-               err = mthca_poll_one(cq, &qp, &freed, entry + npolled);
-               if (err)
-                       break;
-       }
-
-       if (freed) {
-               wmb();
-               update_cons_index(cq, freed);
-       }
-
-       cl_spinlock_release(&cq->lock);
-
-       return (err == 0 || err == -EAGAIN) ? npolled : err;
-}
-
-int mthca_poll_cq_list(
-       IN              struct ibv_cq *ibcq, 
-       IN      OUT                     struct _ib_wc** const                           pp_free_wclist,
-               OUT                     struct _ib_wc** const                           pp_done_wclist )
-{
-       struct mthca_cq *cq = to_mcq(ibcq);
-       struct mthca_qp *qp = NULL;
-       int err = CQ_OK;
-       int freed = 0;
-       ib_wc_t         *wc_p, **next_pp;
-       uint32_t        wc_cnt = 0;
-
-       cl_spinlock_acquire(&cq->lock);
-
-       // loop through CQ
-       next_pp = pp_done_wclist;
-       wc_p = *pp_free_wclist;
-       while( wc_p ) {
-               // poll one CQE
-               err = mthca_poll_one(cq, &qp, &freed, wc_p);
-               if (err)
-                       break;
-
-               // prepare for the next loop
-               *next_pp = wc_p;
-               next_pp = &wc_p->p_next;
-               wc_p = wc_p->p_next;
-       }
-
-       // prepare the results
-       *pp_free_wclist = wc_p;         /* Set the head of the free list. */
-       *next_pp = NULL;                                                /* Clear the tail of the done list. */
-
-       // update consumer index
-       if (freed) {
-               wmb();
-               update_cons_index(cq, freed);
-       }
-
-       cl_spinlock_release(&cq->lock);
-       return (err == 0 || err == -EAGAIN)? 0 : err; 
-}
-
-int mthca_tavor_arm_cq(struct ibv_cq *cq, enum ib_cq_notify notify)
-{
-       uint32_t doorbell[2];
-
-       doorbell[0] = cl_hton32((notify == IB_CQ_SOLICITED ?
-                            MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
-                            MTHCA_TAVOR_CQ_DB_REQ_NOT)      |
-                           to_mcq(cq)->cqn);
-       doorbell[1] = 0xffffffff;
-
-       mthca_write64(doorbell, to_mctx(cq->context), MTHCA_CQ_DOORBELL);
-
-       return 0;
-}
-
-int mthca_arbel_arm_cq(struct ibv_cq *ibvcq, enum ib_cq_notify notify)
-{
-       struct mthca_cq *cq = to_mcq(ibvcq);
-       uint32_t doorbell[2];
-       uint32_t sn;
-       uint32_t ci;
-
-       sn = *cq->p_u_arm_sn & 3;
-       ci = cl_hton32(cq->cons_index);
-
-       doorbell[0] = ci;
-       doorbell[1] = cl_hton32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
-                           (notify == IB_CQ_SOLICITED ? 1 : 2));
-
-       mthca_write_db_rec(doorbell, cq->arm_db);
-
-       /*
-        * Make sure that the doorbell record in host memory is
-        * written before ringing the doorbell via PCI MMIO.
-        */
-       wmb();
-
-       doorbell[0] = cl_hton32((sn << 28)                       |
-                           (notify == IB_CQ_SOLICITED ?
-                            MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
-                            MTHCA_ARBEL_CQ_DB_REQ_NOT)      |
-                           cq->cqn);
-       doorbell[1] = ci;
-
-       mthca_write64(doorbell, to_mctx(ibvcq->context), MTHCA_CQ_DOORBELL);
-
-       return 0;
-}
-
-static inline int is_recv_cqe(struct mthca_cqe *cqe)
-{
-       if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
-           MTHCA_ERROR_CQE_OPCODE_MASK)
-               return !(cqe->opcode & 0x01);
-       else
-               return !(cqe->is_send & 0x80);
-}
-
-void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq)
-{
-       struct mthca_cqe *cqe;
-       uint32_t prod_index;
-       int nfreed = 0;
-
-       cl_spinlock_acquire(&cq->lock);
-
-       /*
-        * First we need to find the current producer index, so we
-        * know where to start cleaning from.  It doesn't matter if HW
-        * adds new entries after this loop -- the QP we're worried
-        * about is already in RESET, so the new entries won't come
-        * from our QP and therefore don't need to be checked.
-        */
-       for (prod_index = cq->cons_index;
-            cqe_sw(cq, prod_index & cq->ibv_cq.cqe);
-            ++prod_index)
-               if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
-                       break;
-
-       /*
-        * Now sweep backwards through the CQ, removing CQ entries
-        * that match our QP by copying older entries on top of them.
-        */
-       while ((int) --prod_index - (int) cq->cons_index >= 0) {
-               cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
-               if (cqe->my_qpn == cl_hton32(qpn)) {
-                       if (srq && is_recv_cqe(cqe))
-                               mthca_free_srq_wqe(srq,
-                                                  cl_ntoh32(cqe->wqe) >> srq->wqe_shift);
-                       ++nfreed;
-               } else if (nfreed)
-                       memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe),
-                              cqe, MTHCA_CQ_ENTRY_SIZE);
-       }
-
-       if (nfreed) {
-               mb();
-               cq->cons_index += nfreed;
-               update_cons_index(cq, nfreed);
-       }
-
-       cl_spinlock_release(&cq->lock);
-}
-
-void mthca_init_cq_buf(struct mthca_cq *cq, int nent)
-{
-       int i;
-
-       for (i = 0; i < nent; ++i)
-               set_cqe_hw(get_cqe(cq, i));
-
-       cq->cons_index = 0;
-}
+/*\r
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.\r
+ * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.\r
+ *\r
+ * This software is available to you under a choice of one of two\r
+ * licenses.  You may choose to be licensed under the terms of the GNU\r
+ * General Public License (GPL) Version 2, available from the file\r
+ * COPYING in the main directory of this source tree, or the\r
+ * OpenIB.org BSD license below:\r
+ *\r
+ *     Redistribution and use in source and binary forms, with or\r
+ *     without modification, are permitted provided that the following\r
+ *     conditions are met:\r
+ *\r
+ *      - Redistributions of source code must retain the above\r
+ *        copyright notice, this list of conditions and the following\r
+ *        disclaimer.\r
+ *\r
+ *      - Redistributions in binary form must reproduce the above\r
+ *        copyright notice, this list of conditions and the following\r
+ *        disclaimer in the documentation and/or other materials\r
+ *        provided with the distribution.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ *\r
+ * $Id$\r
+ */\r
+\r
+#include <mt_l2w.h>\r
+#include <opcode.h>\r
+#include "mlnx_uvp.h"\r
+#include "mlnx_uvp_doorbell.h"\r
+#include <iba\ib_uvp.h>\r
+\r
+#if defined(EVENT_TRACING)\r
+#include "mlnx_uvp_cq.tmh"\r
+#endif\r
+\r
+\r
+/* Offset of the CQ doorbell register within the HCA UAR page. */\r
+enum {\r
+       MTHCA_CQ_DOORBELL       = 0x20\r
+};\r
+\r
+/* Internal return codes shared by the CQ polling routines below. */\r
+enum {\r
+       CQ_OK           =  0,\r
+       CQ_EMPTY        = -1,\r
+       CQ_POLL_ERR     = -2\r
+};\r
+\r
+/* Tavor CQ doorbell commands (command encoded in bits 31:24 of dword 0). */\r
+#define MTHCA_TAVOR_CQ_DB_INC_CI       (1 << 24)\r
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT      (2 << 24)\r
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL  (3 << 24)\r
+#define MTHCA_TAVOR_CQ_DB_SET_CI       (4 << 24)\r
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)\r
+\r
+/* Arbel (mem-free) CQ doorbell commands. */\r
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL  (1 << 24)\r
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT      (2 << 24)\r
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)\r
+\r
+/* CQE ownership flag (high bit of the owner byte) and error-opcode mask. */\r
+enum {\r
+       MTHCA_CQ_ENTRY_OWNER_SW     = 0x00,\r
+       MTHCA_CQ_ENTRY_OWNER_HW     = 0x80,\r
+       MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe\r
+};\r
+\r
+/* Hardware completion error syndromes; mapped to ib_wc_status_t in\r
+ * handle_error_cqe(). */\r
+enum {\r
+       SYNDROME_LOCAL_LENGTH_ERR        = 0x01,\r
+       SYNDROME_LOCAL_QP_OP_ERR         = 0x02,\r
+       SYNDROME_LOCAL_EEC_OP_ERR        = 0x03,\r
+       SYNDROME_LOCAL_PROT_ERR          = 0x04,\r
+       SYNDROME_WR_FLUSH_ERR            = 0x05,\r
+       SYNDROME_MW_BIND_ERR             = 0x06,\r
+       SYNDROME_BAD_RESP_ERR            = 0x10,\r
+       SYNDROME_LOCAL_ACCESS_ERR        = 0x11,\r
+       SYNDROME_REMOTE_INVAL_REQ_ERR    = 0x12,\r
+       SYNDROME_REMOTE_ACCESS_ERR       = 0x13,\r
+       SYNDROME_REMOTE_OP_ERR           = 0x14,\r
+       SYNDROME_RETRY_EXC_ERR           = 0x15,\r
+       SYNDROME_RNR_RETRY_EXC_ERR       = 0x16,\r
+       SYNDROME_LOCAL_RDD_VIOL_ERR      = 0x20,\r
+       SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,\r
+       SYNDROME_REMOTE_ABORTED_ERR      = 0x22,\r
+       SYNDROME_INVAL_EECN_ERR          = 0x23,\r
+       SYNDROME_INVAL_EEC_STATE_ERR     = 0x24\r
+};\r
+\r
+/* Raw hardware CQE layout.  Multi-byte fields hold big-endian values\r
+ * (note the cl_ntoh32/cl_ntoh16 conversions in the code below). */\r
+struct mthca_cqe {\r
+       uint32_t        my_qpn;\r
+       uint32_t        my_ee;\r
+       uint32_t        rqpn;\r
+       uint16_t        sl_g_mlpath;\r
+       uint16_t        rlid;\r
+       uint32_t        imm_etype_pkey_eec;\r
+       uint32_t        byte_cnt;\r
+       uint32_t        wqe;\r
+       uint8_t         opcode;\r
+       uint8_t         is_send;\r
+       uint8_t         reserved;\r
+       uint8_t         owner;\r
+};\r
+\r
+/* Error-format CQE; overlays struct mthca_cqe when the opcode marks\r
+ * the entry as an error completion. */\r
+struct mthca_err_cqe {\r
+       uint32_t        my_qpn;\r
+       uint32_t        reserved1[3];\r
+       uint8_t         syndrome;\r
+       uint8_t         vendor_err;\r
+       uint16_t        db_cnt;\r
+       uint32_t        reserved2;\r
+       uint32_t        wqe;\r
+       uint8_t         opcode;\r
+       uint8_t         reserved3[2];\r
+       uint8_t         owner;\r
+};\r
+\r
+/* Return a pointer to CQE slot 'entry' in the CQ buffer (fixed-size entries). */\r
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)\r
+{\r
+       return (struct mthca_cqe *)((uint8_t*)cq->buf + entry * MTHCA_CQ_ENTRY_SIZE);\r
+}\r
+\r
+/* Return the CQE at index i if it is owned by software, NULL if hardware\r
+ * still owns it (i.e. no completion has been written there yet). */\r
+static inline struct mthca_cqe *cqe_sw(struct mthca_cq *cq, int i)\r
+{\r
+       struct mthca_cqe *cqe = get_cqe(cq, i);\r
+       return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;\r
+}\r
+\r
+/* Next software-owned CQE at the consumer index, or NULL if the CQ is\r
+ * empty.  cq->ibv_cq.cqe is used as a power-of-two index mask here. */\r
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)\r
+{\r
+       return cqe_sw(cq, cq->cons_index & cq->ibv_cq.cqe);\r
+}\r
+\r
+/* Hand a CQE back to hardware by setting the HW ownership bit. */\r
+static inline void set_cqe_hw(struct mthca_cqe *cqe)\r
+{\r
+       cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;\r
+}\r
+\r
+/*\r
+ * Tell hardware that 'incr' CQEs have been consumed.\r
+ *\r
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index\r
+ * should be correct before calling update_cons_index().\r
+ */\r
+static inline void update_cons_index(struct mthca_cq *cq, int incr)\r
+{\r
+       uint32_t doorbell[2];\r
+\r
+       if (mthca_is_memfree(cq->ibv_cq.context)) {\r
+               /* Arbel: publish the consumer index via the doorbell record. */\r
+               *cq->set_ci_db = cl_hton32(cq->cons_index);\r
+               mb();\r
+       } else {\r
+               /* Tavor: ring the INC_CI doorbell; count field is (incr - 1). */\r
+               doorbell[0] = cl_hton32(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn);\r
+               doorbell[1] = cl_hton32(incr - 1);\r
+\r
+               mthca_write64(doorbell, to_mctx(cq->ibv_cq.context), MTHCA_CQ_DOORBELL);\r
+       }\r
+}\r
+\r
+\r
+/* Debug helper: print the eight dwords of a CQE at the given trace level. */\r
+static void dump_cqe(uint32_t print_lvl, void *cqe_ptr)\r
+{\r
+       uint32_t *cqe = cqe_ptr;\r
+       int i;\r
+       (void) cqe;     /* avoid warning if mthca_dbg compiled away... */\r
+\r
+       UVP_PRINT(print_lvl,UVP_DBG_CQ,("CQE content \n"));\r
+       UVP_PRINT(print_lvl,UVP_DBG_CQ,(" [%2x] %08x %08x %08x %08x \n",0\r
+               , cl_ntoh32(cqe[0]), cl_ntoh32(cqe[1]), cl_ntoh32(cqe[2]), cl_ntoh32(cqe[3])));\r
+       UVP_PRINT(print_lvl,UVP_DBG_CQ,(" [%2x] %08x %08x %08x %08x\n",16\r
+               , cl_ntoh32(cqe[4]), cl_ntoh32(cqe[5]), cl_ntoh32(cqe[6]), cl_ntoh32(cqe[7])));\r
+       \r
+}\r
+\r
+/*\r
+ * Translate an error CQE into the matching ib_wc_t completion status.\r
+ *\r
+ * On Tavor-family HCAs one error CQE may account for several WQEs\r
+ * (doorbell count), so the CQE may be rewritten as a flush error and\r
+ * retained (*free_cqe = 0) for the next poll instead of being released\r
+ * back to hardware.  Returns 0 or the error from mthca_free_err_wqe().\r
+ */\r
+static int handle_error_cqe(struct mthca_cq *cq,\r
+                           struct mthca_qp *qp, int wqe_index, int is_send,\r
+                           struct mthca_err_cqe *cqe,\r
+                           struct _ib_wc *entry, int *free_cqe)\r
+{\r
+       int err;\r
+       int dbd;\r
+       uint32_t new_wqe;\r
+\r
+       if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {\r
+               UVP_PRINT(TRACE_LEVEL_ERROR , UVP_DBG_CQ,("local QP operation err "\r
+                      "(QPN %06x, WQE @ %08x, CQN %06x, index %d, vendor_err %d)\n",\r
+                      cl_ntoh32(cqe->my_qpn), cl_ntoh32(cqe->wqe),\r
+                      cq->cqn, cq->cons_index, cqe->vendor_err));\r
+               dump_cqe(TRACE_LEVEL_VERBOSE, cqe);\r
+       }\r
+\r
+       /*\r
+        * For completions in error, only work request ID, status, vendor error\r
+        * (and freed resource count for RD) have to be set.\r
+        */\r
+       switch (cqe->syndrome) {\r
+       case SYNDROME_LOCAL_LENGTH_ERR:\r
+               entry->status = IB_WCS_LOCAL_LEN_ERR;\r
+               break;\r
+       case SYNDROME_LOCAL_QP_OP_ERR:\r
+               entry->status = IB_WCS_LOCAL_OP_ERR;\r
+               break;\r
+       case SYNDROME_LOCAL_PROT_ERR:\r
+               entry->status = IB_WCS_LOCAL_PROTECTION_ERR;\r
+               break;\r
+       case SYNDROME_WR_FLUSH_ERR:\r
+               entry->status = IB_WCS_WR_FLUSHED_ERR;\r
+               break;\r
+       case SYNDROME_MW_BIND_ERR:\r
+               entry->status = IB_WCS_MEM_WINDOW_BIND_ERR;\r
+               break;\r
+       case SYNDROME_BAD_RESP_ERR:\r
+               entry->status = IB_WCS_BAD_RESP_ERR;\r
+               break;\r
+       case SYNDROME_LOCAL_ACCESS_ERR:\r
+               entry->status = IB_WCS_LOCAL_ACCESS_ERR;\r
+               break;\r
+       case SYNDROME_REMOTE_INVAL_REQ_ERR:\r
+               entry->status = IB_WCS_REM_INVALID_REQ_ERR;\r
+               break;\r
+       case SYNDROME_REMOTE_ACCESS_ERR:\r
+               entry->status = IB_WCS_REM_ACCESS_ERR;\r
+               break;\r
+       case SYNDROME_REMOTE_OP_ERR:\r
+               entry->status = IB_WCS_REM_OP_ERR;\r
+               break;\r
+       case SYNDROME_RETRY_EXC_ERR:\r
+               entry->status = IB_WCS_TIMEOUT_RETRY_ERR;\r
+               break;\r
+       case SYNDROME_RNR_RETRY_EXC_ERR:\r
+               entry->status = IB_WCS_RNR_RETRY_ERR;\r
+               break;\r
+       case SYNDROME_LOCAL_EEC_OP_ERR:\r
+       case SYNDROME_LOCAL_RDD_VIOL_ERR:\r
+       case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:\r
+       case SYNDROME_REMOTE_ABORTED_ERR:\r
+       case SYNDROME_INVAL_EECN_ERR:\r
+       case SYNDROME_INVAL_EEC_STATE_ERR:\r
+       default:\r
+               /* RD/EEC syndromes and anything unknown collapse to a\r
+                * generic error status. */\r
+               entry->status = IB_WCS_GENERAL_ERR;\r
+               break;\r
+       }\r
+\r
+       entry->vendor_specific = cqe->vendor_err;\r
+       \r
+       /*\r
+        * Mem-free HCAs always generate one CQE per WQE, even in the\r
+        * error case, so we don't have to check the doorbell count, etc.\r
+        */\r
+       if (mthca_is_memfree(cq->ibv_cq.context))\r
+               return 0;\r
+\r
+       err = mthca_free_err_wqe(qp, is_send, wqe_index, &dbd, &new_wqe);\r
+       if (err)\r
+               return err;\r
+\r
+       /*\r
+        * If we're at the end of the WQE chain, or we've used up our\r
+        * doorbell count, free the CQE.  Otherwise just update it for\r
+        * the next poll operation.\r
+        * \r
+        * This doesn't apply to mem-free HCAs, which never use the\r
+        * doorbell count field.  In that case we always free the CQE.\r
+        */\r
+       if (mthca_is_memfree(cq->ibv_cq.context) ||\r
+           !(new_wqe & cl_hton32(0x3f)) || (!cqe->db_cnt && dbd))\r
+               return 0;\r
+\r
+       cqe->db_cnt   = cl_hton16(cl_ntoh16(cqe->db_cnt) - dbd);\r
+       cqe->wqe      = new_wqe;\r
+       cqe->syndrome = SYNDROME_WR_FLUSH_ERR;\r
+\r
+       *free_cqe = 0;\r
+\r
+       return 0;\r
+}\r
+\r
+/*\r
+ * Poll one completion off 'cq' into 'entry'.\r
+ *\r
+ * *cur_qp caches the QP of the previous completion so the QP-table\r
+ * lookup is skipped when consecutive CQEs belong to the same QP.\r
+ * *freed is incremented for each CQE handed back to hardware so the\r
+ * caller can ring the consumer-index doorbell once for the batch.\r
+ * Returns 0 on success, -EAGAIN if the CQ is empty, or a negative\r
+ * error code.  Caller must hold cq->lock.\r
+ */\r
+static inline int mthca_poll_one(struct mthca_cq *cq,\r
+                                struct mthca_qp **cur_qp,\r
+                                int *freed,\r
+                                struct _ib_wc *entry)\r
+{\r
+       struct mthca_wq *wq;\r
+       struct mthca_cqe *cqe;\r
+       uint32_t qpn;\r
+       int wqe_index;\r
+       int is_error;\r
+       int is_send;\r
+       int free_cqe = 1;\r
+       int err = 0;\r
+\r
+       UVP_ENTER(UVP_DBG_CQ);\r
+       \r
+       cqe = next_cqe_sw(cq);\r
+       if (!cqe)\r
+               return -EAGAIN;\r
+\r
+       /*\r
+        * Make sure we read CQ entry contents after we've checked the\r
+        * ownership bit.\r
+        */\r
+       rmb();\r
+\r
+       { // debug print\r
+               UVP_PRINT(TRACE_LEVEL_VERBOSE,UVP_DBG_CQ,("%x/%d: CQE -> QPN %06x, WQE @ %08x\n",\r
+                         cq->cqn, cq->cons_index, cl_ntoh32(cqe->my_qpn),\r
+                         cl_ntoh32(cqe->wqe)));\r
+               dump_cqe(TRACE_LEVEL_VERBOSE,cqe);\r
+       }\r
+       \r
+       qpn = cl_ntoh32(cqe->my_qpn);\r
+\r
+       /* Error CQEs are flagged in the opcode; bit 0 then distinguishes\r
+        * send vs. receive, otherwise the is_send byte does. */\r
+       is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==\r
+               MTHCA_ERROR_CQE_OPCODE_MASK;\r
+       is_send  = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;\r
+\r
+       if (!*cur_qp || cl_ntoh32(cqe->my_qpn) != (*cur_qp)->ibv_qp.qp_num) {\r
+               /*\r
+                * We do not have to take the QP table lock here,\r
+                * because CQs will be locked while QPs are removed\r
+                * from the table.\r
+                */\r
+               *cur_qp = mthca_find_qp(to_mctx(cq->ibv_cq.context), cl_ntoh32(cqe->my_qpn));\r
+               if (!*cur_qp) {\r
+                       UVP_PRINT(TRACE_LEVEL_WARNING,UVP_DBG_CQ, ("CQ entry for unknown QP %06x\n",\r
+                                  cl_ntoh32(cqe->my_qpn) & 0xffffff));\r
+                       err = -EINVAL;\r
+                       goto out;\r
+               }\r
+       }\r
+\r
+       /* Recover the work request ID from the WQE address in the CQE. */\r
+       if (is_send) {\r
+               wq = &(*cur_qp)->sq;\r
+               wqe_index = ((cl_ntoh32(cqe->wqe) - (*cur_qp)->send_wqe_offset) >> wq->wqe_shift);\r
+               entry->wr_id = (*cur_qp)->wrid[wqe_index + (*cur_qp)->rq.max];\r
+       } else if ((*cur_qp)->ibv_qp.srq) {\r
+               struct mthca_srq * srq = to_msrq((*cur_qp)->ibv_qp.srq);\r
+               uint32_t wqe = cl_hton32(cqe->wqe);\r
+               wq = NULL;\r
+               wqe_index = wqe >> srq->wqe_shift;\r
+               entry->wr_id = srq->wrid[wqe_index];\r
+               mthca_free_srq_wqe(srq, wqe_index);\r
+       } else {\r
+               wq = &(*cur_qp)->rq;\r
+               wqe_index = cl_ntoh32(cqe->wqe) >> wq->wqe_shift;\r
+               entry->wr_id = (*cur_qp)->wrid[wqe_index];\r
+       }\r
+\r
+       /* Advance the work queue tail past the completed WQE (not for SRQ). */\r
+       if (wq) {\r
+               if ((int)wq->last_comp < wqe_index)\r
+                       wq->tail += wqe_index - wq->last_comp;\r
+               else\r
+                       wq->tail += wqe_index + wq->max - wq->last_comp;\r
+\r
+               wq->last_comp = wqe_index;\r
+       }\r
+\r
+       if (is_send) {\r
+               entry->recv.ud.recv_opt = 0;\r
+               switch (cqe->opcode) {\r
+               case MTHCA_OPCODE_RDMA_WRITE:\r
+                       entry->wc_type    = IB_WC_RDMA_WRITE;\r
+                       break;\r
+               case MTHCA_OPCODE_RDMA_WRITE_IMM:\r
+                       entry->wc_type    = IB_WC_RDMA_WRITE;\r
+                       entry->recv.ud.recv_opt |= IB_RECV_OPT_IMMEDIATE;\r
+                       break;\r
+               case MTHCA_OPCODE_SEND:\r
+                       entry->wc_type    = IB_WC_SEND;\r
+                       break;\r
+               case MTHCA_OPCODE_SEND_IMM:\r
+                       entry->wc_type    = IB_WC_SEND;\r
+                       entry->recv.ud.recv_opt |= IB_RECV_OPT_IMMEDIATE;\r
+                       break;\r
+               case MTHCA_OPCODE_RDMA_READ:\r
+                       entry->wc_type    = IB_WC_RDMA_READ;\r
+                       entry->length  = cl_ntoh32(cqe->byte_cnt);\r
+                       break;\r
+               case MTHCA_OPCODE_ATOMIC_CS:\r
+                       entry->wc_type    = IB_WC_COMPARE_SWAP;\r
+                       entry->length  = MTHCA_BYTES_PER_ATOMIC_COMPL;\r
+                       break;\r
+               case MTHCA_OPCODE_ATOMIC_FA:\r
+                       entry->wc_type    = IB_WC_FETCH_ADD;\r
+                       entry->length  = MTHCA_BYTES_PER_ATOMIC_COMPL;\r
+                       break;\r
+               case MTHCA_OPCODE_BIND_MW:\r
+                       entry->wc_type    = IB_WC_MW_BIND;\r
+                       break;\r
+               default:\r
+                       /* assume it's a send completion */\r
+                       entry->wc_type    = IB_WC_SEND;\r
+                       break;\r
+               }\r
+       } else {\r
+               entry->length = cl_ntoh32(cqe->byte_cnt);\r
+               switch (cqe->opcode & 0x1f) {\r
+               case IBV_OPCODE_SEND_LAST_WITH_IMMEDIATE:\r
+               case IBV_OPCODE_SEND_ONLY_WITH_IMMEDIATE:\r
+                       entry->recv.ud.recv_opt  = IB_RECV_OPT_IMMEDIATE;\r
+                       entry->recv.ud.immediate_data = cqe->imm_etype_pkey_eec;\r
+                       entry->wc_type = IB_WC_RECV;\r
+                       break;\r
+               case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:\r
+               case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:\r
+                       entry->recv.ud.recv_opt  = IB_RECV_OPT_IMMEDIATE;\r
+                       entry->recv.ud.immediate_data = cqe->imm_etype_pkey_eec;\r
+                       entry->wc_type = IB_WC_RECV;\r
+                       break;\r
+               default:\r
+                       entry->recv.ud.recv_opt  = 0;\r
+                       entry->wc_type = IB_WC_RECV;\r
+                       break;\r
+               }\r
+               entry->recv.ud.remote_lid = cqe->rlid;\r
+               entry->recv.ud.remote_qp = cqe->rqpn & 0xffffff00;\r
+               entry->recv.ud.pkey_index     = (uint16_t)(cl_ntoh32(cqe->imm_etype_pkey_eec) >> 16);\r
+               entry->recv.ud.remote_sl           = cl_ntoh16(cqe->sl_g_mlpath) >> 12;\r
+               entry->recv.ud.path_bits = cl_ntoh16(cqe->sl_g_mlpath) & 0x7f;\r
+               entry->recv.ud.recv_opt      |= cl_ntoh16(cqe->sl_g_mlpath) & 0x80 ?\r
+                       IB_RECV_OPT_GRH_VALID : 0;\r
+       }\r
+\r
+\r
+       if (is_error) {\r
+               err = handle_error_cqe(cq, *cur_qp, wqe_index, is_send,\r
+                                      (struct mthca_err_cqe *) cqe,\r
+                                      entry, &free_cqe);\r
+       }\r
+       else\r
+               entry->status = IB_WCS_SUCCESS;\r
+\r
+out:\r
+       if (likely(free_cqe)) {\r
+               set_cqe_hw(cqe);\r
+               ++(*freed);\r
+               ++cq->cons_index;\r
+       }\r
+\r
+       UVP_EXIT(UVP_DBG_CQ);\r
+       return err;\r
+}\r
+\r
+/*\r
+ * Poll up to num_entries completions into the uvp_wc array 'entry'.\r
+ *\r
+ * Each element is handed to mthca_poll_one() cast to ib_wc_t; this\r
+ * relies on _uvp_wc being layout-compatible with ib_wc_t, with the\r
+ * pointer-sized qp_context field occupying ib_wc_t's leading pointer\r
+ * slot (see the _uvp_wc definition -- TODO(review): confirm against\r
+ * the ib_wc_t declaration).  qp_context is filled in here after each\r
+ * successful poll.  Returns the number of completions polled, or a\r
+ * negative error code.\r
+ */\r
+int mthca_poll_cq(struct ibv_cq *ibcq, int num_entries, struct _uvp_wc *entry)\r
+{\r
+       struct mthca_cq *cq = to_mcq(ibcq);\r
+       struct mthca_qp *qp = NULL;\r
+       int err = CQ_OK;\r
+       int freed = 0;\r
+       int npolled;\r
+       \r
+       cl_spinlock_acquire(&cq->lock);\r
+\r
+       for (npolled = 0; npolled < num_entries; ++npolled) {\r
+               err = mthca_poll_one(cq, &qp, &freed, (struct _ib_wc *) (entry + npolled));\r
+               if (err)\r
+                       break;\r
+               entry[npolled].qp_context = qp->ibv_qp.qp_context;\r
+       }\r
+\r
+       if (freed) {\r
+               /* Order CQE ownership writes before the doorbell ring. */\r
+               wmb();\r
+               update_cons_index(cq, freed);\r
+       }\r
+\r
+       cl_spinlock_release(&cq->lock);\r
+\r
+       return (err == 0 || err == -EAGAIN) ? npolled : err;\r
+}\r
+\r
+/*\r
+ * Poll completions into the caller-supplied linked list of free WCs.\r
+ *\r
+ * Consumed entries are moved from *pp_free_wclist onto *pp_done_wclist;\r
+ * on return *pp_free_wclist points at the first unused entry and the\r
+ * done list is NULL-terminated.  Returns 0 on success (including an\r
+ * empty CQ), otherwise a negative error code.\r
+ */\r
+int mthca_poll_cq_list(\r
+       IN              struct ibv_cq *ibcq, \r
+       IN      OUT                     struct _ib_wc** const                           pp_free_wclist,\r
+               OUT                     struct _ib_wc** const                           pp_done_wclist )\r
+{\r
+       struct mthca_cq *cq = to_mcq(ibcq);\r
+       struct mthca_qp *qp = NULL;\r
+       int err = CQ_OK;\r
+       int freed = 0;\r
+       ib_wc_t         *wc_p, **next_pp;\r
+       uint32_t        wc_cnt = 0;\r
+\r
+       cl_spinlock_acquire(&cq->lock);\r
+\r
+       // loop through CQ\r
+       next_pp = pp_done_wclist;\r
+       wc_p = *pp_free_wclist;\r
+       while( wc_p ) {\r
+               // poll one CQE\r
+               err = mthca_poll_one(cq, &qp, &freed, wc_p);\r
+               if (err)\r
+                       break;\r
+\r
+               // prepare for the next loop\r
+               *next_pp = wc_p;\r
+               next_pp = &wc_p->p_next;\r
+               wc_p = wc_p->p_next;\r
+       }\r
+\r
+       // prepare the results\r
+       *pp_free_wclist = wc_p;         /* Set the head of the free list. */\r
+       *next_pp = NULL;                                                /* Clear the tail of the done list. */\r
+\r
+       // update consumer index\r
+       if (freed) {\r
+               wmb();\r
+               update_cons_index(cq, freed);\r
+       }\r
+\r
+       cl_spinlock_release(&cq->lock);\r
+       return (err == 0 || err == -EAGAIN)? 0 : err; \r
+}\r
+\r
+/*\r
+ * Arm a Tavor CQ for notification by ringing the request-notification\r
+ * doorbell (solicited-only or any completion, per 'notify').\r
+ */\r
+int mthca_tavor_arm_cq(struct ibv_cq *cq, enum ib_cq_notify notify)\r
+{\r
+       uint32_t doorbell[2];\r
+\r
+       doorbell[0] = cl_hton32((notify == IB_CQ_SOLICITED ?\r
+                            MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :\r
+                            MTHCA_TAVOR_CQ_DB_REQ_NOT)      |\r
+                           to_mcq(cq)->cqn);\r
+       doorbell[1] = 0xffffffff;\r
+\r
+       mthca_write64(doorbell, to_mctx(cq->context), MTHCA_CQ_DOORBELL);\r
+\r
+       return 0;\r
+}\r
+\r
+/*\r
+ * Arm an Arbel (mem-free) CQ: first update the arm doorbell record\r
+ * (with the current arm sequence number and consumer index), then ring\r
+ * the MMIO doorbell.  The wmb() orders the record write before the\r
+ * MMIO write so the HCA observes a consistent state.\r
+ */\r
+int mthca_arbel_arm_cq(struct ibv_cq *ibvcq, enum ib_cq_notify notify)\r
+{\r
+       struct mthca_cq *cq = to_mcq(ibvcq);\r
+       uint32_t doorbell[2];\r
+       uint32_t sn;\r
+       uint32_t ci;\r
+\r
+       sn = *cq->p_u_arm_sn & 3;\r
+       ci = cl_hton32(cq->cons_index);\r
+\r
+       doorbell[0] = ci;\r
+       doorbell[1] = cl_hton32((cq->cqn << 8) | (2 << 5) | (sn << 3) |\r
+                           (notify == IB_CQ_SOLICITED ? 1 : 2));\r
+\r
+       mthca_write_db_rec(doorbell, cq->arm_db);\r
+\r
+       /*\r
+        * Make sure that the doorbell record in host memory is\r
+        * written before ringing the doorbell via PCI MMIO.\r
+        */\r
+       wmb();\r
+\r
+       doorbell[0] = cl_hton32((sn << 28)                       |\r
+                           (notify == IB_CQ_SOLICITED ?\r
+                            MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :\r
+                            MTHCA_ARBEL_CQ_DB_REQ_NOT)      |\r
+                           cq->cqn);\r
+       doorbell[1] = ci;\r
+\r
+       mthca_write64(doorbell, to_mctx(ibvcq->context), MTHCA_CQ_DOORBELL);\r
+\r
+       return 0;\r
+}\r
+\r
+/* True if the CQE (normal or error format) describes a receive\r
+ * completion; send/recv is encoded differently in the two formats. */\r
+static inline int is_recv_cqe(struct mthca_cqe *cqe)\r
+{\r
+       if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==\r
+           MTHCA_ERROR_CQE_OPCODE_MASK)\r
+               return !(cqe->opcode & 0x01);\r
+       else\r
+               return !(cqe->is_send & 0x80);\r
+}\r
+\r
+/*\r
+ * Remove all CQEs belonging to QP 'qpn' from the CQ, compacting the\r
+ * remaining entries downward.  Used when the QP is being reset or\r
+ * destroyed; receive completions posted to an SRQ also return their\r
+ * WQEs to the SRQ free list.\r
+ */\r
+void mthca_cq_clean(struct mthca_cq *cq, uint32_t qpn, struct mthca_srq *srq)\r
+{\r
+       struct mthca_cqe *cqe;\r
+       uint32_t prod_index;\r
+       int nfreed = 0;\r
+\r
+       cl_spinlock_acquire(&cq->lock);\r
+\r
+       /*\r
+        * First we need to find the current producer index, so we\r
+        * know where to start cleaning from.  It doesn't matter if HW\r
+        * adds new entries after this loop -- the QP we're worried\r
+        * about is already in RESET, so the new entries won't come\r
+        * from our QP and therefore don't need to be checked.\r
+        */\r
+       for (prod_index = cq->cons_index;\r
+            cqe_sw(cq, prod_index & cq->ibv_cq.cqe);\r
+            ++prod_index)\r
+               if (prod_index == cq->cons_index + cq->ibv_cq.cqe)\r
+                       break;\r
+\r
+       /*\r
+        * Now sweep backwards through the CQ, removing CQ entries\r
+        * that match our QP by copying older entries on top of them.\r
+        */\r
+       while ((int) --prod_index - (int) cq->cons_index >= 0) {\r
+               cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);\r
+               if (cqe->my_qpn == cl_hton32(qpn)) {\r
+                       if (srq && is_recv_cqe(cqe))\r
+                               mthca_free_srq_wqe(srq,\r
+                                                  cl_ntoh32(cqe->wqe) >> srq->wqe_shift);\r
+                       ++nfreed;\r
+               } else if (nfreed)\r
+                       memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe),\r
+                              cqe, MTHCA_CQ_ENTRY_SIZE);\r
+       }\r
+\r
+       if (nfreed) {\r
+               /* Publish the compacted ring before advancing the index. */\r
+               mb();\r
+               cq->cons_index += nfreed;\r
+               update_cons_index(cq, nfreed);\r
+       }\r
+\r
+       cl_spinlock_release(&cq->lock);\r
+}\r
+\r
+/* Hand all 'nent' CQEs to hardware and reset the consumer index;\r
+ * called when (re)initializing the CQ buffer. */\r
+void mthca_init_cq_buf(struct mthca_cq *cq, int nent)\r
+{\r
+       int i;\r
+\r
+       for (i = 0; i < nent; ++i)\r
+               set_cqe_hw(get_cqe(cq, i));\r
+\r
+       cq->cons_index = 0;\r
+}\r
index 18fe060..069271c 100644 (file)
@@ -447,7 +447,7 @@ struct ibv_context_ops {
                               struct ibv_create_cq *req);\r
        struct ibv_cq * (*create_cq_post)(struct ibv_context *context, \r
                               struct ibv_create_cq_resp *resp);\r
-       int                     (*poll_cq)(struct ibv_cq *cq, int num_entries, struct _ib_wc *wc);\r
+       int                     (*poll_cq)(struct ibv_cq *cq, int num_entries, struct _uvp_wc *wc);\r
        int                     (*poll_cq_list)( struct ibv_cq *ibcq, \r
                struct _ib_wc** const                   pp_free_wclist,\r
                struct _ib_wc** const                   pp_done_wclist );\r
index 6e21944..faf3e68 100644 (file)
@@ -3011,6 +3011,51 @@ typedef ib_api_status_t
 \r
 /********/\r
 \r
+/*\r
+ * Define uvp_wc_t so that we can cast directly to ib_wc_t.\r
+ */\r
+typedef struct _uvp_wc\r
+{\r
+       /* QP context of the completed work request.  Occupies the leading\r
+        * pointer slot so this struct stays cast-compatible with ib_wc_t\r
+        * (see the comment preceding this definition). */\r
+       void*                                   qp_context;\r
+       /* If pointer size is 32-bits, then compiler will pad before uint64_t */\r
+       uint64_t                                wr_id;\r
+       ib_wc_type_t                    wc_type;\r
+\r
+       uint32_t                                length;\r
+       uint64_t                                vendor_specific;\r
+       ib_wc_status_t                  status;\r
+\r
+       union _uvp_wc_recv\r
+       {\r
+               struct _uvp_wc_conn\r
+               {\r
+                       ib_recv_opt_t   recv_opt;\r
+                       ib_net32_t              immediate_data;\r
+\r
+               }       conn;\r
+\r
+               struct _uvp_wc_ud\r
+               {\r
+                       ib_recv_opt_t   recv_opt;\r
+                       ib_net32_t              immediate_data;\r
+                       ib_net32_t              remote_qp;\r
+                       uint16_t                pkey_index;\r
+                       ib_net16_t              remote_lid;\r
+                       uint8_t                 remote_sl;\r
+                       uint8_t                 path_bits;\r
+\r
+               }       ud;\r
+       }       recv;\r
+}      uvp_wc_t;\r
+\r
+/* Poll up to num_entries completions from h_cq into the wcs array;\r
+ * returns the number polled or a negative error code. */\r
+typedef int\r
+(AL_API *uvp_poll_cq_array) (\r
+       IN              const   void*                                           h_cq,\r
+       IN              const   int                                                     num_entries,\r
+       IN      OUT                     uvp_wc_t*       const                   wcs);\r
+\r
+/********/\r
+\r
 /****f* user-mode Verbs/uvp_rearm_cq\r
 * NAME\r
 *      uvp_rearm_cq -- Invoke the Completion handler, on next entry added.\r
@@ -3455,6 +3500,7 @@ typedef struct _uvp_interface
        uvp_nd_modify_qp_t                      nd_modify_qp;\r
        uvp_nd_get_qp_state_t           nd_get_qp_state;\r
        uvp_wv_pre_create_qp            wv_pre_create_qp;\r
+       uvp_poll_cq_array                       poll_cq_array;\r
 \r
 } uvp_interface_t;\r
 \r