[mlx4] Add support for large send offload in mlx4. (mlnx: 3041)
author tzachid <tzachid@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Tue, 26 Aug 2008 17:36:39 +0000 (17:36 +0000)
committer tzachid <tzachid@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Tue, 26 Aug 2008 17:36:39 +0000 (17:36 +0000)
git-svn-id: svn://openib.tc.cornell.edu/gen1/trunk@1514 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86

hw/mlx4/kernel/bus/ib/cq.c
hw/mlx4/kernel/bus/ib/main.c
hw/mlx4/kernel/bus/ib/mlx4_ib.h
hw/mlx4/kernel/bus/ib/qp.c
hw/mlx4/kernel/bus/inc/device.h
hw/mlx4/kernel/bus/inc/ib_verbs.h
hw/mlx4/kernel/bus/inc/qp.h
hw/mlx4/kernel/bus/net/fw.c
hw/mlx4/kernel/bus/net/fw.h
hw/mlx4/kernel/bus/net/main.c
hw/mlx4/kernel/hca/qp.c

index 5be4388..3b80a83 100644 (file)
@@ -461,6 +461,9 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
                case MLX4_OPCODE_BIND_MW:\r
                        wc->wc_type    = IB_WC_MW_BIND;\r
                        break;\r
+               case MLX4_OPCODE_LSO:\r
+                       wc->wc_type    = IB_WC_LSO;\r
+                       break;\r
                default:\r
                        wc->wc_type       = IB_WC_SEND;\r
                        break;\r
index fa46eb7..447c80a 100644 (file)
@@ -86,6 +86,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev,
                props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
        if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
                props->device_cap_flags |= IB_DEVICE_IPOIB_CSUM;
+       if (dev->dev->caps.max_gso_sz)
+               props->device_cap_flags |= IB_DEVICE_UD_TSO;
        props->vendor_id           = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
                0xffffff;
        props->vendor_part_id      = be16_to_cpup((__be16 *) (out_mad->data + 30));
index 467ac91..ef6999f 100644 (file)
@@ -116,6 +116,11 @@ struct mlx4_ib_wq {
        unsigned                tail;
 };
 
+enum mlx4_ib_qp_flags {
+       MLX4_IB_QP_LSO= 1 << 0
+};
+
+
 struct mlx4_ib_qp {
        struct ib_qp            ibqp;
        struct mlx4_qp          mqp;
@@ -133,6 +138,7 @@ struct mlx4_ib_qp {
        struct mlx4_mtt         mtt;
        int                     buf_size;
        struct mutex            mutex;
+       u32                     flags;
        u8                      port;
        u8                      alt_port;
        u8                      atomic_rd_en;
index 40a51ec..38a9d32 100644 (file)
@@ -73,6 +73,7 @@ static const __be32 mlx4_ib_opcode[] = {
        __constant_cpu_to_be32(MLX4_OPCODE_RDMA_READ),          /*      [IB_WR_RDMA_READ]                       */\r
        __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),          /*      [IB_WR_ATOMIC_CMP_AND_SWP]      */\r
        __constant_cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),          /*      [IB_WR_ATOMIC_FETCH_AND_ADD]*/\r
+       __constant_cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6))                              /* [IB_WR_LSO]                                  */\r
 };\r
 \r
 static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)\r
@@ -168,7 +169,7 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
        ibqp->event_handler(&event);\r
 }\r
 \r
-static int send_wqe_overhead(enum ib_qp_type type)\r
+static int send_wqe_overhead(enum ib_qp_type type, u32 flags)\r
 {\r
        /*\r
         * UD WQEs must have a datagram segment.\r
@@ -178,8 +179,9 @@ static int send_wqe_overhead(enum ib_qp_type type)
         */\r
        switch (type) {\r
        case IB_QPT_UD:\r
-               return sizeof (struct mlx4_wqe_ctrl_seg) +\r
-                       sizeof (struct mlx4_wqe_datagram_seg);\r
+               return sizeof (struct mlx4_wqe_ctrl_seg)  +\r
+                       sizeof (struct mlx4_wqe_datagram_seg) +\r
+                       ((flags & MLX4_IB_QP_LSO) ? 64 : 0);\r
        case IB_QPT_UC:\r
                return sizeof (struct mlx4_wqe_ctrl_seg) +\r
                        sizeof (struct mlx4_wqe_raddr_seg);\r
@@ -249,7 +251,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
        /* Sanity check SQ size before proceeding */\r
        if ((int)cap->max_send_wr       > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE  ||\r
            (int)cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||\r
-           (int)cap->max_inline_data + send_wqe_overhead(type) +\r
+           (int)cap->max_inline_data + send_wqe_overhead(type, qp->flags) +\r
            (int)sizeof(struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)\r
                return -EINVAL;\r
 \r
@@ -265,9 +267,9 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
                                                        sizeof (struct mlx4_wqe_data_seg),\r
                                                        cap->max_inline_data +\r
                                                        sizeof (struct mlx4_wqe_inline_seg)) +\r
-                                                   send_wqe_overhead(type)));\r
+                                                   send_wqe_overhead(type,qp->flags)));\r
        qp->sq.wqe_shift = max(MLX4_IB_SQ_MIN_WQE_SHIFT, qp->sq.wqe_shift);\r
-       qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /\r
+       qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type,qp->flags)) /\r
                sizeof (struct mlx4_wqe_data_seg);\r
 \r
        /*\r
@@ -380,7 +382,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                }\r
        } else {\r
                qp->sq_no_prefetch = 0;\r
-\r
+               \r
+               if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)\r
+                       qp->flags |= MLX4_IB_QP_LSO;\r
+               \r
                err = set_kernel_sq_size(dev, &init_attr->cap, init_attr->qp_type, qp);\r
                if (err)\r
                        goto err;\r
@@ -561,6 +566,13 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        struct mlx4_ib_qp *qp;\r
        int err;\r
 \r
+       /* TODO: suggest to remove :We only support LSO, and only for kernel UD QPs. */\r
+       /*if (init_attr->create_flags & ~IB_QP_CREATE_IPOIB_UD_LSO)\r
+               return ERR_PTR(-EINVAL);\r
+       if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO &&\r
+               (pd->uobject || init_attr->qp_type != IB_QPT_UD))\r
+               return ERR_PTR(-EINVAL);*/\r
+\r
        if (mlx4_is_barred(pd->device->dma_device))\r
                return ERR_PTR(-EFAULT);\r
 \r
@@ -767,13 +779,18 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
                }\r
        }\r
 \r
-       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||\r
-           ibqp->qp_type == IB_QPT_UD)\r
+       if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI )\r
                context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;\r
-       else if (attr_mask & IB_QP_PATH_MTU) {\r
+       else if (ibqp->qp_type == IB_QPT_UD) {\r
+               if (qp->flags & MLX4_IB_QP_LSO)\r
+                       context->mtu_msgmax = (u8)((IB_MTU_4096 << 5) |\r
+                                       ilog2(dev->dev->caps.max_gso_sz));\r
+               else\r
+                       context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;\r
+       } else if (attr_mask & IB_QP_PATH_MTU) {\r
                if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {\r
                        printk(KERN_ERR "path MTU (%u) is invalid\n",\r
-                              attr->path_mtu);\r
+                               attr->path_mtu);\r
                        goto out;\r
                }\r
                context->mtu_msgmax = (u8)((attr->path_mtu << 5) |\r
@@ -1093,12 +1110,21 @@ static enum ib_wr_opcode to_wr_opcode(struct _ib_send_wr *wr)
                case WR_SEND: \r
                        opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? IB_WR_SEND_WITH_IMM : IB_WR_SEND;\r
                        break;\r
+               case WR_LSO:\r
+                       opcode = IB_WR_LSO;\r
+                       break;\r
                case WR_RDMA_WRITE:     \r
                        opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? IB_WR_RDMA_WRITE_WITH_IMM : IB_WR_RDMA_WRITE;\r
                        break;\r
-               case WR_RDMA_READ:              opcode = IB_WR_RDMA_READ; break;\r
-               case WR_COMPARE_SWAP:           opcode = IB_WR_ATOMIC_CMP_AND_SWP; break;\r
-               case WR_FETCH_ADD:                      opcode = IB_WR_ATOMIC_FETCH_AND_ADD; break;\r
+               case WR_RDMA_READ:\r
+                       opcode = IB_WR_RDMA_READ;\r
+                       break;\r
+               case WR_COMPARE_SWAP:\r
+                       opcode = IB_WR_ATOMIC_CMP_AND_SWP;\r
+                       break;\r
+               case WR_FETCH_ADD:\r
+                       opcode = IB_WR_ATOMIC_FETCH_AND_ADD;\r
+                       break;\r
        }\r
        return opcode;\r
 }\r
@@ -1321,12 +1347,43 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, ib_local_ds_t *sg)
        dseg->addr       = cpu_to_be64(sg->vaddr);\r
 }\r
 \r
+static int build_lso_seg(struct mlx4_lso_seg *wqe, ib_send_wr_t *wr,\r
+                                                struct mlx4_ib_qp *qp, unsigned *lso_seg_len)\r
+ {\r
+       unsigned halign = ALIGN(sizeof *wqe + wr->dgrm.ud.hlen, 16);\r
+       void * ds;\r
+       /*\r
+       * This is a temporary limitation and will be removed in\r
+        a forthcoming FW release:\r
+       */\r
+       if (unlikely(halign > 64))\r
+               return -EINVAL;\r
+\r
+       if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&\r
+               wr->num_ds > qp->sq.max_gs - (halign >> 4)))\r
+               return -EINVAL;\r
+       *lso_seg_len = halign;\r
+        ds =  (u8 *) (void *) wqe + halign;\r
+       \r
+       //TODO: use memcpy from physical/virtual addr we can get directly from the ipoib at first data segmentmemcpy(wqe->header, , );\r
+       memcpy(wqe->header, wr->dgrm.ud.header, wr->dgrm.ud.hlen);\r
+       \r
+       /* make sure LSO header is written before overwriting stamping */\r
+       wmb();\r
+\r
+       wqe->mss_hdr_size = cpu_to_be32((wr->dgrm.ud.mss - wr->dgrm.ud.hlen) << 16 |\r
+                                                                       wr->dgrm.ud.hlen);\r
+       \r
+       return 0;\r
+}\r
+\r
+\r
 int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,\r
                      ib_send_wr_t **bad_wr)\r
 {\r
        enum ib_wr_opcode opcode;\r
        struct mlx4_ib_qp *qp = to_mqp(ibqp);\r
-       u8 *wqe;\r
+       u8 *wqe /*, *wqe_start*/;\r
        struct mlx4_wqe_ctrl_seg *ctrl;\r
        struct mlx4_wqe_data_seg *dseg;\r
        unsigned long flags;\r
@@ -1334,7 +1391,9 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,
        int err = 0;\r
        int ind;\r
        int size;\r
+       unsigned seglen;\r
        int i;\r
+       int j = 0;\r
 \r
        if (mlx4_is_barred(ibqp->device->dma_device))\r
                return -EFAULT;\r
@@ -1358,6 +1417,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,
                        goto out;\r
                }\r
 \r
+               /*wqe_start = */\r
                wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));\r
                ctrl = (void*)wqe;\r
                qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;\r
@@ -1420,6 +1480,22 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,
                        set_datagram_seg((void*)wqe, wr);\r
                        wqe  += sizeof (struct mlx4_wqe_datagram_seg);\r
                        size += sizeof (struct mlx4_wqe_datagram_seg) / 16;\r
+                       if (wr->wr_type == WR_LSO) {\r
+                               err = build_lso_seg((struct mlx4_lso_seg *)(void *)wqe, wr, qp, &seglen);\r
+                               if (unlikely(err)) {\r
+                                       *bad_wr = wr;\r
+                                       goto out;\r
+                               }\r
+#define I64_CACHE_LINE                 64\r
+#define OPCODE_INVALID_BIT     6\r
+                               // WQE bug treatment for LSO case, when LSO header is large enough\r
+                               if (unlikely (seglen > I64_CACHE_LINE)) {\r
+                                       ctrl->owner_opcode |= cpu_to_be32 ( 1 << OPCODE_INVALID_BIT);\r
+                               }\r
+                               wqe  += seglen;\r
+                               size += seglen / 16;\r
+                               j=1;\r
+                       }\r
                        break;\r
 \r
                case IB_QPT_SMI:\r
@@ -1491,7 +1567,20 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, ib_send_wr_t *wr,
                ++ind;\r
        }\r
 \r
+//printk("ctrl->srcrb_flags & MLX4_WQE_CTRL_TCP_UDP_CSUM =%d \n", ctrl->srcrb_flags & cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM ));\r
+\r
 out:\r
+//WQE printout\r
+#if 0  \r
+       if (j) {\r
+               u32 *ds = (u32 *) wqe_start;\r
               printk("WQE DUMP:\n");
+               for (j = 0; j < ctrl->fence_size*4; ++j) {\r
+                       printk("%d %08x\n", j,be32_to_cpu(*ds));\r
+                       ++ds;\r
+               }\r
+       }\r
+#endif \r
        if (likely(nreq)) {\r
                qp->sq.head += nreq;\r
 \r
index 09b0c77..3522323 100644 (file)
@@ -194,6 +194,7 @@ struct mlx4_caps {
        u32                     flags;
        u16                     stat_rate_support;
        u8                      port_width_cap[MLX4_MAX_PORTS + 1];
+       int                     max_gso_sz;
        int                     reserved_qps_cnt[MLX4_QP_REGION_COUNT];
        int                     reserved_qps_base[MLX4_QP_REGION_COUNT];
        int                     log_num_macs;
index e2512d9..da886ff 100644 (file)
@@ -88,7 +88,8 @@ enum ib_device_cap_flags {
        IB_DEVICE_ZERO_STAG                     = (1<<15),\r
        IB_DEVICE_SEND_W_INV            = (1<<16),\r
        IB_DEVICE_MEM_WINDOW            = (1<<17),\r
-       IB_DEVICE_IPOIB_CSUM            = (1<<18)\r
+       IB_DEVICE_IPOIB_CSUM            = (1<<18),\r
+       IB_DEVICE_UD_TSO                        = (1<<19)\r
 };\r
 \r
 enum ib_atomic_cap {\r
@@ -413,6 +414,7 @@ enum ib_wc_opcode {
        XIB_WC_COMP_SWAP,\r
        XIB_WC_FETCH_ADD,\r
        XIB_WC_BIND_MW,\r
+       XIB_WC_LSO, //for Linux compatibility\r
 /*\r
  * Set value of XIB_WC_RECV so consumers can test if a completion is a\r
  * receive by testing (opcode & XIB_WC_RECV).\r
@@ -497,15 +499,20 @@ enum ib_qp_type {
        IB_QPT_RAW_ETY\r
 };\r
 \r
+enum ib_qp_create_flags {\r
+       IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0,\r
+};\r
+\r
 struct ib_qp_init_attr {\r
        void                  (*event_handler)(ib_event_rec_t *);\r
        void                   *qp_context;\r
        struct ib_cq           *send_cq;\r
        struct ib_cq           *recv_cq;\r
        struct ib_srq          *srq;\r
-       struct ib_qp_cap        cap;\r
-       enum ib_sig_type        sq_sig_type;\r
-       enum ib_qp_type         qp_type;\r
+       struct ib_qp_cap                cap;\r
+       enum ib_sig_type                sq_sig_type;\r
+       enum ib_qp_type                 qp_type;\r
+       enum ib_qp_create_flags create_flags;\r
        u8                      port_num; /* special QP types only */\r
 };\r
 \r
@@ -619,7 +626,8 @@ enum ib_wr_opcode {
        IB_WR_SEND_WITH_IMM,\r
        IB_WR_RDMA_READ,\r
        IB_WR_ATOMIC_CMP_AND_SWP,\r
-       IB_WR_ATOMIC_FETCH_AND_ADD\r
+       IB_WR_ATOMIC_FETCH_AND_ADD,\r
+       IB_WR_LSO\r
 };\r
 \r
 enum ib_send_flags {\r
@@ -656,10 +664,13 @@ struct ib_send_wr {
                } atomic;\r
                struct {\r
                        struct ib_ah *ah;\r
-                       u32     remote_qpn;\r
-                       u32     remote_qkey;\r
-                       u16     pkey_index; /* valid for GSI only */\r
-                       u8      port_num;   /* valid for DR SMPs on switch only */\r
+                       void    *header;\r
+                       int             hlen;\r
+                       int             mss;\r
+                       u32             remote_qpn;\r
+                       u32             remote_qkey;\r
+                       u16             pkey_index; /* valid for GSI only */\r
+                       u8              port_num;   /* valid for DR SMPs on switch only */\r
                } ud;\r
        } wr;\r
 };\r
index fb662c4..4fbe077 100644 (file)
@@ -216,6 +216,14 @@ struct mlx4_wqe_datagram_seg {
        __be32                  reservd[2];
 };
 
+#pragma warning( disable : 4200)
+struct mlx4_lso_seg {
+__be32                         mss_hdr_size;
+__be32                         header[0];
+};
+#pragma warning( default : 4200)
+
+
 struct mlx4_wqe_bind_seg {
        __be32                  flags1;
        __be32                  flags2;
index 0e792b1..04f9ff5 100644 (file)
@@ -140,6 +140,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_MAX_AV_OFFSET            0x27
 #define QUERY_DEV_CAP_MAX_REQ_QP_OFFSET                0x29
 #define QUERY_DEV_CAP_MAX_RES_QP_OFFSET                0x2b
+#define QUERY_DEV_CAP_MAX_GSO_OFFSET           0x2d
 #define QUERY_DEV_CAP_MAX_RDMA_OFFSET          0x2f
 #define QUERY_DEV_CAP_RSZ_SRQ_OFFSET           0x33
 #define QUERY_DEV_CAP_ACK_DELAY_OFFSET         0x35
@@ -222,6 +223,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
        dev_cap->max_requester_per_qp = 1 << (field & 0x3f);
        MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RES_QP_OFFSET);
        dev_cap->max_responder_per_qp = 1 << (field & 0x3f);
+       MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_GSO_OFFSET);
+       field &= 0x1f;
+       if (!field)
+               dev_cap->max_gso_sz = 0;
+       else
+               dev_cap->max_gso_sz = 1 << field;
        MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_RDMA_OFFSET);
        dev_cap->max_rdma_global = 1 << (field & 0x3f);
        MLX4_GET(field, outbox, QUERY_DEV_CAP_ACK_DELAY_OFFSET);
@@ -394,6 +401,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
                 dev_cap->max_sq_desc_sz, dev_cap->max_sq_sg);
        mlx4_dbg(dev, "Max RQ desc size: %d, max RQ S/G: %d\n",
                 dev_cap->max_rq_desc_sz, dev_cap->max_rq_sg);
+       mlx4_dbg(dev, "Max GSO size: %d\n", dev_cap->max_gso_sz);
 
        dump_dev_cap_flags(dev, dev_cap->flags);
 
index 38b97a6..f5113e6 100644 (file)
@@ -98,6 +98,7 @@ struct mlx4_dev_cap {
        u8  bmme_flags;
        u32 reserved_lkey;
        u64 max_icm_sz;
+       int max_gso_sz;
        u8  supported_port_types[MLX4_MAX_PORTS + 1];
        u8  log_max_macs[MLX4_MAX_PORTS + 1];
        u8  log_max_vlans[MLX4_MAX_PORTS + 1];
index a56f95e..28ca217 100644 (file)
@@ -222,6 +222,7 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
        dev->caps.page_size_cap      = ~(u32) (dev_cap->min_page_sz - 1);
        dev->caps.flags              = dev_cap->flags;
        dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
+       dev->caps.max_gso_sz         = dev_cap->max_gso_sz;
 
        dev->caps.log_num_macs  = ilog2(roundup_pow_of_two
                                        (g.mod_num_mac + 1));
index 1a91125..71d3169 100644 (file)
@@ -140,6 +140,7 @@ __create_qp (
        qp_init_attr.sq_sig_type = (p_create_attr->sq_signaled) ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;\r
        qp_init_attr.qp_type = to_qp_type(p_create_attr->qp_type);\r
        qp_init_attr.port_num = port_num;\r
+       qp_init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;\r
 \r
        // create qp            \r
        p_ib_qp = ibv_create_qp( p_ib_pd, &qp_init_attr, p_uctx, p_umv_buf );\r