[MTHCA] Improve the handling time of events such as port state change
hw/mthca/kernel/mthca_provider.c
1 /* 
2  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4  * Copyright (c) 2005 Cisco Systems. All rights reserved.
5  * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
6  * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
7  *
8  * This software is available to you under a choice of one of two
9  * licenses.  You may choose to be licensed under the terms of the GNU
10  * General Public License (GPL) Version 2, available from the file
11  * COPYING in the main directory of this source tree, or the
12  * OpenIB.org BSD license below:
13  *
14  *     Redistribution and use in source and binary forms, with or
15  *     without modification, are permitted provided that the following
16  *     conditions are met:
17  *
18  *      - Redistributions of source code must retain the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer.
21  *
22  *      - Redistributions in binary form must reproduce the above
23  *        copyright notice, this list of conditions and the following
24  *        disclaimer in the documentation and/or other materials
25  *        provided with the distribution.
26  *
27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
31  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
32  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
33  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34  * SOFTWARE.
35  *
36  * $Id: mthca_provider.c 3047 2005-08-10 03:59:35Z roland $
37  */
38
39 #include <ib_smi.h>
40
41 #include "mx_abi.h"
42 #include "mthca_dev.h"
43
44 #if defined(EVENT_TRACING)
45 #ifdef offsetof
46 #undef offsetof
47 #endif
48 #include "mthca_provider.tmh"
49 #endif
50 #include "mthca_cmd.h"
51 #include "mthca_memfree.h"
52
53  void ibv_umem_release(struct ib_device *dev, struct ib_umem *umem);
54  int ibv_umem_get(struct ib_device *dev, struct ib_umem *mem,
55                  void *addr, size_t size, int write);
56  
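 /*
  * Build the common SubnGet() SMP header; callers fill in attr_id (and
  * attr_mod where needed) afterwards, e.g. IB_SMP_ATTR_PORT_INFO with
  * attr_mod = port in mthca_query_port() below.
  */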
57  static void init_query_mad(struct ib_smp *mad)
58  {
59          mad->base_version  = 1;
60          mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
61          mad->class_version = 1;
62          mad->method        = IB_MGMT_METHOD_GET;
63  }
64
65  int mthca_query_device(struct ib_device *ibdev,
66                               struct ib_device_attr *props)
67 {
68         struct ib_smp *in_mad  = NULL;
69         struct ib_smp *out_mad = NULL;
70         int err = -ENOMEM;
71         struct mthca_dev* mdev = to_mdev(ibdev);
72
73         u8 status;
74
75         in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
76         out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
77         if (!in_mad || !out_mad)
78                 goto out;
79
80         init_query_mad(in_mad);
81         in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
82
83         err = mthca_MAD_IFC(mdev, 1, 1,
84             1, NULL, NULL, in_mad, out_mad, &status);
85         if (err)
86                 goto out;
87         if (status) {
88                 err = -EINVAL;
89                 goto out;
90         }
91
92         RtlZeroMemory(props, sizeof *props);
93         props->fw_ver              = mdev->fw_ver;
94         props->device_cap_flags    = mdev->device_cap_flags;
95         props->vendor_id           = cl_ntoh32(*(__be32 *) (out_mad->data + 36)) &
96                 0xffffff;
97         props->vendor_part_id      = cl_ntoh16(*(__be16 *) (out_mad->data + 30));
98         props->hw_ver              = cl_ntoh32(*(__be32 *) (out_mad->data + 32));
99         memcpy(&props->sys_image_guid, out_mad->data +  4, 8);
100
101         props->max_mr_size         = ~0ull;
102         props->page_size_cap       = mdev->limits.page_size_cap;
103         props->max_qp              = mdev->limits.num_qps - mdev->limits.reserved_qps;
104         props->max_qp_wr           = mdev->limits.max_wqes;
105         props->max_sge             = mdev->limits.max_sg;
106         props->max_cq              = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
107         props->max_cqe             = mdev->limits.max_cqes;
108         props->max_mr              = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
109         props->max_pd              = mdev->limits.num_pds - mdev->limits.reserved_pds;
110         props->max_qp_rd_atom      = 1 << mdev->qp_table.rdb_shift;
111         props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma;
112         props->max_res_rd_atom     = props->max_qp_rd_atom * props->max_qp;
113         props->max_srq             = mdev->limits.num_srqs - mdev->limits.reserved_srqs;
114         props->max_srq_wr          = mdev->limits.max_srq_wqes;
115         props->max_srq_sge         = mdev->limits.max_sg;
116         props->local_ca_ack_delay  = (u8)mdev->limits.local_ca_ack_delay;
117         props->atomic_cap          = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ? 
118                                         IB_ATOMIC_LOCAL : IB_ATOMIC_NONE;
119         props->max_pkeys           = (u16)mdev->limits.pkey_table_len;
120         props->max_mcast_grp       = mdev->limits.num_mgms + mdev->limits.num_amgms;
121         props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
122         props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * 
123                                            props->max_mcast_grp;
124
125         err = 0;
126  out:
127         kfree(in_mad);
128         kfree(out_mad);
129         return err;
130 }
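
/*
 * Note on the fixed offsets above: they index into the NodeInfo payload
 * returned in out_mad->data.  As an illustration (values are made up),
 * a payload carrying 0x0002C9 in its 24-bit vendor-ID field and 0x5A44
 * as the device ID yields vendor_id = 0x0002C9 and vendor_part_id =
 * 0x5A44 after the byte swaps and masks above; sys_image_guid is copied
 * verbatim from out_mad->data + 4.
 */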
131
132 int mthca_query_port(struct ib_device *ibdev,
133                             u8 port, struct ib_port_attr *props)
134 {
135         struct ib_smp *in_mad  = NULL;
136         struct ib_smp *out_mad = NULL;
137         int err = -ENOMEM;
138         u8 status;
139
140         in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
141         out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
142         if (!in_mad || !out_mad)
143                 goto out;
144
145         init_query_mad(in_mad);
146         in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
147         in_mad->attr_mod = cl_hton32(port);
148
149         err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
150                             port, NULL, NULL, in_mad, out_mad,
151                             &status);
152         if (err)
153                 goto out;
154         if (status) {
155                 err = -EINVAL;
156                 goto out;
157         }
158
159         RtlZeroMemory(props, sizeof *props);
160         props->lid               = cl_ntoh16(*(__be16 *) (out_mad->data + 16));
161         props->lmc               = out_mad->data[34] & 0x7;
162         props->sm_lid            = cl_ntoh16(*(__be16 *) (out_mad->data + 18));
163         props->sm_sl             = out_mad->data[36] & 0xf;
164         props->state             = out_mad->data[32] & 0xf;
165         props->phys_state        = out_mad->data[33] >> 4;
166         props->port_cap_flags    = cl_ntoh32(*(__be32 *) (out_mad->data + 20));
167         props->gid_tbl_len       = to_mdev(ibdev)->limits.gid_table_len;
168         props->max_msg_sz        = 0x80000000;
169         props->pkey_tbl_len      = (u16)to_mdev(ibdev)->limits.pkey_table_len;
170         props->bad_pkey_cntr     = cl_ntoh16(*(__be16 *) (out_mad->data + 46));
171         props->qkey_viol_cntr    = cl_ntoh16(*(__be16 *) (out_mad->data + 48));
172         props->active_width      = out_mad->data[31] & 0xf;
173         props->active_speed      = out_mad->data[35] >> 4;
174         props->max_mtu           = out_mad->data[41] & 0xf;
175         props->active_mtu        = out_mad->data[36] >> 4;
176         props->subnet_timeout    = out_mad->data[51] & 0x1f;
177
178  out:
179         kfree(in_mad);
180         kfree(out_mad);
181         return err;
182 }
183
184 int mthca_modify_port(struct ib_device *ibdev,
185                              u8 port, int port_modify_mask,
186                              struct ib_port_modify *props)
187 {
188         struct mthca_set_ib_param set_ib;
189         struct ib_port_attr attr;
190         int err;
191         u8 status;
192
193         if (down_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
194                 return -EFAULT;
195
196         err = mthca_query_port(ibdev, port, &attr);
197         if (err)
198                 goto out;
199
200         set_ib.set_si_guid     = 0;
201         set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
202
203         set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
204                 ~props->clr_port_cap_mask;
205
206         err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port, &status);
207         if (err)
208                 goto out;
209         if (status) {
210                 err = -EINVAL;
211                 goto out;
212         }
213
214 out:
215         up(&to_mdev(ibdev)->cap_mask_mutex);
216         return err;
217 }
218
219 int mthca_query_pkey_chunk(struct ib_device *ibdev,
220                             u8 port, u16 index, u16 pkey[32])
221 {
222         struct ib_smp *in_mad  = NULL;
223         struct ib_smp *out_mad = NULL;
224         int err = -ENOMEM;
225         u8 status;
226
227         in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
228         out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
229         if (!in_mad || !out_mad)
230                 goto out;
231
232         init_query_mad(in_mad);
233         in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
234         in_mad->attr_mod = cl_hton32(index / 32);
235
236         err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
237                             port, NULL, NULL, in_mad, out_mad,
238                             &status);
239         if (err)
240                 goto out;
241         if (status) {
242                 err = -EINVAL;
243                 goto out;
244         }
245
246         { // copy the results
247                 int i;
248                 __be16 *pkey_chunk = (__be16 *)out_mad->data;
249                 for (i=0; i<32; ++i) 
250                         pkey[i] = cl_ntoh16(pkey_chunk[i]);
251         }
252
253  out:
254         kfree(in_mad);
255         kfree(out_mad);
256         return err;
257 }
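
/*
 * Chunking example: the P_Key table is read 32 entries at a time, so a
 * request for table index 40 sends attr_mod = 40 / 32 = 1 and the 32
 * entries returned in pkey[] correspond to table indices 32..63.
 */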
258
259 int mthca_query_gid_chunk(struct ib_device *ibdev, u8 port,
260                            int index, union ib_gid gid[8])
261 {
262         struct ib_smp *in_mad  = NULL;
263         struct ib_smp *out_mad = NULL;
264         int err = -ENOMEM;
265         u8 status;
266         __be64  subnet_prefix;
267
268         in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
269         out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
270         if (!in_mad || !out_mad)
271                 goto out;
272
273         init_query_mad(in_mad);
274         in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
275         in_mad->attr_mod = cl_hton32(port);
276
277         err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
278                             port, NULL, NULL, in_mad, out_mad,
279                             &status);
280         if (err)
281                 goto out;
282         if (status) {
283                 err = -EINVAL;
284                 goto out;
285         }
286
287         memcpy(&subnet_prefix, out_mad->data + 8, 8);
288
289         init_query_mad(in_mad);
290         in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
291         in_mad->attr_mod = cl_hton32(index / 8);
292
293         err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
294                             port, NULL, NULL, in_mad, out_mad,
295                             &status);
296         if (err)
297                 goto out;
298         if (status) {
299                 err = -EINVAL;
300                 goto out;
301         }
302
303         { // copy the results
304                 int i;
305                 __be64 *guid = (__be64 *)out_mad->data;
306                 for (i=0; i<8; ++i) {
307                         gid[i].global.subnet_prefix = subnet_prefix;
308                         gid[i].global.interface_id = guid[i];
309                 }
310         }
311
312  out:
313         kfree(in_mad);
314         kfree(out_mad);
315         return err;
316 }
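
/*
 * Each GID is assembled from two queries: the 64-bit subnet prefix comes
 * from PortInfo (out_mad->data + 8) and the interface ID from the
 * GuidInfo block selected by attr_mod = index / 8.  E.g. index = 10
 * fetches GuidInfo block 1, so gid[0..7] describe GUID table entries
 * 8..15 of the port.
 */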
317
318 struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
319                                                 ci_umv_buf_t* const     p_umv_buf)
320 {
321         struct mthca_alloc_ucontext_resp uresp;
322         struct mthca_ucontext           *context;
323         int                              err;
324
325         RtlZeroMemory(&uresp, sizeof uresp);
326
327         uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
328         if (mthca_is_memfree(to_mdev(ibdev)))
329                 uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size;
330         else
331                 uresp.uarc_size = 0;
332
333         context = kzalloc(sizeof *context, GFP_KERNEL);
334         if (!context) {
335                 err = -ENOMEM;
336                 goto err_nomem;
337         }
338         
339         err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
340         if (err) 
341                 goto err_uar_alloc;
342
343         /*
344          * Map the UAR to user space: first map it into the kernel,
345          * then build an MDL and map it into the calling process.
346          */
347         /* map UAR to kernel */
348         context->kva = ioremap(context->uar.pfn << PAGE_SHIFT, PAGE_SIZE,&context->uar_size);
349         if (!context->kva) {
350                 HCA_PRINT( TRACE_LEVEL_ERROR, HCA_DBG_LOW ,("Couldn't map kernel access region, aborting.\n") );
351                 err = -ENOMEM;
352                 goto err_ioremap;
353         }
354
355         /* build MDL */
356         context->mdl = IoAllocateMdl( context->kva, (ULONG)context->uar_size,
357                 FALSE, TRUE, NULL );
358         if( !context->mdl ) {
359                 err = -ENOMEM;
360                 goto err_alloc_mdl;
361         }
362         MmBuildMdlForNonPagedPool( context->mdl );
363
364         /* Map the memory into the calling process's address space. */
365         __try   {
366                 context->ibucontext.user_uar = MmMapLockedPagesSpecifyCache( context->mdl,
367                         UserMode, MmNonCached, NULL, FALSE, NormalPagePriority );
368         }
369         __except(EXCEPTION_EXECUTE_HANDLER) {
370                 err = -EACCES;
371                 goto err_map;
372         }
373
374         /* user_db_tab */
375         context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
376         if (IS_ERR(context->db_tab)) {
377                 err = PTR_ERR(context->db_tab);
378                 goto err_init_user;
379         }
380
381         err = ib_copy_to_umv_buf(p_umv_buf, &uresp, sizeof uresp);
382         if (err) 
383                 goto err_copy_to_umv_buf;
384
385         context->ibucontext.device = ibdev;
386         
387         atomic_set(&context->ibucontext.usecnt, 0);
388         return &context->ibucontext;
389
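/* Error unwinding: release everything acquired above in reverse order. */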
390 err_copy_to_umv_buf:
391         mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar,
392                 context->db_tab);
393 err_init_user:  
394         MmUnmapLockedPages( context->ibucontext.user_uar, context->mdl );
395 err_map:
396         IoFreeMdl(context->mdl);
397 err_alloc_mdl:  
398         iounmap(context->kva, PAGE_SIZE);
399 err_ioremap:    
400         mthca_uar_free(to_mdev(ibdev), &context->uar);
401 err_uar_alloc:
402         kfree(context);
403 err_nomem:      
404         return ERR_PTR(err);
405 }
406
407  int mthca_dealloc_ucontext(struct ib_ucontext *context)
408 {
409          struct mthca_ucontext                                   *mucontext = to_mucontext(context);
410
411         mthca_cleanup_user_db_tab(to_mdev(context->device), &mucontext->uar,
412                                   mucontext->db_tab);
413         MmUnmapLockedPages( mucontext->ibucontext.user_uar, mucontext->mdl );
414         IoFreeMdl(mucontext->mdl);
415         iounmap(mucontext->kva, PAGE_SIZE);
416         mthca_uar_free(to_mdev(context->device), &mucontext->uar);
417         kfree(mucontext);
418         
419         return 0;
420 }
421
422 struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
423                                     struct ib_ucontext *context,
424                                     ci_umv_buf_t* const                 p_umv_buf)
425 {
426         int err;
427         struct mthca_pd *pd;
428         struct ibv_alloc_pd_resp resp;
429
430         /* sanity check */
431         if (p_umv_buf && p_umv_buf->command) {
432                 if (p_umv_buf->output_size < sizeof(struct ibv_alloc_pd_resp)) {
433                         err = -EINVAL;
434                         goto err_param;
435                 }
436         }
437         
438         pd = kmalloc(sizeof *pd, GFP_KERNEL);
439         if (!pd) {
440                 err = -ENOMEM;
441                 goto err_mem;
442         }
443
444         err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
445         if (err) {
446                 goto err_pd_alloc;
447         }
448
449         if (p_umv_buf && p_umv_buf->command) {
450                 resp.pd_handle = (u64)(UINT_PTR)pd;
451                 resp.pdn = pd->pd_num;
452                 if (ib_copy_to_umv_buf(p_umv_buf, &resp, sizeof(struct ibv_alloc_pd_resp))) {
453                         err = -EFAULT;
454                         goto err_copy;
455                 }
456         }
457
458         return &pd->ibpd;
459
460 err_copy:       
461         mthca_pd_free(to_mdev(ibdev), pd);
462 err_pd_alloc:
463         kfree(pd);
464 err_mem:
465 err_param:
466         return ERR_PTR(err);
467 }
468
469 int mthca_dealloc_pd(struct ib_pd *pd)
470 {
471         mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
472         kfree(pd);
473
474         return 0;
475 }
476
477 struct ib_ah *mthca_ah_create(struct ib_pd *pd,
478                                      struct ib_ah_attr *ah_attr)
479 {
480         int err;
481         struct mthca_ah *ah;
482
483         ah = kzalloc(sizeof *ah, GFP_ATOMIC);
484         if (!ah)
485                 return ERR_PTR(-ENOMEM);
486
487         err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
488         if (err) {
489                 kfree(ah);
490                 return ERR_PTR(err);
491         }
492
493         return &ah->ibah;
494 }
495
496 int mthca_ah_destroy(struct ib_ah *ah)
497 {
498         mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
499         kfree(ah);
500
501         return 0;
502 }
503
504 struct ib_srq *mthca_create_srq(struct ib_pd *pd,
505                                        struct ib_srq_init_attr *init_attr,
506                                        ci_umv_buf_t* const                      p_umv_buf)
507 {
508 #ifdef WIN_TO_BE_CHANGED
509         struct mthca_create_srq ucmd;
510         struct mthca_ucontext *context = NULL;
511         struct mthca_srq *srq;
512         int err;
513
514         srq = kmalloc(sizeof *srq, GFP_KERNEL);
515         if (!srq)
516                 return ERR_PTR(-ENOMEM);
517
518         if (pd->ucontext) {
519                 context = to_mucontext(pd->ucontext);
520
521                 if (ib_copy_from_umv_buf(&ucmd, p_umv_buf, sizeof ucmd)) {
522                         err = -EFAULT;
523                         goto err_free;
524                 }
525                 err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
526                                         context->db_tab, ucmd.db_index,
527                                         ucmd.db_page);
528
529                 if (err)
530                         goto err_free;
531
532                 srq->mr.ibmr.lkey = ucmd.lkey;
533                 srq->db_index     = ucmd.db_index;
534         }
535
536         err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
537                               &init_attr->attr, srq);
538
539         if (err && pd->ucontext)
540                 mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
541                                     context->db_tab, ucmd.db_index);
542
543         if (err)
544                 goto err_free;
545
546         if (context && ib_copy_to_umv_buf(p_umv_buf, &srq->srqn, sizeof (u32))) {
547                 mthca_free_srq(to_mdev(pd->device), srq);
548                 err = -EFAULT;
549                 goto err_free;
550         }
551
552         return &srq->ibsrq;
553
554 err_free:
555         kfree(srq);
556
557         return ERR_PTR(err);
558 #else
559         UNREFERENCED_PARAMETER(p_umv_buf);
560         UNREFERENCED_PARAMETER(init_attr);
561         UNREFERENCED_PARAMETER(pd);
562         return NULL;
563 #endif
564 }
565
566 int mthca_destroy_srq(struct ib_srq *srq)
567 {
568         struct mthca_ucontext *context;
569
570         if (srq->uobject) {
571                 context = to_mucontext(srq->uobject->context);
572
573                 mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
574                                     context->db_tab, to_msrq(srq)->db_index);
575         }
576
577         mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
578         kfree(srq);
579
580         return 0;
581 }
582
583 struct ib_qp *mthca_create_qp(struct ib_pd *pd,
584                                      struct ib_qp_init_attr *init_attr,
585                                       ci_umv_buf_t* const                       p_umv_buf)
586 {
587         struct ibv_create_qp ucmd = {0};
588         struct mthca_qp *qp = NULL;
589         struct mthca_ucontext *context = NULL;
590         int err;
591
592         switch (init_attr->qp_type) {
593         case IB_QPT_RELIABLE_CONN:
594         case IB_QPT_UNRELIABLE_CONN:
595         case IB_QPT_UNRELIABLE_DGRM:
596         {
597
598                 qp = kmalloc(sizeof *qp, GFP_KERNEL);
599                 if (!qp) {
600                         err = -ENOMEM;
601                         goto err_mem;
602                 }
603
604                 if (pd->ucontext) {
605                         context = to_mucontext(pd->ucontext);
606
607                         if (ib_copy_from_umv_buf(&ucmd, p_umv_buf, sizeof ucmd)) {
608                                 err = -EFAULT;
609                                 goto err_copy;
610                         }
611
612                         err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
613                                                 context->db_tab,
614                                                 ucmd.sq_db_index, ucmd.sq_db_page);
615                         if (err) 
616                                 goto err_map1;
617
618                         err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
619                                                 context->db_tab,
620                                                 ucmd.rq_db_index, ucmd.rq_db_page);
621                         if (err) 
622                                 goto err_map2;
623
624                         qp->mr.ibmr.lkey = ucmd.lkey;
625                         qp->sq.db_index  = ucmd.sq_db_index;
626                         qp->rq.db_index  = ucmd.rq_db_index;
627                 }
628
629                 err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
630                                      to_mcq(init_attr->send_cq),
631                                      to_mcq(init_attr->recv_cq),
632                                      init_attr->qp_type, init_attr->sq_sig_type,
633                                      &init_attr->cap, qp);
634
635                 if (err) {
636                         if (pd->ucontext)
637                                 goto err_alloc_qp_user;
638                         else
639                                 goto err_copy;
640                 }
641                 qp->ibqp.qp_num = qp->qpn;
642                 break;
643         }
644         case IB_QPT_QP0:
645         case IB_QPT_QP1:
646         {
647                 /* Don't allow userspace to create special QPs */
648                 if (pd->ucontext) {
649                         err = -EINVAL;
650                         goto err_inval;
651                 }
652
653                 qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
654                 if (!qp) {
655                         err = -ENOMEM;
656                         goto err_mem;
657                 }
658
659                 qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_QP0 ? 0 : 1;
660
661                 err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
662                                       to_mcq(init_attr->send_cq),
663                                       to_mcq(init_attr->recv_cq),
664                                       init_attr->sq_sig_type, &init_attr->cap,
665                                       qp->ibqp.qp_num, init_attr->port_num,
666                                       to_msqp(qp));
667                 if (err)
668                         goto err_alloc_sqp;
669                 
670                 break;
671         }
672         default:
673                 /* Don't support raw QPs */
674                 err = -ENOSYS;
675                 goto err_unsupported;
676         }
677
678         init_attr->cap.max_send_wr     = qp->sq.max;
679         init_attr->cap.max_recv_wr     = qp->rq.max;
680         init_attr->cap.max_send_sge    = qp->sq.max_gs;
681         init_attr->cap.max_recv_sge    = qp->rq.max_gs;
682         init_attr->cap.max_inline_data    = qp->max_inline_data;
683
684         return &qp->ibqp;
685
686                 
687 err_alloc_qp_user:
688         if (pd->ucontext) 
689                 mthca_unmap_user_db(to_mdev(pd->device),
690                         &context->uar, context->db_tab, ucmd.rq_db_index);
691 err_map2:
692         if (pd->ucontext) 
693                 mthca_unmap_user_db(to_mdev(pd->device),
694                         &context->uar, context->db_tab, ucmd.sq_db_index);
695 err_map1: err_copy: err_alloc_sqp:
696         if (qp)
697                 kfree(qp);
698 err_mem: err_inval:     err_unsupported:
699         return ERR_PTR(err);
700 }
701
702 int mthca_destroy_qp(struct ib_qp *qp)
703 {
704         if (qp->ucontext) {
705                 mthca_unmap_user_db(to_mdev(qp->device),
706                                     &to_mucontext(qp->ucontext)->uar,
707                                     to_mucontext(qp->ucontext)->db_tab,
708                                     to_mqp(qp)->sq.db_index);
709                 mthca_unmap_user_db(to_mdev(qp->device),
710                                     &to_mucontext(qp->ucontext)->uar,
711                                     to_mucontext(qp->ucontext)->db_tab,
712                                     to_mqp(qp)->rq.db_index);
713         }
714         mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
715         kfree(qp);
716         return 0;
717 }
718
719 struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
720                                      struct ib_ucontext *context,
721                                      ci_umv_buf_t* const                        p_umv_buf)
722 {
723         struct ibv_create_cq ucmd = {0};
724         struct mthca_cq *cq;
725         int nent;
726         int err;
727
728         if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)   
729                 return ERR_PTR(-EINVAL);
730
731         if (context) {
732                 if (ib_copy_from_umv_buf(&ucmd, p_umv_buf, sizeof ucmd))
733                         return ERR_PTR(-EFAULT);
734
735                 err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
736                                         to_mucontext(context)->db_tab,
737                                         ucmd.set_db_index, ucmd.set_db_page);
738                 if (err)
739                         return ERR_PTR(err);
740
741                 err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
742                                         to_mucontext(context)->db_tab,
743                                         ucmd.arm_db_index, ucmd.arm_db_page);
744                 if (err)
745                         goto err_unmap_set;
746         }
747
748         cq = kmalloc(sizeof *cq, GFP_KERNEL);
749         if (!cq) {
750                 err = -ENOMEM;
751                 goto err_unmap_arm;
752         }
753
754         if (context) {
755                 cq->mr.ibmr.lkey = ucmd.lkey;
756                 cq->set_ci_db_index = ucmd.set_db_index;
757                 cq->arm_db_index    = ucmd.arm_db_index;
758         }
759
760         for (nent = 1; nent <= entries; nent <<= 1)
761                 ; /* nothing */
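        /*
         * Worked example: the loop rounds nent up to the next power of
         * two strictly greater than entries, e.g. entries = 100 gives
         * nent = 128 and entries = 128 gives nent = 256.
         */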
762
763         err = mthca_init_cq(to_mdev(ibdev), nent, 
764                             context ? to_mucontext(context) : NULL,
765                             context ? ucmd.mr.pdn : to_mdev(ibdev)->driver_pd.pd_num,
766                             cq);
767         if (err)
768                 goto err_free;
769
770         if (context ) {
771                 struct ibv_create_cq_resp *create_cq_resp = (struct ibv_create_cq_resp *)(void*)p_umv_buf->p_inout_buf;
772                 create_cq_resp->cqn = cq->cqn;
773         }
774
775         HCA_PRINT( TRACE_LEVEL_INFORMATION, HCA_DBG_LOW ,
776                 ("uctx %p, cq_hndl %p, cq_num %#x, cqe  %#x\n",
777                 context, &cq->ibcq, cq->cqn, cq->ibcq.cqe ) );
778         
779         return &cq->ibcq;
780
781 err_free:
782         kfree(cq);
783
784 err_unmap_arm:
785         if (context)
786                 mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
787                                     to_mucontext(context)->db_tab, ucmd.arm_db_index);
788
789 err_unmap_set:
790         if (context)
791                 mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
792                                     to_mucontext(context)->db_tab, ucmd.set_db_index);
793
794         return ERR_PTR(err);
795 }
796
797 int mthca_destroy_cq(struct ib_cq *cq)
798 {
799         if (cq->ucontext) {
800                 mthca_unmap_user_db(to_mdev(cq->device),
801                                     &to_mucontext(cq->ucontext)->uar,
802                                     to_mucontext(cq->ucontext)->db_tab,
803                                     to_mcq(cq)->arm_db_index);
804                 mthca_unmap_user_db(to_mdev(cq->device),
805                                     &to_mucontext(cq->ucontext)->uar,
806                                     to_mucontext(cq->ucontext)->db_tab,
807                                     to_mcq(cq)->set_ci_db_index);
808         }
809         mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
810         kfree(cq);
811
812         return 0;
813 }
814
815 static
816 mthca_mpt_access_t
817 map_qp_mpt(
818         IN                              mthca_qp_access_t                               qp_acl)
819 {
820 #define ACL_MTHCA(mfl,ifl) if (qp_acl & mfl)   mpt_acl |= ifl
821         mthca_mpt_access_t mpt_acl = 0;
822
823         ACL_MTHCA(MTHCA_ACCESS_REMOTE_READ,MTHCA_MPT_FLAG_REMOTE_READ);
824         ACL_MTHCA(MTHCA_ACCESS_REMOTE_WRITE,MTHCA_MPT_FLAG_REMOTE_WRITE);
825         ACL_MTHCA(MTHCA_ACCESS_REMOTE_ATOMIC,MTHCA_MPT_FLAG_ATOMIC);
826         ACL_MTHCA(MTHCA_ACCESS_LOCAL_WRITE,MTHCA_MPT_FLAG_LOCAL_WRITE);
827
828         return (mpt_acl | MTHCA_MPT_FLAG_LOCAL_READ);
829 }
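
/*
 * Mapping example: acc = MTHCA_ACCESS_LOCAL_WRITE | MTHCA_ACCESS_REMOTE_READ
 * yields MTHCA_MPT_FLAG_LOCAL_WRITE | MTHCA_MPT_FLAG_REMOTE_READ |
 * MTHCA_MPT_FLAG_LOCAL_READ, since local read access is always granted.
 */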
830
831 struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, mthca_qp_access_t acc)
832 {
833         struct mthca_mr *mr;
834         int err;
835
836         mr = kmalloc(sizeof *mr, GFP_KERNEL);
837         if (!mr)
838                 return ERR_PTR(-ENOMEM);
839         RtlZeroMemory(mr, sizeof *mr);
840
841         err = mthca_mr_alloc_notrans(to_mdev(pd->device),
842                                      to_mpd(pd)->pd_num,
843                                      map_qp_mpt(acc), mr);
844
845         if (err) {
846                 kfree(mr);
847                 return ERR_PTR(err);
848         }
849
850         return &mr->ibmr;
851 }
852
853 struct ib_mr *mthca_reg_phys_mr(struct ib_pd       *pd,
854                                        struct ib_phys_buf *buffer_list,
855                                        int                 num_phys_buf,
856                                        mthca_qp_access_t                 acc,
857                                        u64                *iova_start)
858 {
859         struct mthca_mr *mr;
860         u64 *page_list;
861         u64 total_size;
862         u64 mask;
863         int shift;
864         int npages;
865         int err;
866         int i, j, n;
867
868         /* First check that we have enough alignment */
869         if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK))
870                 return ERR_PTR(-EINVAL);
871
872         if (num_phys_buf > 1 &&
873             ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK))
874                 return ERR_PTR(-EINVAL);
875
876         mask = 0;
877         total_size = 0;
878         for (i = 0; i < num_phys_buf; ++i) {
879                 if (i != 0)
880                         mask |= buffer_list[i].addr;
881                 if (i != num_phys_buf - 1)
882                         mask |= buffer_list[i].addr + buffer_list[i].size;
883
884                 total_size += buffer_list[i].size;
885         }
886
887         if (mask & ~PAGE_MASK)
888                 return ERR_PTR(-EINVAL);
889
890         /* Find largest page shift we can use to cover buffers */
891         for (shift = PAGE_SHIFT; shift < 31; ++shift)
892                 if (num_phys_buf > 1) {
893                         if ((1ULL << shift) & mask)
894                                 break;
895                 } else {
896                         if (1ULL << shift >=
897                             buffer_list[0].size +
898                             (buffer_list[0].addr & ((1ULL << shift) - 1)))
899                                 break;
900                 }
901
902         buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
903         buffer_list[0].addr &= ~0ull << shift;
904
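        /*
         * Worked example for the single-buffer case: addr = 0x201000,
         * size = 0x3000.  The smallest shift with
         * (1 << shift) >= size + (addr & ((1 << shift) - 1)) is 14, so
         * one 16 KB "page" covers the region: addr is rounded down to
         * 0x200000 and size grows to 0x4000.
         */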
905         mr = kmalloc(sizeof *mr, GFP_KERNEL);
906         if (!mr)
907                 return ERR_PTR(-ENOMEM);
908         RtlZeroMemory(mr, sizeof *mr);
909
910         npages = 0;
911         for (i = 0; i < num_phys_buf; ++i)
912                 npages += (int)((buffer_list[i].size + (1ULL << shift) - 1) >> shift);
913
914         if (!npages)
915                 return &mr->ibmr;
916
917         page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
918         if (!page_list) {
919                 kfree(mr);
920                 return ERR_PTR(-ENOMEM);
921         }
922
923         n = 0;
924         for (i = 0; i < num_phys_buf; ++i)
925                 for (j = 0;
926                      j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
927                      ++j)
928                         page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
929
930         HCA_PRINT( TRACE_LEVEL_VERBOSE ,HCA_DBG_LOW ,("Registering memory at %I64x (iova %I64x) "
931                   "in PD %x; shift %d, npages %d.\n",
932                   (unsigned long long) buffer_list[0].addr,
933                   (unsigned long long) *iova_start,
934                   to_mpd(pd)->pd_num,
935                   shift, npages));
936
937         err = mthca_mr_alloc_phys(to_mdev(pd->device),
938                                   to_mpd(pd)->pd_num,
939                                   page_list, shift, npages,
940                                   *iova_start, total_size,
941                                   map_qp_mpt(acc), mr);
942
943         if (err) {
944                 kfree(page_list);
945                 kfree(mr);
946                 return ERR_PTR(err);
947         }
948
949         kfree(page_list);
950         return &mr->ibmr;
951 }
952
953 struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, 
954         void* __ptr64   vaddr, uint64_t length, uint64_t hca_va, mthca_qp_access_t acc)
955 {
956         struct mthca_dev *dev = to_mdev(pd->device);
957         struct ib_umem_chunk *chunk;
958         struct mthca_mr *mr;
959         struct ib_umem *region;
960         u64 *pages;
961         int shift, n, len;
962         int i, j, k;
963         int err = 0;
964
965         HCA_ENTER(HCA_DBG_MEMORY);
966         mr = kzalloc(sizeof *mr, GFP_KERNEL);
967         if (!mr) {
968                 err = -ENOMEM;
969                 goto err_nomem;
970         }
971         region = &mr->umem;
972
973         /*
974          * We ask for writable memory if any access flags other than
975          * "remote read" are set.  "Local write" and "remote write"
976          * obviously require write access.  "Remote atomic" can do
977          * things like fetch and add, which will modify memory, and
978          * "MW bind" can change permissions by binding a window.
979          */
980         err = ibv_umem_get(pd->device, region,
981                           (void *)vaddr, (size_t)length,
982                           !!(acc & ~MTHCA_ACCESS_REMOTE_READ));
983         if (err)
984                 goto err_umem_get;
985
986         region->virt_base = hca_va;     /* va in HCA */
987
988         n = 0;
989         shift = ffs(region->page_size) - 1;
990         list_for_each_entry(chunk, &region->chunk_list, list,struct ib_umem_chunk)
991                 n += chunk->nents;
992
993         mr->mtt = mthca_alloc_mtt(dev, n);
994         if (IS_ERR(mr->mtt)) {
995                 err = PTR_ERR(mr->mtt);
996                 goto err_alloc_mtt;
997         }
998
999         pages = (u64 *) kmalloc(PAGE_SIZE,GFP_KERNEL);
1000         if (!pages) {
1001                 err = -ENOMEM;
1002                 goto err_pages;
1003         }
1004
1005         i = n = 0;
1006
1007         list_for_each_entry(chunk, &region->chunk_list, list,struct ib_umem_chunk)
1008                 for (j = 0; j < chunk->nmap; ++j) {
1009                         len = sg_dma_len(&chunk->page_list[j]) >> shift;
1010                         for (k = 0; k < len; ++k) {
1011                                 pages[i++] = sg_dma_address(&chunk->page_list[j]) +
1012                                         region->page_size * k;
1013                                 /*
1014                                  * Be friendly to WRITE_MTT command
1015                                  * and leave two empty slots for the
1016                                  * index and reserved fields of the
1017                                  * mailbox.
1018                                  */
1019                                 if (i == PAGE_SIZE / sizeof (u64) - 2) {
1020                                         err = mthca_write_mtt(dev, mr->mtt,
1021                                                               n, pages, i);
1022                                         if (err)
1023                                                 goto err_write_mtt;
1024                                         n += i;
1025                                         i = 0;
1026                                 }
1027                         }
1028                 }
1029
1030         if (i) {
1031                 err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
1032                 if (err)
1033                         goto err_write_mtt;
1034         }       
1035
1036         err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, region->virt_base,
1037                              region->length, map_qp_mpt(acc), mr);
1038         if (err)
1039                 goto err_mt_alloc;
1040
1041         free_page((void*) pages);
1042         HCA_EXIT(HCA_DBG_MEMORY);
1043         return &mr->ibmr;
1044
1045 err_mt_alloc:
1046 err_write_mtt:
1047         free_page((void*) pages);
1048 err_pages:
1049         mthca_free_mtt(dev, mr->mtt);
1050 err_alloc_mtt:
1051         ibv_umem_release(pd->device, region);
1052 err_umem_get:   
1053         kfree(mr);
1054 err_nomem:      
1055         HCA_EXIT(HCA_DBG_MEMORY);
1056         return ERR_PTR(err);
1057 }
1058
1059 int mthca_dereg_mr(struct ib_mr *mr)
1060 {
1061         struct mthca_mr *mmr = to_mmr(mr);
1062         mthca_free_mr(to_mdev(mr->device), mmr);
1063         if (mr->pd->ucontext)
1064                 ibv_umem_release(mr->pd->device, &mmr->umem);
1065         kfree(mmr);
1066         return 0;
1067 }
1068
1069 struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, mthca_qp_access_t acc,
1070                                       struct ib_fmr_attr *fmr_attr)
1071 {
1072         struct mthca_fmr *fmr;
1073         int err;
1074
1075         fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
1076         if (!fmr)
1077                 return ERR_PTR(-ENOMEM);
1078
1079         memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
1080         err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
1081                              map_qp_mpt(acc), fmr);
1082
1083         if (err) {
1084                 kfree(fmr);
1085                 return ERR_PTR(err);
1086         }
1087
1088         return &fmr->ibmr;
1089 }
1090
1091 int mthca_dealloc_fmr(struct ib_fmr *fmr)
1092 {
1093         struct mthca_fmr *mfmr = to_mfmr(fmr);
1094         int err;
1095
1096         err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
1097         if (err)
1098                 return err;
1099
1100         kfree(mfmr);
1101         return 0;
1102 }
1103
1104 int mthca_unmap_fmr(struct list_head *fmr_list)
1105 {
1106         struct ib_fmr *fmr;
1107         int err;
1108         u8 status;
1109         struct mthca_dev *mdev = NULL;
1110
1111         list_for_each_entry(fmr, fmr_list, list,struct ib_fmr) {
1112                 if (mdev && to_mdev(fmr->device) != mdev)
1113                         return -EINVAL;
1114                 mdev = to_mdev(fmr->device);
1115         }
1116
1117         if (!mdev)
1118                 return 0;
1119
1120         if (mthca_is_memfree(mdev)) {
1121                 list_for_each_entry(fmr, fmr_list, list,struct ib_fmr)
1122                         mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
1123
1124                 wmb();
1125         } else
1126                 list_for_each_entry(fmr, fmr_list, list,struct ib_fmr)
1127                         mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
1128
1129         err = mthca_SYNC_TPT(mdev, &status);
1130         if (err)
1131                 return err;
1132         if (status)
1133                 return -EINVAL;
1134         return 0;
1135 }
1136
1137 static int mthca_init_node_data(struct mthca_dev *dev)
1138 {
1139         struct ib_smp *in_mad  = NULL;
1140         struct ib_smp *out_mad = NULL;
1141         int err = -ENOMEM;
1142         u8 status;
1143
1144         in_mad  = kzalloc(sizeof *in_mad, GFP_KERNEL);
1145         out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
1146         if (!in_mad || !out_mad)
1147                 goto out;
1148
1149         init_query_mad(in_mad);
1150         in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
1151
1152         err = mthca_MAD_IFC(dev, 1, 1,
1153                             1, NULL, NULL, in_mad, out_mad,
1154                             &status);
1155         if (err)
1156                 goto out;
1157         if (status) {
1158                 err = -EINVAL;
1159                 goto out;
1160         }
1161
1162         memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
1163
1164 out:
1165         kfree(in_mad);
1166         kfree(out_mad);
1167         return err;
1168 }
1169
1170 int mthca_register_device(struct mthca_dev *dev)
1171 {
1172         int ret;
1173
1174         ret = mthca_init_node_data(dev);        
1175         if (ret)
1176                 return ret;
1177
1178         strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
1179         dev->ib_dev.node_type            = IB_NODE_CA;
1180         dev->ib_dev.phys_port_cnt        = (u8)dev->limits.num_ports;
1181         dev->ib_dev.mdev                 = dev;
1182         dev->ib_dev.query_device         = mthca_query_device;
1183         dev->ib_dev.query_port           = mthca_query_port;
1184         dev->ib_dev.modify_port          = mthca_modify_port;
1185         dev->ib_dev.query_pkey_chunk     = mthca_query_pkey_chunk;
1186         dev->ib_dev.query_gid_chunk      = mthca_query_gid_chunk;
1187         dev->ib_dev.alloc_ucontext       = mthca_alloc_ucontext;
1188         dev->ib_dev.dealloc_ucontext     = mthca_dealloc_ucontext;
1189         dev->ib_dev.alloc_pd             = mthca_alloc_pd;
1190         dev->ib_dev.dealloc_pd           = mthca_dealloc_pd;
1191         dev->ib_dev.create_ah            = mthca_ah_create;
1192         dev->ib_dev.destroy_ah           = mthca_ah_destroy;
1193
1194         if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
1195                 dev->ib_dev.create_srq           = mthca_create_srq;
1196                 dev->ib_dev.modify_srq           = mthca_modify_srq;
1197                 dev->ib_dev.destroy_srq          = mthca_destroy_srq;
1198
1199                 if (mthca_is_memfree(dev))
1200                         dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
1201                 else
1202                         dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
1203         }
1204
1205         dev->ib_dev.create_qp            = mthca_create_qp;
1206         dev->ib_dev.modify_qp            = mthca_modify_qp;
1207         dev->ib_dev.destroy_qp           = mthca_destroy_qp;
1208         dev->ib_dev.create_cq            = mthca_create_cq;
1209         dev->ib_dev.destroy_cq           = mthca_destroy_cq;
1210         dev->ib_dev.poll_cq              = mthca_poll_cq;
1211         dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
1212         dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
1213         dev->ib_dev.reg_user_mr          = mthca_reg_user_mr;
1214         dev->ib_dev.dereg_mr             = mthca_dereg_mr;
1215
1216         if (dev->mthca_flags & MTHCA_FLAG_FMR) {
1217                 dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
1218                 dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
1219                 dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
1220                 if (mthca_is_memfree(dev))
1221                         dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
1222                 else
1223                         dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
1224         }
1225
1226         dev->ib_dev.attach_mcast         = mthca_multicast_attach;
1227         dev->ib_dev.detach_mcast         = mthca_multicast_detach;
1228         dev->ib_dev.process_mad          = mthca_process_mad;
1229
1230         if (mthca_is_memfree(dev)) {
1231                 dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
1232                 dev->ib_dev.post_send     = mthca_arbel_post_send;
1233                 dev->ib_dev.post_recv     = mthca_arbel_post_receive;
1234         } else {
1235                 dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
1236                 dev->ib_dev.post_send     = mthca_tavor_post_send;
1237                 dev->ib_dev.post_recv     = mthca_tavor_post_receive;
1238         }
1239
1240         KeInitializeMutex(&dev->cap_mask_mutex, 0);
1241
1242         ret = ib_register_device(&dev->ib_dev);
1243         if (ret)
1244                 return ret;
1245
1246         mthca_start_catas_poll(dev);
1247
1248         return 0;
1249 }
1250
1251 void mthca_unregister_device(struct mthca_dev *dev)
1252 {
1253         mthca_stop_catas_poll(dev);
1254         ib_unregister_device(&dev->ib_dev);
1255 }