\r
AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI,\r
("Creating h_qp %#I64x, uhdl %#I64x \n", \r
- (uint64_t)h_qp, h_qp->obj.hdl ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl ) );\r
\r
exit:\r
AL_EXIT( AL_DBG_NDI );\r
{\r
AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI,\r
("Destroying h_qp %#I64x, uhdl %#I64x, cid %d\n", \r
- (uint64_t)h_qp, h_qp->obj.hdl, ((al_conn_qp_t*)h_qp)->cid ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, ((al_conn_qp_t*)h_qp)->cid ) );\r
\r
/* Move the state before flushing, so that all new IRPs fail to queue. */\r
__ndi_acquire_lock( &h_qp->p_irp_queue->csq, &irql );\r
{\r
AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI, \r
("h_qp %#I64x, uhdl %#I64x, ref_cnt %d\n", \r
- (uint64_t)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
\r
__ndi_complete_cancelled_irp( &h_qp->p_irp_queue->csq, Irp );\r
}\r
{\r
AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI, \r
("h_qp %#I64x, uhdl %#I64x, ref_cnt %d\n", \r
- (uint64_t)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
\r
__ndi_complete_cancelled_irp( &h_qp->p_irp_queue->csq, Irp );\r
}\r
AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_NDI, \r
("h_qp %#I64x, uhdl %#I64x, ref_cnt %d\n", \r
- (uint64_t)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
}\r
\r
AL_EXIT( AL_DBG_NDI );\r
\r
AL_PRINT(TRACE_LEVEL_ERROR, AL_DBG_ERROR, \r
("p_rej %p, h_qp %#I64x, uhdl %#I64x, connect reject, reason=%hd\n", \r
- p_rej, (uint64_t)h_qp, h_qp->obj.hdl, cl_ntoh16(p_rej->reason) ) );\r
+ p_rej, (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, cl_ntoh16(p_rej->reason) ) );\r
\r
p_irp = IoCsqRemoveNextIrp( &h_qp->p_irp_queue->csq, NULL );\r
__ndi_notify_dreq( h_qp );\r
{\r
AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR, \r
("STATUS_CONNECTION_ACTIVE: h_qp %#I64x, uhdl %#I64x, ref_cnt %d\n",\r
- (uint64_t)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
return STATUS_CONNECTION_ACTIVE;\r
}\r
\r
{\r
AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR, \r
("STATUS_CONNECTION_ACTIVE: h_qp %#I64x, uhdl %#I64x, ref_cnt %d\n",\r
- (uint64_t)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
+ (uint64_t)(ULONG_PTR)h_qp, h_qp->obj.hdl, h_qp->obj.ref_cnt ) );\r
return STATUS_CONNECTION_INVALID;\r
}\r
\r
-/*
- * Copyright (c) 2004 Topspin Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * $Id: mt_memory.c 2020 2007-05-01 09:29:10Z leonid $
- */
-#include <mlx4_debug.h>
-#include "l2w.h"
-#include "pa_cash.h"
-#include "ib_verbs.h"
-
-#if defined (EVENT_TRACING)
-#ifdef offsetof
-#undef offsetof
-#endif
-#include "iobuf.tmh"
-#endif
-
-
-
-
-/*
-* Function: map user buffer to kernel and lock it
-*
-* Return:
-*/
-int get_user_pages(
- IN struct mlx4_dev *dev, /* device */
- IN u64 start, /* address in user space */
- IN int npages, /* size in pages */
- IN int write_access, /* access rights */
- OUT struct scatterlist *sg /* s/g list */
- )
-{
- PMDL mdl_p;
- int size = npages << PAGE_SHIFT;
- int access = (write_access) ? IoWriteAccess : IoReadAccess;
- int err;
- void * kva; /* kernel virtual address */
-
- UNREFERENCED_PARAMETER(dev);
-
- MLX4_ENTER(MLX4_DBG_MEMORY);
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
-
- /* allocate MDL */
- mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)start, (ULONG)size,
- FALSE,
- FALSE, /* not charge quota */
- NULL);
- if (mdl_p == NULL) {
- err = -ENOMEM;
- goto err0;
- }
-
- /* lock memory */
- __try {
- MmProbeAndLockPages( mdl_p, UserMode, access );
- }
- __except (EXCEPTION_EXECUTE_HANDLER)
- {
- NTSTATUS Status = GetExceptionCode();
- MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_MEMORY ,("Exception 0x%x on MmProbeAndLockPages(), addr 0x%I64x, size %d\n", Status, start, size));
- switch(Status){
- case STATUS_WORKING_SET_QUOTA:
- err = -ENOMEM;break;
- case STATUS_ACCESS_VIOLATION:
- err = -EACCES;break;
- default :
- err = -EINVAL;
- }
-
- goto err1;
- }
-
- /* map it to kernel */
- kva = MmMapLockedPagesSpecifyCache( mdl_p,
- KernelMode, MmNonCached,
- NULL, FALSE, NormalPagePriority );
- if (kva == NULL) {
- MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_MEMORY ,("MmMapLockedPagesSpecifyCache failed\n"));
- err = -EFAULT;
- goto err2;
- }
-
- sg->dma_addr.va = kva;
- sg->dma_addr.sz = size;
- sg->offset = (unsigned int)(start & ~PAGE_MASK);
- sg->p_mdl = mdl_p;
- // TODO: has to be dma address, not physical one
- sg->dma_addr.da = MmGetPhysicalAddress(kva).QuadPart;
- return 0;
-
-err2:
- MmUnlockPages(mdl_p);
-err1:
- IoFreeMdl(mdl_p);
-err0:
- MLX4_EXIT(MLX4_DBG_MEMORY);
- return err;
-
- }
-
-void put_page(struct scatterlist *sg)
-{
- if (sg->p_mdl) {
- MmUnmapLockedPages( sg->dma_addr.va, sg->p_mdl );
- MmUnlockPages(sg->p_mdl);
- IoFreeMdl(sg->p_mdl);
- }
-}
-
-
-typedef struct _iobuf_seg {
- LIST_ENTRY link;
- PMDL mdl_p;
- u64 va; /* virtual address of the buffer */
- u64 size; /* size in bytes of the buffer */
- u32 nr_pages;
- int is_user;
-} iobuf_seg_t;
-
-// Returns: 0 on success, -ENOMEM or -EACCESS on error
-static int register_segment(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN enum ib_access_flags acc,
- OUT iobuf_seg_t **iobuf_seg)
-{
- PMDL mdl_p;
- int rc;
- KPROCESSOR_MODE mode;
- iobuf_seg_t * new_iobuf;
- static ULONG cnt=0;
- LOCK_OPERATION Operation;
-
- // set Operation
- if (acc & IB_ACCESS_LOCAL_WRITE)
- Operation = IoModifyAccess;
- else
- Operation = IoReadAccess;
-
- // allocate IOBUF segment object
- new_iobuf = (iobuf_seg_t *)kmalloc(sizeof(iobuf_seg_t), GFP_KERNEL );
- if (new_iobuf == NULL) {
- rc = -ENOMEM;
- goto err_nomem;
- }
-
- // allocate MDL
- mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)va, (ULONG)size, FALSE,FALSE,NULL);
- if (mdl_p == NULL) {
- rc = -ENOMEM;
- goto err_alloc_mdl;
- }
-
- // make context-dependent things
- if (is_user) {
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
- mode = UserMode;
- }
- else { /* Mapping to kernel virtual address */
- // MmBuildMdlForNonPagedPool(mdl_p); // fill MDL ??? - should we do that really ?
- mode = KernelMode;
- }
-
- __try { /* try */
- MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */
- } /* try */
-
- __except (EXCEPTION_EXECUTE_HANDLER) {
- MLX4_PRINT(TRACE_LEVEL_ERROR, MLX4_DBG_MEMORY,
- ("MOSAL_iobuf_register: Exception 0x%x on MmProbeAndLockPages(), va %I64d, sz %I64d\n",
- GetExceptionCode(), va, size));
- rc = -EACCES;
- goto err_probe;
- }
-
- // fill IOBUF object
- new_iobuf->va = va;
- new_iobuf->size= size;
- new_iobuf->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
- new_iobuf->mdl_p = mdl_p;
- new_iobuf->is_user = is_user;
- *iobuf_seg = new_iobuf;
- return 0;
-
-err_probe:
- IoFreeMdl(mdl_p);
-err_alloc_mdl:
- ExFreePool((PVOID)new_iobuf);
-err_nomem:
- return rc;
-}
-
-void iobuf_init(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN OUT iobuf_t *iobuf_p)
-{
- iobuf_p->va = va;
- iobuf_p->size= size;
- iobuf_p->is_user = is_user;
- InitializeListHead( &iobuf_p->seg_que );
- iobuf_p->seg_num = 0;
- iobuf_p->nr_pages = 0;
- iobuf_p->is_cashed = 0;
-}
-
-int iobuf_register(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN enum ib_access_flags acc,
- IN OUT iobuf_t *iobuf_p)
-{
- int rc=0;
- u64 seg_va; // current segment start
- u64 seg_size; // current segment size
- u64 rdc; // remain data counter - what is rest to lock
- u64 delta; // he size of the last not full page of the first segment
- iobuf_seg_t * new_iobuf;
- unsigned page_size = PAGE_SIZE;
-
-// 32 - for any case
-#define PFNS_IN_PAGE_SIZE_MDL ((PAGE_SIZE - sizeof(struct _MDL) - 32) / sizeof(long))
-#define MIN_IOBUF_SEGMENT_SIZE (PAGE_SIZE * PFNS_IN_PAGE_SIZE_MDL) // 4MB
-
- ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
-
- // we'll try to register all at once.
- seg_va = va;
- seg_size = rdc = size;
-
- // allocate segments
- while (rdc > 0) {
- // map a segment
- rc = register_segment(seg_va, seg_size, is_user, acc, &new_iobuf );
-
- // success - move to another segment
- if (!rc) {
- rdc -= seg_size;
- seg_va += seg_size;
- InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );
- iobuf_p->seg_num++;
- // round the segment size to the next page boundary
- delta = (seg_va + seg_size) & (page_size - 1);
- if (delta) {
- seg_size -= delta;
- seg_size += page_size;
- }
- if (seg_size > rdc)
- seg_size = rdc;
- continue;
- }
-
- // failure - too large a buffer: lessen it and try once more
- if (rc == -ENOMEM) {
- // no where to lessen - too low memory
- if (seg_size <= MIN_IOBUF_SEGMENT_SIZE)
- break;
- // lessen the size
- seg_size >>= 1;
- // round the segment size to the next page boundary
- delta = (seg_va + seg_size) & (page_size - 1);
- if (delta) {
- seg_size -= delta;
- seg_size += page_size;
- }
- if (seg_size > rdc)
- seg_size = rdc;
- continue;
- }
-
- // got unrecoverable error
- break;
- }
-
- // SUCCESS
- if (rc)
- iobuf_deregister( iobuf_p );
- else
- iobuf_p->nr_pages += ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
-
- return rc;
-}
-
-
-static void __iobuf_copy(
- IN OUT iobuf_t *dst_iobuf_p,
- IN iobuf_t *src_iobuf_p
- )
-{
- int i;
- iobuf_seg_t *iobuf_seg_p;
-
- *dst_iobuf_p = *src_iobuf_p;
- InitializeListHead( &dst_iobuf_p->seg_que );
- for (i=0; i<src_iobuf_p->seg_num; ++i) {
- iobuf_seg_p = (iobuf_seg_t *)(PVOID)RemoveHeadList( &src_iobuf_p->seg_que );
- InsertTailList( &dst_iobuf_p->seg_que, &iobuf_seg_p->link );
- }
-}
-
-/* if the buffer to be registered overlaps a buffer, already registered,
- a race can happen between HCA, writing to the previously registered
- buffer and the probing functions (MmProbeAndLockPages, MmSecureVirtualMemory),
- used in the algorithm of memory registration.
- To prevent the race we maintain reference counters for the physical pages, being registered,
- and register every physical page FOR THE WRITE ACCESS only once.*/
-
-int iobuf_register_with_cash(
- IN u64 vaddr,
- IN u64 size,
- IN int is_user,
- IN OUT enum ib_access_flags *acc_p,
- IN OUT iobuf_t *iobuf_p)
-{
- int rc, pa_in;
- iobuf_t sec_iobuf;
- int i, page_in , page_out, page_in_total;
- int nr_pages;
- char *subregion_start, *va;
- u64 subregion_size;
- u64 rdc; // remain data counter - what is rest to lock
- u64 delta; // he size of the last not full page of the first segment
- enum ib_access_flags acc;
-
- mutex_lock(&g_pa_mutex);
-
- // register memory for read access to bring pages into the memory
- rc = iobuf_register( vaddr, size, is_user, 0, iobuf_p);
-
- // on error or read access - exit
- if (rc || !(*acc_p & IB_ACCESS_LOCAL_WRITE))
- goto exit;
-
- // re-register buffer with the correct access rights
- iobuf_init( (u64)vaddr, size, is_user, &sec_iobuf );
- nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( vaddr, size );
- subregion_start = va = (char*)(ULONG_PTR)vaddr;
- rdc = size;
- pa_in = page_in = page_in_total = page_out = 0;
-
- for (i=0; i<nr_pages; ++i, va+=PAGE_SIZE) {
- // check whether a phys page is to be registered
- PHYSICAL_ADDRESS pa = MmGetPhysicalAddress(va);
- pa_in = pa_is_registered(pa.QuadPart);
- if (pa_in) {
- ++page_in;
- ++page_in_total;
- }
- else
- ++page_out;
-
- // check whether we get at the end of a subregion with the same rights wrt cash
- if (page_in && page_out) {
- // prepare to registration of the subregion
- if (pa_in) { // SUBREGION WITH WRITE ACCESS
- acc = IB_ACCESS_LOCAL_WRITE;
- subregion_size = (u64)page_out * PAGE_SIZE;
- page_out = 0;
- }
- else { // SUBREGION WITH READ ACCESS
- acc = 0;
- subregion_size = (u64)page_in * PAGE_SIZE;
- page_in = 0;
- }
-
- // round the subregion size to the page boundary
- delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
- subregion_size -= delta;
- if (subregion_size > rdc)
- subregion_size = rdc;
-
- // register the subregion
- rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
- if (rc)
- goto cleanup;
-
- // prepare to the next loop
- rdc -= subregion_size;
- subregion_start +=subregion_size;
- }
- }
-
- // prepare to registration of the subregion
- if (pa_in) { // SUBREGION WITH READ ACCESS
- acc = 0;
- subregion_size = (u64)page_in * PAGE_SIZE;
- }
- else { // SUBREGION WITH WRITE ACCESS
- acc = IB_ACCESS_LOCAL_WRITE;
- subregion_size = (u64)page_out * PAGE_SIZE;
- }
-
- // round the subregion size to the page boundary
- delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
- subregion_size -= delta;
- if (subregion_size > rdc)
- subregion_size = rdc;
-
- // register the subregion
- rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
- if (rc)
- goto cleanup;
-
- // cash phys pages
- rc = pa_register(iobuf_p);
- if (rc)
- goto err_pa_reg;
-
- // replace the iobuf
- iobuf_deregister( iobuf_p );
- sec_iobuf.is_cashed = TRUE;
- __iobuf_copy( iobuf_p, &sec_iobuf );
-
- // buffer is a part of also registered buffer - change the rights
- if (page_in_total)
- *acc_p &= ~IB_ACCESS_LOCAL_WRITE;
-
- goto exit;
-
-err_pa_reg:
- iobuf_deregister( &sec_iobuf );
-cleanup:
- iobuf_deregister( iobuf_p );
-exit:
- mutex_unlock(&g_pa_mutex);
- return rc;
-}
-
-static void deregister_segment(iobuf_seg_t * iobuf_seg_p)
-{
- MmUnlockPages( iobuf_seg_p->mdl_p ); // unlock the buffer
- IoFreeMdl( iobuf_seg_p->mdl_p ); // free MDL
- ExFreePool(iobuf_seg_p);
-}
-
-void iobuf_deregister(iobuf_t *iobuf_p)
-{
- iobuf_seg_t *iobuf_seg_p; // pointer to current segment object
-
- ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
-
- // release segments
- while (!IsListEmpty( &iobuf_p->seg_que )) {
- iobuf_seg_p = (iobuf_seg_t *)(PVOID)RemoveTailList( &iobuf_p->seg_que );
- deregister_segment(iobuf_seg_p);
- iobuf_p->seg_num--;
- }
- ASSERT(iobuf_p->seg_num == 0);
-}
-
-void iobuf_deregister_with_cash(iobuf_t *iobuf_p)
-{
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
-
- mutex_lock(&g_pa_mutex);
- if (iobuf_p->is_cashed)
- pa_deregister(iobuf_p);
- iobuf_deregister(iobuf_p);
- mutex_unlock(&g_pa_mutex);
-}
-
-void iobuf_iter_init(
- IN iobuf_t *iobuf_p,
- IN OUT iobuf_iter_t *iterator_p)
-{
- iterator_p->seg_p = iobuf_p->seg_que.Flink;
- iterator_p->pfn_ix = 0;
-}
-
-// the function returns phys addresses of the pages, also for the first page
-// if one wants to get the phys address of the buffer, one has to
-// add the offset from the start of the page to the first phys address
-// Returns: the number of entries, filled in page_tbl_p
-// Returns 0 while at the end of list.
-uint32_t iobuf_get_tpt_seg(
- IN iobuf_t *iobuf_p,
- IN OUT iobuf_iter_t *iterator_p,
- IN uint32_t n_pages_in,
- IN OUT uint64_t *page_tbl_p )
-{
- uint32_t i=0; // has to be initialized here for a premature exit
- iobuf_seg_t *seg_p; // pointer to current segment object
- PPFN_NUMBER pfn_p;
- uint32_t pfn_ix; // index of PFN in PFN array of the current segment
- uint64_t *pa_buf_p = page_tbl_p;
-
- // prepare to the loop
- seg_p = iterator_p->seg_p; // first segment of the first iobuf
- pfn_ix= iterator_p->pfn_ix;
-
- // check, whether we at the end of the list
- if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que)
- goto exit;
- pfn_p = MmGetMdlPfnArray( seg_p->mdl_p ) + pfn_ix;
-
- // pass along all the PFN arrays
- for (; i < n_pages_in; i++, pa_buf_p++) {
- // convert PFN to the physical address
- *pa_buf_p = (uint64_t)*pfn_p++ << PAGE_SHIFT;
-
- // get to the next PFN
- if (++pfn_ix >= seg_p->nr_pages) {
- seg_p = (iobuf_seg_t*)seg_p->link.Flink;
- pfn_ix = 0;
- if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que) {
- i++;
- break;
- }
- pfn_p = MmGetMdlPfnArray( seg_p->mdl_p );
- }
- }
-
-exit:
- iterator_p->seg_p = seg_p;
- iterator_p->pfn_ix = pfn_ix;
- return i;
-}
-
-
+/*\r
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.\r
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.\r
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
+ *\r
+ * This software is available to you under the OpenIB.org BSD license\r
+ * below:\r
+ *\r
+ * Redistribution and use in source and binary forms, with or\r
+ * without modification, are permitted provided that the following\r
+ * conditions are met:\r
+ *\r
+ * - Redistributions of source code must retain the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer.\r
+ *\r
+ * - Redistributions in binary form must reproduce the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer in the documentation and/or other materials\r
+ * provided with the distribution.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ *\r
+ * $Id: mt_memory.c 2020 2007-05-01 09:29:10Z leonid $\r
+ */\r
+#include <mlx4_debug.h>\r
+#include "l2w.h"\r
+#include "pa_cash.h"\r
+#include "ib_verbs.h"\r
+\r
+#if defined (EVENT_TRACING)\r
+#ifdef offsetof\r
+#undef offsetof\r
+#endif\r
+#include "iobuf.tmh"\r
+#endif \r
+\r
+\r
+\r
+\r
+/*\r
+* Function: map user buffer to kernel and lock it\r
+*\r
+* Return: \r
+*/\r
+int get_user_pages(\r
+ IN struct mlx4_dev *dev, /* device */\r
+ IN u64 start, /* address in user space */\r
+ IN int npages, /* size in pages */\r
+ IN int write_access, /* access rights */\r
+ OUT struct scatterlist *sg /* s/g list */\r
+ )\r
+{\r
+ PMDL mdl_p;\r
+ int size = npages << PAGE_SHIFT;\r
+ int access = (write_access) ? IoWriteAccess : IoReadAccess;\r
+ int err;\r
+ void * kva; /* kernel virtual address */\r
+\r
+ UNREFERENCED_PARAMETER(dev);\r
+ \r
+ MLX4_ENTER(MLX4_DBG_MEMORY);\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+ \r
+ /* allocate MDL */\r
+ mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)start, (ULONG)size, \r
+ FALSE,\r
+ FALSE, /* not charge quota */\r
+ NULL);\r
+ if (mdl_p == NULL) {\r
+ err = -ENOMEM; \r
+ goto err0;\r
+ }\r
+\r
+ /* lock memory */\r
+ __try { \r
+ MmProbeAndLockPages( mdl_p, UserMode, access ); \r
+ } \r
+ __except (EXCEPTION_EXECUTE_HANDLER)\r
+ {\r
+ NTSTATUS Status = GetExceptionCode();\r
+ MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_MEMORY ,("Exception 0x%x on MmProbeAndLockPages(), addr 0x%I64x, size %d\n", Status, start, size));\r
+ switch(Status){\r
+ case STATUS_WORKING_SET_QUOTA:\r
+ err = -ENOMEM;break;\r
+ case STATUS_ACCESS_VIOLATION:\r
+ err = -EACCES;break;\r
+ default :\r
+ err = -EINVAL;\r
+ }\r
+\r
+ goto err1;\r
+ }\r
+\r
+ /* map it to kernel */\r
+ kva = MmMapLockedPagesSpecifyCache( mdl_p, \r
+ KernelMode, MmNonCached, \r
+ NULL, FALSE, NormalPagePriority );\r
+ if (kva == NULL) {\r
+ MLX4_PRINT(TRACE_LEVEL_ERROR ,MLX4_DBG_MEMORY ,("MmMapLockedPagesSpecifyCache failed\n"));\r
+ err = -EFAULT;\r
+ goto err2;\r
+ }\r
+\r
+ sg->dma_addr.va = kva;\r
+ sg->dma_addr.sz = size;\r
+ sg->offset = (unsigned int)(start & ~PAGE_MASK);\r
+ sg->p_mdl = mdl_p; \r
+ // TODO: has to be dma address, not physical one\r
+ sg->dma_addr.da = MmGetPhysicalAddress(kva).QuadPart;\r
+ return 0; \r
+ \r
+err2: \r
+ MmUnlockPages(mdl_p);\r
+err1: \r
+ IoFreeMdl(mdl_p);\r
+err0:\r
+ MLX4_EXIT(MLX4_DBG_MEMORY);\r
+ return err;\r
+ \r
+ }\r
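+\r
+// Note: get_user_pages() reports failures as Linux-style negative errno values\r
+// (-ENOMEM, -EACCES, -EFAULT, -EINVAL); put_page() below undoes all of its work -\r
+// it unmaps the kernel mapping, unlocks the pages and frees the MDL.\r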
+\r
+void put_page(struct scatterlist *sg)\r
+{\r
+ if (sg->p_mdl) {\r
+ MmUnmapLockedPages( sg->dma_addr.va, sg->p_mdl );\r
+ MmUnlockPages(sg->p_mdl);\r
+ IoFreeMdl(sg->p_mdl);\r
+ }\r
+}\r
+\r
+\r
+typedef struct _iobuf_seg {\r
+ LIST_ENTRY link;\r
+ PMDL mdl_p;\r
+ u64 va; /* virtual address of the buffer */\r
+ u64 size; /* size in bytes of the buffer */\r
+ u32 nr_pages;\r
+ int is_user;\r
+} iobuf_seg_t;\r
+\r
+// Returns: 0 on success, -ENOMEM or -EACCESS on error\r
+static int register_segment(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN enum ib_access_flags acc,\r
+ OUT iobuf_seg_t **iobuf_seg)\r
+{\r
+ PMDL mdl_p;\r
+ int rc;\r
+ KPROCESSOR_MODE mode; \r
+ iobuf_seg_t * new_iobuf;\r
+ static ULONG cnt=0;\r
+ LOCK_OPERATION Operation;\r
+\r
+ // set Operation\r
+ if (acc & IB_ACCESS_LOCAL_WRITE)\r
+ Operation = IoModifyAccess;\r
+ else\r
+ Operation = IoReadAccess;\r
+ \r
+ // allocate IOBUF segment object\r
+ new_iobuf = (iobuf_seg_t *)kmalloc(sizeof(iobuf_seg_t), GFP_KERNEL );\r
+ if (new_iobuf == NULL) {\r
+ rc = -ENOMEM;\r
+ goto err_nomem;\r
+ }\r
+\r
+ // allocate MDL \r
+ mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)va, (ULONG)size, FALSE,FALSE,NULL);\r
+ if (mdl_p == NULL) {\r
+ rc = -ENOMEM;\r
+ goto err_alloc_mdl;\r
+ }\r
+\r
+ // make context-dependent things\r
+ if (is_user) {\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+ mode = UserMode;\r
+ }\r
+ else { /* Mapping to kernel virtual address */\r
+ // MmBuildMdlForNonPagedPool(mdl_p); // fill MDL ??? - should we do that really ?\r
+ mode = KernelMode;\r
+ }\r
+\r
+ __try { /* try */\r
+ MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */\r
+ } /* try */\r
+ \r
+ __except (EXCEPTION_EXECUTE_HANDLER) {\r
+ MLX4_PRINT(TRACE_LEVEL_ERROR, MLX4_DBG_MEMORY, \r
+ ("MOSAL_iobuf_register: Exception 0x%x on MmProbeAndLockPages(), va %I64d, sz %I64d\n", \r
+ GetExceptionCode(), va, size));\r
+ rc = -EACCES;\r
+ goto err_probe;\r
+ }\r
+ \r
+ // fill IOBUF object\r
+ new_iobuf->va = va;\r
+ new_iobuf->size= size;\r
+ new_iobuf->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );\r
+ new_iobuf->mdl_p = mdl_p;\r
+ new_iobuf->is_user = is_user;\r
+ *iobuf_seg = new_iobuf;\r
+ return 0;\r
+\r
+err_probe:\r
+ IoFreeMdl(mdl_p);\r
+err_alloc_mdl: \r
+ ExFreePool((PVOID)new_iobuf);\r
+err_nomem: \r
+ return rc;\r
+}\r
+\r
+void iobuf_init(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN OUT iobuf_t *iobuf_p)\r
+{\r
+ iobuf_p->va = va;\r
+ iobuf_p->size= size;\r
+ iobuf_p->is_user = is_user;\r
+ InitializeListHead( &iobuf_p->seg_que );\r
+ iobuf_p->seg_num = 0;\r
+ iobuf_p->nr_pages = 0;\r
+ iobuf_p->is_cashed = 0;\r
+}\r
+\r
+int iobuf_register(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN enum ib_access_flags acc,\r
+ IN OUT iobuf_t *iobuf_p)\r
+{\r
+ int rc=0;\r
+ u64 seg_va; // current segment start\r
+ u64 seg_size; // current segment size\r
+	u64 rdc;			// remaining data counter - how much is left to lock\r
+	u64 delta;			// the size of the last, not fully used page of the segment\r
+ iobuf_seg_t * new_iobuf;\r
+ unsigned page_size = PAGE_SIZE;\r
+\r
+// 32 - for any case \r
+#define PFNS_IN_PAGE_SIZE_MDL ((PAGE_SIZE - sizeof(struct _MDL) - 32) / sizeof(long))\r
+#define MIN_IOBUF_SEGMENT_SIZE (PAGE_SIZE * PFNS_IN_PAGE_SIZE_MDL) // 4MB \r
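+// (Derivation: PFNS_IN_PAGE_SIZE_MDL is how many PFN entries fit into one page\r
+// together with the MDL header and 32 bytes of slack, so MIN_IOBUF_SEGMENT_SIZE -\r
+// about 4MB with 4KB pages - is the floor used below when halving a segment\r
+// that failed to register with -ENOMEM.)\r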
+\r
+ ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);\r
+\r
+ // we'll try to register all at once.\r
+ seg_va = va;\r
+ seg_size = rdc = size;\r
+ \r
+ // allocate segments\r
+ while (rdc > 0) {\r
+ // map a segment\r
+ rc = register_segment(seg_va, seg_size, is_user, acc, &new_iobuf );\r
+\r
+ // success - move to another segment\r
+ if (!rc) {\r
+ rdc -= seg_size;\r
+ seg_va += seg_size;\r
+ InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );\r
+ iobuf_p->seg_num++;\r
+ // round the segment size to the next page boundary \r
+ delta = (seg_va + seg_size) & (page_size - 1);\r
+ if (delta) {\r
+ seg_size -= delta;\r
+ seg_size += page_size;\r
+ }\r
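+			// e.g. with page_size 0x1000: if seg_va + seg_size == 0x12345 then\r
+			// delta == 0x345 and the segment end is rounded up to 0x13000\r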
+ if (seg_size > rdc)\r
+ seg_size = rdc;\r
+ continue;\r
+ }\r
+\r
+ // failure - too large a buffer: lessen it and try once more\r
+ if (rc == -ENOMEM) {\r
+			// nowhere left to shrink - not enough memory\r
+ if (seg_size <= MIN_IOBUF_SEGMENT_SIZE)\r
+ break;\r
+ // lessen the size\r
+ seg_size >>= 1;\r
+ // round the segment size to the next page boundary \r
+ delta = (seg_va + seg_size) & (page_size - 1);\r
+ if (delta) {\r
+ seg_size -= delta;\r
+ seg_size += page_size;\r
+ }\r
+ if (seg_size > rdc)\r
+ seg_size = rdc;\r
+ continue;\r
+ }\r
+\r
+ // got unrecoverable error\r
+ break;\r
+ }\r
+\r
+	// on failure - release everything; on success - account the spanned pages\r
+ if (rc) \r
+ iobuf_deregister( iobuf_p );\r
+ else \r
+ iobuf_p->nr_pages += ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );\r
+\r
+ return rc;\r
+}\r
+\r
+\r
+static void __iobuf_copy(\r
+ IN OUT iobuf_t *dst_iobuf_p,\r
+ IN iobuf_t *src_iobuf_p\r
+ )\r
+{\r
+ int i;\r
+ iobuf_seg_t *iobuf_seg_p;\r
+ \r
+ *dst_iobuf_p = *src_iobuf_p;\r
+ InitializeListHead( &dst_iobuf_p->seg_que );\r
+ for (i=0; i<src_iobuf_p->seg_num; ++i) {\r
+ iobuf_seg_p = (iobuf_seg_t *)(PVOID)RemoveHeadList( &src_iobuf_p->seg_que );\r
+ InsertTailList( &dst_iobuf_p->seg_que, &iobuf_seg_p->link );\r
+ }\r
+}\r
+\r
+/* If the buffer to be registered overlaps an already registered buffer,\r
+   a race can happen between the HCA, which may still be writing to the\r
+   previously registered buffer, and the probing functions (MmProbeAndLockPages,\r
+   MmSecureVirtualMemory) used by the memory registration algorithm.\r
+   To prevent the race we maintain reference counters for the physical pages\r
+   being registered, and register every physical page FOR WRITE ACCESS only once. */\r
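+\r
+/* In outline, iobuf_register_with_cash() below therefore:\r
+   1) registers the whole buffer read-only, which also brings its pages in;\r
+   2) walks the buffer page by page, splitting it into runs of already cached\r
+      pages (these stay read-only) and runs of new pages (these are registered\r
+      with IB_ACCESS_LOCAL_WRITE);\r
+   3) caches the buffer's physical pages and, if any page was already cached,\r
+      clears IB_ACCESS_LOCAL_WRITE from the caller's access mask. */\r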
+\r
+int iobuf_register_with_cash(\r
+ IN u64 vaddr,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN OUT enum ib_access_flags *acc_p,\r
+ IN OUT iobuf_t *iobuf_p)\r
+{\r
+ int rc, pa_in;\r
+ iobuf_t sec_iobuf;\r
+ int i, page_in , page_out, page_in_total;\r
+ int nr_pages;\r
+ char *subregion_start, *va;\r
+ u64 subregion_size;\r
+	u64 rdc;			// remaining data counter - how much is left to lock\r
+	u64 delta;			// the size of the last, not fully used page of the segment\r
+ enum ib_access_flags acc;\r
+\r
+ mutex_lock(&g_pa_mutex);\r
+\r
+ // register memory for read access to bring pages into the memory\r
+ rc = iobuf_register( vaddr, size, is_user, 0, iobuf_p);\r
+\r
+ // on error or read access - exit\r
+ if (rc || !(*acc_p & IB_ACCESS_LOCAL_WRITE))\r
+ goto exit;\r
+\r
+ // re-register buffer with the correct access rights\r
+ iobuf_init( (u64)vaddr, size, is_user, &sec_iobuf );\r
+ nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( vaddr, size );\r
+ subregion_start = va = (char*)(ULONG_PTR)vaddr;\r
+ rdc = size;\r
+ pa_in = page_in = page_in_total = page_out = 0;\r
+\r
+ for (i=0; i<nr_pages; ++i, va+=PAGE_SIZE) {\r
+ // check whether a phys page is to be registered\r
+ PHYSICAL_ADDRESS pa = MmGetPhysicalAddress(va);\r
+ pa_in = pa_is_registered(pa.QuadPart);\r
+ if (pa_in) {\r
+ ++page_in;\r
+ ++page_in_total;\r
+ }\r
+ else\r
+ ++page_out;\r
+\r
+		// check whether we have reached the end of a subregion with uniform rights w.r.t. the cache\r
+ if (page_in && page_out) {\r
+			// prepare to register the subregion\r
+ if (pa_in) { // SUBREGION WITH WRITE ACCESS\r
+ acc = IB_ACCESS_LOCAL_WRITE;\r
+ subregion_size = (u64)page_out * PAGE_SIZE;\r
+ page_out = 0;\r
+ }\r
+ else { // SUBREGION WITH READ ACCESS\r
+ acc = 0;\r
+ subregion_size = (u64)page_in * PAGE_SIZE;\r
+ page_in = 0;\r
+ }\r
+ \r
+ // round the subregion size to the page boundary \r
+ delta = (ULONG_PTR)(subregion_start + subregion_size) & (PAGE_SIZE - 1);\r
+ subregion_size -= delta;\r
+ if (subregion_size > rdc)\r
+ subregion_size = rdc;\r
+\r
+ // register the subregion\r
+ rc = iobuf_register( (ULONG_PTR)subregion_start, subregion_size, is_user, acc, &sec_iobuf);\r
+ if (rc)\r
+ goto cleanup;\r
+\r
+			// prepare for the next iteration\r
+ rdc -= subregion_size;\r
+ subregion_start +=subregion_size;\r
+ }\r
+ }\r
+\r
+	// prepare to register the last subregion\r
+ if (pa_in) { // SUBREGION WITH READ ACCESS\r
+ acc = 0;\r
+ subregion_size = (u64)page_in * PAGE_SIZE;\r
+ }\r
+ else { // SUBREGION WITH WRITE ACCESS\r
+ acc = IB_ACCESS_LOCAL_WRITE;\r
+ subregion_size = (u64)page_out * PAGE_SIZE;\r
+ }\r
+ \r
+ // round the subregion size to the page boundary \r
+ delta = (ULONG_PTR)(subregion_start + subregion_size) & (PAGE_SIZE - 1);\r
+ subregion_size -= delta;\r
+ if (subregion_size > rdc)\r
+ subregion_size = rdc;\r
+ \r
+ // register the subregion\r
+ rc = iobuf_register( (ULONG_PTR)subregion_start, subregion_size, is_user, acc, &sec_iobuf);\r
+ if (rc)\r
+ goto cleanup;\r
+\r
+	// cache the physical pages\r
+ rc = pa_register(iobuf_p);\r
+ if (rc)\r
+ goto err_pa_reg;\r
+\r
+ // replace the iobuf\r
+ iobuf_deregister( iobuf_p );\r
+ sec_iobuf.is_cashed = TRUE;\r
+ __iobuf_copy( iobuf_p, &sec_iobuf );\r
+ \r
+	// the buffer overlaps an already registered buffer - restrict the rights\r
+ if (page_in_total)\r
+ *acc_p &= ~IB_ACCESS_LOCAL_WRITE;\r
+\r
+ goto exit;\r
+ \r
+err_pa_reg: \r
+ iobuf_deregister( &sec_iobuf );\r
+cleanup:\r
+ iobuf_deregister( iobuf_p );\r
+exit: \r
+ mutex_unlock(&g_pa_mutex);\r
+ return rc;\r
+}\r
+\r
+static void deregister_segment(iobuf_seg_t * iobuf_seg_p)\r
+{\r
+ MmUnlockPages( iobuf_seg_p->mdl_p ); // unlock the buffer \r
+ IoFreeMdl( iobuf_seg_p->mdl_p ); // free MDL\r
+ ExFreePool(iobuf_seg_p);\r
+}\r
+\r
+void iobuf_deregister(iobuf_t *iobuf_p)\r
+{\r
+ iobuf_seg_t *iobuf_seg_p; // pointer to current segment object\r
+\r
+ ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);\r
+\r
+ // release segments\r
+ while (!IsListEmpty( &iobuf_p->seg_que )) {\r
+ iobuf_seg_p = (iobuf_seg_t *)(PVOID)RemoveTailList( &iobuf_p->seg_que );\r
+ deregister_segment(iobuf_seg_p);\r
+ iobuf_p->seg_num--;\r
+ }\r
+ ASSERT(iobuf_p->seg_num == 0);\r
+}\r
+\r
+void iobuf_deregister_with_cash(iobuf_t *iobuf_p)\r
+{\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+\r
+ mutex_lock(&g_pa_mutex);\r
+ if (iobuf_p->is_cashed)\r
+ pa_deregister(iobuf_p);\r
+ iobuf_deregister(iobuf_p);\r
+ mutex_unlock(&g_pa_mutex);\r
+}\r
+\r
+void iobuf_iter_init(\r
+ IN iobuf_t *iobuf_p, \r
+ IN OUT iobuf_iter_t *iterator_p)\r
+{\r
+ iterator_p->seg_p = iobuf_p->seg_que.Flink;\r
+ iterator_p->pfn_ix = 0;\r
+}\r
+\r
+// The function returns the physical addresses of whole pages, including the first one;\r
+// to get the physical address of the buffer itself, add the buffer's offset\r
+// within the first page to the first returned address.\r
+// Returns: the number of entries filled in page_tbl_p;\r
+// returns 0 once the end of the list has been reached.\r
+uint32_t iobuf_get_tpt_seg(\r
+ IN iobuf_t *iobuf_p, \r
+ IN OUT iobuf_iter_t *iterator_p,\r
+ IN uint32_t n_pages_in, \r
+ IN OUT uint64_t *page_tbl_p )\r
+{\r
+ uint32_t i=0; // has to be initialized here for a premature exit\r
+ iobuf_seg_t *seg_p; // pointer to current segment object \r
+ PPFN_NUMBER pfn_p; \r
+ uint32_t pfn_ix; // index of PFN in PFN array of the current segment\r
+ uint64_t *pa_buf_p = page_tbl_p;\r
+\r
+	// prepare for the loop\r
+ seg_p = iterator_p->seg_p; // first segment of the first iobuf\r
+ pfn_ix= iterator_p->pfn_ix;\r
+\r
+	// check whether we are at the end of the list\r
+ if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que)\r
+ goto exit;\r
+ pfn_p = MmGetMdlPfnArray( seg_p->mdl_p ) + pfn_ix;\r
+\r
+ // pass along all the PFN arrays\r
+ for (; i < n_pages_in; i++, pa_buf_p++) {\r
+ // convert PFN to the physical address\r
+ *pa_buf_p = (uint64_t)*pfn_p++ << PAGE_SHIFT;\r
+ \r
+ // get to the next PFN \r
+ if (++pfn_ix >= seg_p->nr_pages) {\r
+ seg_p = (iobuf_seg_t*)seg_p->link.Flink;\r
+ pfn_ix = 0;\r
+ if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que) {\r
+ i++;\r
+ break;\r
+ }\r
+ pfn_p = MmGetMdlPfnArray( seg_p->mdl_p );\r
+ }\r
+ }\r
+\r
+exit:\r
+ iterator_p->seg_p = seg_p;\r
+ iterator_p->pfn_ix = pfn_ix;\r
+ return i;\r
+}\r
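+\r
+// Illustrative use of the iterator pair above (a sketch only; the 64-entry\r
+// buffer below is an arbitrary choice):\r
+//\r
+//	iobuf_iter_t it;\r
+//	uint64_t pa[64];\r
+//	uint32_t n;\r
+//\r
+//	iobuf_iter_init( iobuf_p, &it );\r
+//	while ( (n = iobuf_get_tpt_seg( iobuf_p, &it, 64, pa )) != 0 ) {\r
+//		/* each of the n entries in pa[] is a page-aligned physical address */\r
+//	}\r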
+\r
+\r
}\r
\r
// register mr \r
- p_ib_mr = ibv_reg_mr(p_ib_pd, (u64)(ULONG_PTR)(void*)p_mr_create->vaddr, \r
- p_mr_create->length, (uint64_t)p_mr_create->vaddr, \r
+ p_ib_mr = ibv_reg_mr(p_ib_pd, (ULONG_PTR)p_mr_create->vaddr, \r
+ p_mr_create->length, (ULONG_PTR)p_mr_create->vaddr, \r
to_qp_acl(p_mr_create->access_ctrl), um_call ? &umv_buf : NULL );\r
if (IS_ERR(p_ib_mr)) {\r
err = PTR_ERR(p_ib_mr);\r
// register mr \r
mr_p = ibv_reg_mr(ib_pd_p, map_qp_ibal_acl(p_mr_create->access_ctrl), \r
p_mr_create->vaddr, p_mr_create->length, \r
- (uint64_t)p_mr_create->vaddr, um_call, TRUE );\r
+ (ULONG_PTR)p_mr_create->vaddr, um_call, TRUE );\r
if (IS_ERR(mr_p)) {\r
err = PTR_ERR(mr_p);\r
HCA_PRINT(TRACE_LEVEL_ERROR, HCA_DBG_MEMORY,\r
-/*
- * Copyright (c) 2004 Topspin Corporation. All rights reserved.
- * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * $Id$
- */
- #include "hca_driver.h"
-#include "mthca_dev.h"
-#if defined (EVENT_TRACING)
-#ifdef offsetof
-#undef offsetof
-#endif
-#include "mt_memory.tmh"
-#endif
-
-#include "mt_pa_cash.h"
-
-
-/*
-* Function: map user buffer to kernel and lock it
-*
-* Return:
-*/
-int get_user_pages(
- IN struct mthca_dev *dev, /* device */
- IN u64 start, /* address in user space */
- IN int npages, /* size in pages */
- IN int write_access, /* access rights */
- OUT struct scatterlist *sg /* s/g list */
- )
-{
- PMDL mdl_p;
- int size = npages << PAGE_SHIFT;
- int access = (write_access) ? IoWriteAccess : IoReadAccess;
- int err;
- void * kva; /* kernel virtual address */
-
- UNREFERENCED_PARAMETER(dev);
-
- HCA_ENTER(HCA_DBG_MEMORY);
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
-
- /* allocate MDL */
- mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)start, (ULONG)size,
- FALSE,
- FALSE, /* not charge quota */
- NULL);
- if (mdl_p == NULL) {
- err = -ENOMEM;
- goto err0;
- }
-
- /* lock memory */
- __try {
- MmProbeAndLockPages( mdl_p, UserMode, access );
- }
- __except (EXCEPTION_EXECUTE_HANDLER)
- {
- NTSTATUS Status = GetExceptionCode();
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_MEMORY ,("Exception 0x%x on MmProbeAndLockPages(), addr 0x%I64x, size %d\n", Status, start, size));
- switch(Status){
- case STATUS_WORKING_SET_QUOTA:
- err = -ENOMEM;break;
- case STATUS_ACCESS_VIOLATION:
- err = -EACCES;break;
- default :
- err = -EINVAL;
- }
-
- goto err1;
- }
-
- /* map it to kernel */
- kva = MmMapLockedPagesSpecifyCache( mdl_p,
- KernelMode, MmNonCached,
- NULL, FALSE, NormalPagePriority );
- if (kva == NULL) {
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_MEMORY ,("MmMapLockedPagesSpecifyCache failed\n"));
- err = -EFAULT;
- goto err2;
- }
-
- sg->page = kva;
- sg->length = size;
- sg->offset = (unsigned int)(start & ~PAGE_MASK);
- sg->p_mdl = mdl_p;
- sg->dma_address = MmGetPhysicalAddress(kva).QuadPart;
- return 0;
-
-err2:
- MmUnlockPages(mdl_p);
-err1:
- IoFreeMdl(mdl_p);
-err0:
- HCA_EXIT(HCA_DBG_MEMORY);
- return err;
-
- }
-
-void put_page(struct scatterlist *sg)
-{
- if (sg->p_mdl) {
- MmUnmapLockedPages( sg->page, sg->p_mdl );
- MmUnlockPages(sg->p_mdl);
- IoFreeMdl(sg->p_mdl);
- }
-}
-
-VOID
- AdapterListControl(
- IN PDEVICE_OBJECT DeviceObject,
- IN PIRP Irp,
- IN PSCATTER_GATHER_LIST ScatterGather,
- IN PVOID Context
- )
-{
- struct scatterlist *p_sg = (struct scatterlist *)Context;
-
- UNREFERENCED_PARAMETER(DeviceObject);
- UNREFERENCED_PARAMETER(Irp);
-
- // sanity checks
- if (!ScatterGather || !Context) {
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("AdapterListControl failed: invalid parameters\n"));
- return;
- }
- if (ScatterGather->NumberOfElements > 1) {
- HCA_PRINT(TRACE_LEVEL_WARNING ,HCA_DBG_LOW ,("AdapterListControl failed: unexpected sg size; %d elements \n",
- ScatterGather->NumberOfElements ));
- }
- if (ScatterGather->Elements[0].Length != p_sg->length) {
- HCA_PRINT(TRACE_LEVEL_WARNING ,HCA_DBG_LOW ,("AdapterListControl failed: unexpected buffer size %#x (expected %#x) \n",
- ScatterGather->Elements[0].Length, p_sg->length ));
- }
-
- // results
- p_sg->dma_address = ScatterGather->Elements[0].Address.QuadPart; // get logical address
- p_sg->p_os_sg = ScatterGather; // store sg list address for releasing
- //NB: we do not flush the buffers by FlushAdapterBuffers(), because we don't really transfer data
-}
-
-/* Returns: the number of mapped sg elements */
-int pci_map_sg(struct mthca_dev *dev,
- struct scatterlist *sg, int nents, int direction)
-{
-#ifndef USE_GET_SG_LIST
-
- UNREFERENCED_PARAMETER(dev);
- UNREFERENCED_PARAMETER(sg);
- UNREFERENCED_PARAMETER(direction);
-
- // mapping was performed in alloc_dma_mem
- return nents;
-
-#else
-
- int i;
- NTSTATUS status;
- hca_dev_ext_t *p_ext = dev->ext;
- struct scatterlist *p_sg = sg;
- KIRQL irql = KeRaiseIrqlToDpcLevel();
-
- for (i=0; i<nents; ++i, ++p_sg) {
- status = p_ext->p_dma_adapter->DmaOperations->GetScatterGatherList(
- p_ext->p_dma_adapter, p_ext->cl_ext.p_self_do, p_sg->p_mdl, p_sg->page,
- p_sg->length, AdapterListControl, sg, (BOOLEAN)direction );
- if (!NT_SUCCESS(status)) {
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("GetScatterGatherList failed %#x\n", status)));
- break;
- }
- }
- KeLowerIrql(irql);
- return i; /* i.e., we mapped all the entries */
-
-#endif
-}
-
-/* Returns: the number of unmapped sg elements */
-int pci_unmap_sg(struct mthca_dev *dev,
- struct scatterlist *sg, int nents, int direction)
-{
-#ifndef USE_GET_SG_LIST
-
- UNREFERENCED_PARAMETER(dev);
- UNREFERENCED_PARAMETER(sg);
- UNREFERENCED_PARAMETER(direction);
- // mapping was performed in alloc_dma_mem
- return nents;
-
-#else
-
- int i;
- hca_dev_ext_t *p_ext = dev->ext;
- struct scatterlist *p_sg = sg;
- KIRQL irql = KeRaiseIrqlToDpcLevel();
- void *p_os_sg = p_sg->p_os_sg;
-
- for (i=0; i<nents; ++i, ++p_sg) {
- if (p_os_sg)
- p_sg->p_os_sg = NULL;
- p_ext->p_dma_adapter->DmaOperations->PutScatterGatherList(
- p_ext->p_dma_adapter, p_os_sg, (BOOLEAN)direction );
- }
- KeLowerIrql(irql);
- return i; /* i.e., we mapped all the entries */
-
-#endif
-}
-
-/* The function zeroes 'struct scatterlist' and then fills it with values.
- On error 'struct scatterlist' is returned zeroed */
-void *alloc_dma_mem(
- IN struct mthca_dev *dev,
- IN unsigned long size,
- OUT struct scatterlist *p_sg)
-{
- void *va;
- DMA_ADAPTER *p_dma = dev->ext->p_dma_adapter;
-
-#ifndef USE_GET_SG_LIST
-
- PHYSICAL_ADDRESS pa = {0};
- ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);
-
- RtlZeroMemory(p_sg,sizeof *p_sg);
- if (!size)
- return NULL;
-
- va = p_dma->DmaOperations->AllocateCommonBuffer(
- p_dma, size, &pa, FALSE );
- if (va) {
- p_sg->length = size;
- p_sg->dma_address = pa.QuadPart;
- p_sg->page = va;
- }
-
-#else
-
- int err;
- PHYSICAL_ADDRESS la = {0}, ba = {0}, ha = {(u64)(-1I64)};
- PMDL p_mdl;
-
- ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
-
- RtlZeroMemory(p_sg,sizeof *p_sg);
- if (!size)
- return NULL;
-
- // allocate memory
- va = MmAllocateContiguousMemorySpecifyCache(
- size, la, ha, ba, MmNonCached );
- if (!va) {
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("MmAllocateContiguousMemorySpecifyCache failed on %#x size\n", size )));
- goto err_alloc;
- }
-
- // allocate MDL
- p_mdl = IoAllocateMdl( va, size, FALSE, FALSE, NULL );
- if (!p_mdl) {
- HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("MmAllocateContiguousMemorySpecifyCache failed on %#x size\n", size )));
- goto err_mdl;
- }
- MmBuildMdlForNonPagedPool( p_mdl );
-
- p_sg->p_mdl = p_mdl;
- p_sg->length = size;
- p_sg->page = va;
-
- goto end;
-
-err_mdl:
- MmFreeContiguousMemory(va);
- va = NULL;
-err_alloc:
-end:
-
-#endif
-
- return va;
-}
-
-void free_dma_mem(
- IN struct mthca_dev *dev,
- IN struct scatterlist *p_sg)
-{
-#ifndef USE_GET_SG_LIST
-
- PHYSICAL_ADDRESS pa;
- DMA_ADAPTER *p_dma = dev->ext->p_dma_adapter;
-
- ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);
-
- if (p_sg->length) {
- pa.QuadPart = p_sg->dma_address;
- p_dma->DmaOperations->FreeCommonBuffer(
- p_dma, p_sg->length, pa,
- p_sg->page, FALSE );
- }
-
-#else
-
- PMDL p_mdl = p_sg->p_mdl;
- PVOID page = p_sg->page;
-
- ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);
- if (p_mdl) {
- p_sg->p_mdl = NULL;
- IoFreeMdl( p_mdl );
- }
- if (page) {
- p_sg->page = NULL;
- MmFreeContiguousMemory(page);
- }
-
-#endif
-}
-
-
-typedef struct _mt_iobuf_seg {
- LIST_ENTRY link;
- PMDL mdl_p;
- u64 va; /* virtual address of the buffer */
- u64 size; /* size in bytes of the buffer */
- u32 nr_pages;
- int is_user;
-} mt_iobuf_seg_t;
-
-// Returns: 0 on success, -ENOMEM or -EACCESS on error
-static int register_segment(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN ib_access_t acc,
- OUT mt_iobuf_seg_t **iobuf_seg)
-{
- PMDL mdl_p;
- int rc;
- KPROCESSOR_MODE mode;
- mt_iobuf_seg_t * new_iobuf;
- static ULONG cnt=0;
- LOCK_OPERATION Operation;
-
- // set Operation
- if (acc & IB_AC_LOCAL_WRITE)
- Operation = IoModifyAccess;
- else
- Operation = IoReadAccess;
-
- // allocate IOBUF segment object
- new_iobuf = (mt_iobuf_seg_t *)kmalloc(sizeof(mt_iobuf_seg_t), GFP_KERNEL );
- if (new_iobuf == NULL) {
- rc = -ENOMEM;
- goto err_nomem;
- }
-
- // allocate MDL
- mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)va, (ULONG)size, FALSE,FALSE,NULL);
- if (mdl_p == NULL) {
- rc = -ENOMEM;
- goto err_alloc_mdl;
- }
-
- // make context-dependent things
- if (is_user) {
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
- mode = UserMode;
- }
- else { /* Mapping to kernel virtual address */
- // MmBuildMdlForNonPagedPool(mdl_p); // fill MDL ??? - should we do that really ?
- mode = KernelMode;
- }
-
- __try { /* try */
- MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */
- } /* try */
-
- __except (EXCEPTION_EXECUTE_HANDLER) {
- HCA_PRINT(TRACE_LEVEL_ERROR, HCA_DBG_MEMORY,
- ("MOSAL_iobuf_register: Exception 0x%x on MmProbeAndLockPages(), va %I64d, sz %I64d\n",
- GetExceptionCode(), va, size));
- rc = -EACCES;
- goto err_probe;
- }
-
- // fill IOBUF object
- new_iobuf->va = va;
- new_iobuf->size= size;
- new_iobuf->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
- new_iobuf->mdl_p = mdl_p;
- new_iobuf->is_user = is_user;
- *iobuf_seg = new_iobuf;
- return 0;
-
-err_probe:
- IoFreeMdl(mdl_p);
-err_alloc_mdl:
- ExFreePool((PVOID)new_iobuf);
-err_nomem:
- return rc;
-}
-
-void iobuf_init(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN OUT mt_iobuf_t *iobuf_p)
-{
- iobuf_p->va = va;
- iobuf_p->size= size;
- iobuf_p->is_user = is_user;
- InitializeListHead( &iobuf_p->seg_que );
- iobuf_p->seg_num = 0;
- iobuf_p->nr_pages = 0;
- iobuf_p->is_cashed = 0;
-}
-
-int iobuf_register(
- IN u64 va,
- IN u64 size,
- IN int is_user,
- IN ib_access_t acc,
- IN OUT mt_iobuf_t *iobuf_p)
-{
- int rc=0;
- u64 seg_va; // current segment start
- u64 seg_size; // current segment size
- u64 rdc; // remain data counter - what is rest to lock
- u64 delta; // he size of the last not full page of the first segment
- mt_iobuf_seg_t * new_iobuf;
- unsigned page_size = PAGE_SIZE;
-
-// 32 - for any case
-#define PFNS_IN_PAGE_SIZE_MDL ((PAGE_SIZE - sizeof(struct _MDL) - 32) / sizeof(long))
-#define MIN_IOBUF_SEGMENT_SIZE (PAGE_SIZE * PFNS_IN_PAGE_SIZE_MDL) // 4MB
-
- ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
-
- // we'll try to register all at once.
- seg_va = va;
- seg_size = rdc = size;
-
- // allocate segments
- while (rdc > 0) {
- // map a segment
- rc = register_segment(seg_va, seg_size, is_user, acc, &new_iobuf );
-
- // success - move to another segment
- if (!rc) {
- rdc -= seg_size;
- seg_va += seg_size;
- InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );
- iobuf_p->seg_num++;
- // round the segment size to the next page boundary
- delta = (seg_va + seg_size) & (page_size - 1);
- if (delta) {
- seg_size -= delta;
- seg_size += page_size;
- }
- if (seg_size > rdc)
- seg_size = rdc;
- continue;
- }
-
- // failure - too large a buffer: lessen it and try once more
- if (rc == -ENOMEM) {
- // no where to lessen - too low memory
- if (seg_size <= MIN_IOBUF_SEGMENT_SIZE)
- break;
- // lessen the size
- seg_size >>= 1;
- // round the segment size to the next page boundary
- delta = (seg_va + seg_size) & (page_size - 1);
- if (delta) {
- seg_size -= delta;
- seg_size += page_size;
- }
- if (seg_size > rdc)
- seg_size = rdc;
- continue;
- }
-
- // got unrecoverable error
- break;
- }
-
- // SUCCESS
- if (rc)
- iobuf_deregister( iobuf_p );
- else
- iobuf_p->nr_pages += ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );
-
- return rc;
-}
-
-
-static void __iobuf_copy(
- IN OUT mt_iobuf_t *dst_iobuf_p,
- IN mt_iobuf_t *src_iobuf_p
- )
-{
- int i;
- mt_iobuf_seg_t *iobuf_seg_p;
-
- *dst_iobuf_p = *src_iobuf_p;
- InitializeListHead( &dst_iobuf_p->seg_que );
- for (i=0; i<src_iobuf_p->seg_num; ++i) {
- iobuf_seg_p = (mt_iobuf_seg_t *)(PVOID)RemoveHeadList( &src_iobuf_p->seg_que );
- InsertTailList( &dst_iobuf_p->seg_que, &iobuf_seg_p->link );
- }
-}
-
-/* if the buffer to be registered overlaps a buffer, already registered,
- a race can happen between HCA, writing to the previously registered
- buffer and the probing functions (MmProbeAndLockPages, MmSecureVirtualMemory),
- used in the algorithm of memory registration.
- To prevent the race we maintain reference counters for the physical pages, being registered,
- and register every physical page FOR THE WRITE ACCESS only once.*/
-
-int iobuf_register_with_cash(
- IN u64 vaddr,
- IN u64 size,
- IN int is_user,
- IN OUT ib_access_t *acc_p,
- IN OUT mt_iobuf_t *iobuf_p)
-{
- int rc, pa_in;
- mt_iobuf_t sec_iobuf;
- int i, page_in , page_out, page_in_total;
- int nr_pages;
- char *subregion_start, *va;
- u64 subregion_size;
- u64 rdc; // remain data counter - what is rest to lock
- u64 delta; // he size of the last not full page of the first segment
- ib_access_t acc;
-
- down(&g_pa_mutex);
-
- // register memory for read access to bring pages into the memory
- rc = iobuf_register( vaddr, size, is_user, 0, iobuf_p);
-
- // on error or read access - exit
- if (rc || !(*acc_p & IB_AC_LOCAL_WRITE))
- goto exit;
-
- // re-register buffer with the correct access rights
- iobuf_init( (u64)vaddr, size, is_user, &sec_iobuf );
- nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( vaddr, size );
- subregion_start = va = (char*)(ULONG_PTR)vaddr;
- rdc = size;
- pa_in = page_in = page_in_total = page_out = 0;
-
- for (i=0; i<nr_pages; ++i, va+=PAGE_SIZE) {
- // check whether a phys page is to be registered
- PHYSICAL_ADDRESS pa = MmGetPhysicalAddress(va);
- pa_in = pa_is_registered(pa.QuadPart);
- if (pa_in) {
- ++page_in;
- ++page_in_total;
- }
- else
- ++page_out;
-
- // check whether we get at the end of a subregion with the same rights wrt cash
- if (page_in && page_out) {
- // prepare to registration of the subregion
- if (pa_in) { // SUBREGION WITH WRITE ACCESS
- acc = IB_AC_LOCAL_WRITE;
- subregion_size = (u64)page_out * PAGE_SIZE;
- page_out = 0;
- }
- else { // SUBREGION WITH READ ACCESS
- acc = 0;
- subregion_size = (u64)page_in * PAGE_SIZE;
- page_in = 0;
- }
-
- // round the subregion size to the page boundary
- delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
- subregion_size -= delta;
- if (subregion_size > rdc)
- subregion_size = rdc;
-
- // register the subregion
- rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
- if (rc)
- goto cleanup;
-
- // prepare to the next loop
- rdc -= subregion_size;
- subregion_start +=subregion_size;
- }
- }
-
- // prepare to registration of the subregion
- if (pa_in) { // SUBREGION WITH READ ACCESS
- acc = 0;
- subregion_size = (u64)page_in * PAGE_SIZE;
- }
- else { // SUBREGION WITH WRITE ACCESS
- acc = IB_AC_LOCAL_WRITE;
- subregion_size = (u64)page_out * PAGE_SIZE;
- }
-
- // round the subregion size to the page boundary
- delta = (u64)(subregion_start + subregion_size) & (PAGE_SIZE - 1);
- subregion_size -= delta;
- if (subregion_size > rdc)
- subregion_size = rdc;
-
- // register the subregion
- rc = iobuf_register( (u64)subregion_start, subregion_size, is_user, acc, &sec_iobuf);
- if (rc)
- goto cleanup;
-
- // cash phys pages
- rc = pa_register(iobuf_p);
- if (rc)
- goto err_pa_reg;
-
- // replace the iobuf
- iobuf_deregister( iobuf_p );
- sec_iobuf.is_cashed = TRUE;
- __iobuf_copy( iobuf_p, &sec_iobuf );
-
- // buffer is a part of also registered buffer - change the rights
- if (page_in_total)
- *acc_p = MTHCA_ACCESS_REMOTE_READ;
-
- goto exit;
-
-err_pa_reg:
- iobuf_deregister( &sec_iobuf );
-cleanup:
- iobuf_deregister( iobuf_p );
-exit:
- up(&g_pa_mutex);
- return rc;
-}
-
-static void deregister_segment(mt_iobuf_seg_t * iobuf_seg_p)
-{
- MmUnlockPages( iobuf_seg_p->mdl_p ); // unlock the buffer
- IoFreeMdl( iobuf_seg_p->mdl_p ); // free MDL
- ExFreePool(iobuf_seg_p);
-}
-
-void iobuf_deregister(mt_iobuf_t *iobuf_p)
-{
- mt_iobuf_seg_t *iobuf_seg_p; // pointer to current segment object
-
- ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);
-
- // release segments
- while (!IsListEmpty( &iobuf_p->seg_que )) {
- iobuf_seg_p = (mt_iobuf_seg_t *)(PVOID)RemoveTailList( &iobuf_p->seg_que );
- deregister_segment(iobuf_seg_p);
- iobuf_p->seg_num--;
- }
- ASSERT(iobuf_p->seg_num == 0);
-}
-
-void iobuf_deregister_with_cash(mt_iobuf_t *iobuf_p)
-{
- ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);
-
- down(&g_pa_mutex);
- if (iobuf_p->is_cashed)
- pa_deregister(iobuf_p);
- iobuf_deregister(iobuf_p);
- up(&g_pa_mutex);
-}
-
-void iobuf_iter_init(
- IN mt_iobuf_t *iobuf_p,
- IN OUT mt_iobuf_iter_t *iterator_p)
-{
- iterator_p->seg_p = iobuf_p->seg_que.Flink;
- iterator_p->pfn_ix = 0;
-}
-
-// the function returns phys addresses of the pages, also for the first page
-// if one wants to get the phys address of the buffer, one has to
-// add the offset from the start of the page to the first phys address
-// Returns: the number of entries, filled in page_tbl_p
-// Returns 0 while at the end of list.
-uint32_t iobuf_get_tpt_seg(
- IN mt_iobuf_t *iobuf_p,
- IN OUT mt_iobuf_iter_t *iterator_p,
- IN uint32_t n_pages_in,
- IN OUT uint64_t *page_tbl_p )
-{
- uint32_t i=0; // has to be initialized here for a premature exit
- mt_iobuf_seg_t *seg_p; // pointer to current segment object
- PPFN_NUMBER pfn_p;
- uint32_t pfn_ix; // index of PFN in PFN array of the current segment
- uint64_t *pa_buf_p = page_tbl_p;
-
- // prepare to the loop
- seg_p = iterator_p->seg_p; // first segment of the first iobuf
- pfn_ix= iterator_p->pfn_ix;
-
- // check, whether we at the end of the list
- if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que)
- goto exit;
- pfn_p = MmGetMdlPfnArray( seg_p->mdl_p ) + pfn_ix;
-
- // pass along all the PFN arrays
- for (; i < n_pages_in; i++, pa_buf_p++) {
- // convert PFN to the physical address
- *pa_buf_p = (uint64_t)*pfn_p++ << PAGE_SHIFT;
-
- // get to the next PFN
- if (++pfn_ix >= seg_p->nr_pages) {
- seg_p = (mt_iobuf_seg_t*)seg_p->link.Flink;
- pfn_ix = 0;
- if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que) {
- i++;
- break;
- }
- pfn_p = MmGetMdlPfnArray( seg_p->mdl_p );
- }
- }
-
-exit:
- iterator_p->seg_p = seg_p;
- iterator_p->pfn_ix = pfn_ix;
- return i;
-}
-
-
-
-
+/*\r
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.\r
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.\r
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
+ *\r
+ * This software is available to you under a choice of one of two\r
+ * licenses. You may choose to be licensed under the terms of the GNU\r
+ * General Public License (GPL) Version 2, available from the file\r
+ * COPYING in the main directory of this source tree, or the\r
+ * OpenIB.org BSD license below:\r
+ *\r
+ * Redistribution and use in source and binary forms, with or\r
+ * without modification, are permitted provided that the following\r
+ * conditions are met:\r
+ *\r
+ * - Redistributions of source code must retain the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer.\r
+ *\r
+ * - Redistributions in binary form must reproduce the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer in the documentation and/or other materials\r
+ * provided with the distribution.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ *\r
+ * $Id$\r
+ */\r
+#include "hca_driver.h"\r
+#include "mthca_dev.h"\r
+#if defined (EVENT_TRACING)\r
+#ifdef offsetof\r
+#undef offsetof\r
+#endif\r
+#include "mt_memory.tmh"\r
+#endif \r
+\r
+#include "mt_pa_cash.h"\r
+\r
+\r
+/*\r
+* Function: map user buffer to kernel and lock it\r
+*\r
+* Return: \r
+*/\r
+int get_user_pages(\r
+ IN struct mthca_dev *dev, /* device */\r
+ IN u64 start, /* address in user space */\r
+ IN int npages, /* size in pages */\r
+ IN int write_access, /* access rights */\r
+ OUT struct scatterlist *sg /* s/g list */\r
+ )\r
+{\r
+ PMDL mdl_p;\r
+ int size = npages << PAGE_SHIFT;\r
+ int access = (write_access) ? IoWriteAccess : IoReadAccess;\r
+ int err;\r
+ void * kva; /* kernel virtual address */\r
+\r
+ UNREFERENCED_PARAMETER(dev);\r
+ \r
+ HCA_ENTER(HCA_DBG_MEMORY);\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+ \r
+ /* allocate MDL */\r
+ mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)start, (ULONG)size, \r
+ FALSE,\r
+ FALSE, /* not charge quota */\r
+ NULL);\r
+ if (mdl_p == NULL) {\r
+ err = -ENOMEM; \r
+ goto err0;\r
+ }\r
+\r
+ /* lock memory */\r
+ __try { \r
+ MmProbeAndLockPages( mdl_p, UserMode, access ); \r
+ } \r
+ __except (EXCEPTION_EXECUTE_HANDLER)\r
+ {\r
+ NTSTATUS Status = GetExceptionCode();\r
+ HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_MEMORY ,("Exception 0x%x on MmProbeAndLockPages(), addr 0x%I64x, size %d\n", Status, start, size));\r
+ switch(Status){\r
+ case STATUS_WORKING_SET_QUOTA:\r
+ err = -ENOMEM;break;\r
+ case STATUS_ACCESS_VIOLATION:\r
+ err = -EACCES;break;\r
+ default :\r
+ err = -EINVAL;\r
+ }\r
+\r
+ goto err1;\r
+ }\r
+\r
+ /* map it to kernel */\r
+ kva = MmMapLockedPagesSpecifyCache( mdl_p, \r
+ KernelMode, MmNonCached, \r
+ NULL, FALSE, NormalPagePriority );\r
+ if (kva == NULL) {\r
+ HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_MEMORY ,("MmMapLockedPagesSpecifyCache failed\n"));\r
+ err = -EFAULT;\r
+ goto err2;\r
+ }\r
+\r
+ sg->page = kva;\r
+ sg->length = size;\r
+ sg->offset = (unsigned int)(start & ~PAGE_MASK);\r
+ sg->p_mdl = mdl_p; \r
+ sg->dma_address = MmGetPhysicalAddress(kva).QuadPart;\r
+ return 0; \r
+ \r
+err2: \r
+ MmUnlockPages(mdl_p);\r
+err1: \r
+ IoFreeMdl(mdl_p);\r
+err0:\r
+ HCA_EXIT(HCA_DBG_MEMORY);\r
+ return err;\r
+ \r
+ }\r
+\r
+void put_page(struct scatterlist *sg)\r
+{\r
+ if (sg->p_mdl) {\r
+ MmUnmapLockedPages( sg->page, sg->p_mdl );\r
+ MmUnlockPages(sg->p_mdl);\r
+ IoFreeMdl(sg->p_mdl);\r
+ }\r
+}\r
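+\r
+/* A minimal usage sketch of the two helpers above: lock a user buffer, touch it\r
+   through its kernel mapping, then release it. The helper name and its arguments\r
+   are assumptions of the sketch; error handling is reduced to the bare minimum. */\r
+#if 0\r
+static int use_user_buffer( struct mthca_dev *dev, u64 user_va, int npages, struct scatterlist *sg )\r
+{\r
+	int err = get_user_pages( dev, user_va, npages, 1 /* write access */, sg );\r
+	if (err)\r
+		return err;	/* -ENOMEM, -EACCES, -EINVAL or -EFAULT */\r
+\r
+	/* sg->page is a kernel-virtual alias of the user buffer,\r
+	   sg->dma_address its physical address - use them here */\r
+\r
+	put_page( sg );	/* unmap, unlock and free the MDL */\r
+	return 0;\r
+}\r
+#endif\r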
+\r
+VOID\r
+ AdapterListControl(\r
+ IN PDEVICE_OBJECT DeviceObject,\r
+ IN PIRP Irp,\r
+ IN PSCATTER_GATHER_LIST ScatterGather,\r
+ IN PVOID Context\r
+ )\r
+{\r
+ struct scatterlist *p_sg = (struct scatterlist *)Context;\r
+\r
+ UNREFERENCED_PARAMETER(DeviceObject);\r
+ UNREFERENCED_PARAMETER(Irp);\r
+\r
+ // sanity checks\r
+ if (!ScatterGather || !Context) {\r
+ HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("AdapterListControl failed: invalid parameters\n"));\r
+ return;\r
+ }\r
+ if (ScatterGather->NumberOfElements > 1) {\r
+ HCA_PRINT(TRACE_LEVEL_WARNING ,HCA_DBG_LOW ,("AdapterListControl failed: unexpected sg size; %d elements \n",\r
+ ScatterGather->NumberOfElements ));\r
+ }\r
+ if (ScatterGather->Elements[0].Length != p_sg->length) {\r
+ HCA_PRINT(TRACE_LEVEL_WARNING ,HCA_DBG_LOW ,("AdapterListControl failed: unexpected buffer size %#x (expected %#x) \n",\r
+ ScatterGather->Elements[0].Length, p_sg->length ));\r
+ }\r
+\r
+ // results \r
+ p_sg->dma_address = ScatterGather->Elements[0].Address.QuadPart; // get logical address\r
+ p_sg->p_os_sg = ScatterGather; // store sg list address for releasing\r
+ //NB: we do not flush the buffers by FlushAdapterBuffers(), because we don't really transfer data\r
+}\r
+\r
+/* Returns: the number of mapped sg elements */\r
+int pci_map_sg(struct mthca_dev *dev, \r
+ struct scatterlist *sg, int nents, int direction)\r
+{\r
+#ifndef USE_GET_SG_LIST\r
+\r
+ UNREFERENCED_PARAMETER(dev);\r
+ UNREFERENCED_PARAMETER(sg);\r
+ UNREFERENCED_PARAMETER(direction);\r
+\r
+ // mapping was performed in alloc_dma_mem\r
+ return nents;\r
+\r
+#else\r
+\r
+ int i;\r
+ NTSTATUS status;\r
+ hca_dev_ext_t *p_ext = dev->ext;\r
+ struct scatterlist *p_sg = sg;\r
+ KIRQL irql = KeRaiseIrqlToDpcLevel();\r
+\r
+ for (i=0; i<nents; ++i, ++p_sg) {\r
+ status = p_ext->p_dma_adapter->DmaOperations->GetScatterGatherList( \r
+ p_ext->p_dma_adapter, p_ext->cl_ext.p_self_do, p_sg->p_mdl, p_sg->page, \r
+ p_sg->length, AdapterListControl, sg, (BOOLEAN)direction );\r
+ if (!NT_SUCCESS(status)) {\r
+			HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("GetScatterGatherList failed %#x\n", status));\r
+ break;\r
+ }\r
+ }\r
+ KeLowerIrql(irql);\r
+	return i; /* the number of entries actually mapped */\r
+\r
+#endif \r
+}\r
+\r
+/* Returns: the number of unmapped sg elements */\r
+int pci_unmap_sg(struct mthca_dev *dev, \r
+ struct scatterlist *sg, int nents, int direction)\r
+{\r
+#ifndef USE_GET_SG_LIST\r
+ \r
+ UNREFERENCED_PARAMETER(dev);\r
+ UNREFERENCED_PARAMETER(sg);\r
+ UNREFERENCED_PARAMETER(direction);\r
+ // mapping was performed in alloc_dma_mem\r
+ return nents;\r
+ \r
+#else\r
+\r
+ int i;\r
+ hca_dev_ext_t *p_ext = dev->ext;\r
+ struct scatterlist *p_sg = sg;\r
+ KIRQL irql = KeRaiseIrqlToDpcLevel();\r
+	void *p_os_sg;\r
+\r
+	for (i=0; i<nents; ++i, ++p_sg) {\r
+		/* release the OS SG list that AdapterListControl stored for this entry */\r
+		p_os_sg = p_sg->p_os_sg;\r
+		if (p_os_sg) {\r
+			p_sg->p_os_sg = NULL;\r
+			p_ext->p_dma_adapter->DmaOperations->PutScatterGatherList( \r
+				p_ext->p_dma_adapter, p_os_sg, (BOOLEAN)direction );\r
+		}\r
+	}\r
+	KeLowerIrql(irql);\r
+	return i; /* the number of entries actually unmapped */\r
+\r
+#endif \r
+}\r
+\r
+/* The function zeroes 'struct scatterlist' and then fills it in.\r
+	On error 'struct scatterlist' is left zeroed and NULL is returned */\r
+void *alloc_dma_mem(\r
+ IN struct mthca_dev *dev, \r
+ IN unsigned long size,\r
+ OUT struct scatterlist *p_sg)\r
+{\r
+ void *va;\r
+ DMA_ADAPTER *p_dma = dev->ext->p_dma_adapter;\r
+\r
+#ifndef USE_GET_SG_LIST\r
+\r
+ PHYSICAL_ADDRESS pa = {0};\r
+ ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);\r
+\r
+ RtlZeroMemory(p_sg,sizeof *p_sg);\r
+ if (!size)\r
+ return NULL;\r
+\r
+ va = p_dma->DmaOperations->AllocateCommonBuffer(\r
+ p_dma, size, &pa, FALSE );\r
+ if (va) {\r
+ p_sg->length = size;\r
+ p_sg->dma_address = pa.QuadPart;\r
+ p_sg->page = va;\r
+ }\r
+\r
+#else\r
+\r
+ int err;\r
+ PHYSICAL_ADDRESS la = {0}, ba = {0}, ha = {(u64)(-1I64)};\r
+ PMDL p_mdl;\r
+\r
+ ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);\r
+\r
+ RtlZeroMemory(p_sg,sizeof *p_sg);\r
+ if (!size)\r
+ return NULL;\r
+\r
+ // allocate memory\r
+ va = MmAllocateContiguousMemorySpecifyCache(\r
+ size, la, ha, ba, MmNonCached );\r
+ if (!va) {\r
+		HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("MmAllocateContiguousMemorySpecifyCache failed on %#x size\n", size ));\r
+ goto err_alloc;\r
+ }\r
+\r
+ // allocate MDL \r
+ p_mdl = IoAllocateMdl( va, size, FALSE, FALSE, NULL );\r
+ if (!p_mdl) {\r
+		HCA_PRINT(TRACE_LEVEL_ERROR ,HCA_DBG_LOW ,("IoAllocateMdl failed on %#x size\n", size ));\r
+ goto err_mdl;\r
+ }\r
+ MmBuildMdlForNonPagedPool( p_mdl );\r
+\r
+ p_sg->p_mdl = p_mdl;\r
+ p_sg->length = size;\r
+ p_sg->page = va;\r
+\r
+ goto end;\r
+\r
+err_mdl:\r
+ MmFreeContiguousMemory(va);\r
+ va = NULL;\r
+err_alloc:\r
+end:\r
+\r
+#endif\r
+\r
+ return va;\r
+}\r
+\r
+void free_dma_mem(\r
+ IN struct mthca_dev *dev, \r
+ IN struct scatterlist *p_sg)\r
+{\r
+#ifndef USE_GET_SG_LIST\r
+\r
+ PHYSICAL_ADDRESS pa;\r
+ DMA_ADAPTER *p_dma = dev->ext->p_dma_adapter;\r
+\r
+ ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);\r
+\r
+ if (p_sg->length) {\r
+ pa.QuadPart = p_sg->dma_address;\r
+ p_dma->DmaOperations->FreeCommonBuffer( \r
+ p_dma, p_sg->length, pa, \r
+ p_sg->page, FALSE );\r
+ }\r
+\r
+#else\r
+\r
+ PMDL p_mdl = p_sg->p_mdl;\r
+ PVOID page = p_sg->page;\r
+\r
+ ASSERT(KeGetCurrentIrql() == PASSIVE_LEVEL);\r
+ if (p_mdl) {\r
+ p_sg->p_mdl = NULL;\r
+ IoFreeMdl( p_mdl );\r
+ }\r
+ if (page) {\r
+ p_sg->page = NULL;\r
+ MmFreeContiguousMemory(page); \r
+ }\r
+\r
+#endif\r
+}\r
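+\r
+/* A minimal usage sketch for alloc_dma_mem()/free_dma_mem() above: allocate a\r
+   DMA-able buffer described by a scatterlist, hand sg.dma_address to the HW,\r
+   then release it. The helper name and the 4 KB size are assumptions of the sketch. */\r
+#if 0\r
+static int dma_buffer_example( struct mthca_dev *dev )\r
+{\r
+	struct scatterlist sg;\r
+	void *va = alloc_dma_mem( dev, 4096, &sg );\r
+	if (!va)\r
+		return -ENOMEM;\r
+\r
+	/* sg.page == va, sg.length == 4096, sg.dma_address is the bus address */\r
+	RtlZeroMemory( va, sg.length );\r
+\r
+	free_dma_mem( dev, &sg );\r
+	return 0;\r
+}\r
+#endif\r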
+\r
+\r
+typedef struct _mt_iobuf_seg {\r
+ LIST_ENTRY link;\r
+ PMDL mdl_p;\r
+ u64 va; /* virtual address of the buffer */\r
+ u64 size; /* size in bytes of the buffer */\r
+ u32 nr_pages;\r
+ int is_user;\r
+} mt_iobuf_seg_t;\r
+\r
+// Returns: 0 on success, -ENOMEM or -EACCES on error\r
+static int register_segment(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN ib_access_t acc,\r
+ OUT mt_iobuf_seg_t **iobuf_seg)\r
+{\r
+ PMDL mdl_p;\r
+ int rc;\r
+ KPROCESSOR_MODE mode; \r
+ mt_iobuf_seg_t * new_iobuf;\r
+ static ULONG cnt=0;\r
+ LOCK_OPERATION Operation;\r
+\r
+ // set Operation\r
+ if (acc & IB_AC_LOCAL_WRITE)\r
+ Operation = IoModifyAccess;\r
+ else\r
+ Operation = IoReadAccess;\r
+ \r
+ // allocate IOBUF segment object\r
+ new_iobuf = (mt_iobuf_seg_t *)kmalloc(sizeof(mt_iobuf_seg_t), GFP_KERNEL );\r
+ if (new_iobuf == NULL) {\r
+ rc = -ENOMEM;\r
+ goto err_nomem;\r
+ }\r
+\r
+ // allocate MDL \r
+ mdl_p = IoAllocateMdl( (PVOID)(ULONG_PTR)va, (ULONG)size, FALSE,FALSE,NULL);\r
+ if (mdl_p == NULL) {\r
+ rc = -ENOMEM;\r
+ goto err_alloc_mdl;\r
+ }\r
+\r
+ // make context-dependent things\r
+ if (is_user) {\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+ mode = UserMode;\r
+ }\r
+ else { /* Mapping to kernel virtual address */\r
+		// MmBuildMdlForNonPagedPool(mdl_p);	// fill the MDL - is this really needed here?\r
+ mode = KernelMode;\r
+ }\r
+\r
+ __try { /* try */\r
+ MmProbeAndLockPages( mdl_p, mode, Operation ); /* lock memory */\r
+ } /* try */\r
+ \r
+ __except (EXCEPTION_EXECUTE_HANDLER) {\r
+ HCA_PRINT(TRACE_LEVEL_ERROR, HCA_DBG_MEMORY, \r
+			("register_segment: Exception 0x%x on MmProbeAndLockPages(), va %#I64x, size %#I64x\n", \r
+ GetExceptionCode(), va, size));\r
+ rc = -EACCES;\r
+ goto err_probe;\r
+ }\r
+ \r
+ // fill IOBUF object\r
+ new_iobuf->va = va;\r
+ new_iobuf->size= size;\r
+ new_iobuf->nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );\r
+ new_iobuf->mdl_p = mdl_p;\r
+ new_iobuf->is_user = is_user;\r
+ *iobuf_seg = new_iobuf;\r
+ return 0;\r
+\r
+err_probe:\r
+ IoFreeMdl(mdl_p);\r
+err_alloc_mdl: \r
+ ExFreePool((PVOID)new_iobuf);\r
+err_nomem: \r
+ return rc;\r
+}\r
+\r
+void iobuf_init(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN OUT mt_iobuf_t *iobuf_p)\r
+{\r
+ iobuf_p->va = va;\r
+ iobuf_p->size= size;\r
+ iobuf_p->is_user = is_user;\r
+ InitializeListHead( &iobuf_p->seg_que );\r
+ iobuf_p->seg_num = 0;\r
+ iobuf_p->nr_pages = 0;\r
+ iobuf_p->is_cashed = 0;\r
+}\r
+\r
+int iobuf_register(\r
+ IN u64 va,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN ib_access_t acc,\r
+ IN OUT mt_iobuf_t *iobuf_p)\r
+{\r
+ int rc=0;\r
+ u64 seg_va; // current segment start\r
+ u64 seg_size; // current segment size\r
+	u64 rdc;			// remaining data counter - how much is still left to lock\r
+	u64 delta;			// the size of the last, partially filled page of the segment\r
+ mt_iobuf_seg_t * new_iobuf;\r
+ unsigned page_size = PAGE_SIZE;\r
+\r
+// 32 bytes of slack, just in case \r
+#define PFNS_IN_PAGE_SIZE_MDL ((PAGE_SIZE - sizeof(struct _MDL) - 32) / sizeof(long))\r
+#define MIN_IOBUF_SEGMENT_SIZE (PAGE_SIZE * PFNS_IN_PAGE_SIZE_MDL) // 4MB \r
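+/* Rough sizing behind these macros (assuming 4 KB pages, an MDL header of a few\r
+   tens of bytes and the 4-byte PFN entries the macro assumes via sizeof(long)):\r
+   a single page can hold on the order of a thousand PFNs, so one page-sized MDL\r
+   describes roughly 4 MB of buffer - hence the "4MB" note above and the lower\r
+   bound used when shrinking segments below. */\r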
+\r
+ ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);\r
+\r
+ // we'll try to register all at once.\r
+ seg_va = va;\r
+ seg_size = rdc = size;\r
+ \r
+ // allocate segments\r
+ while (rdc > 0) {\r
+ // map a segment\r
+ rc = register_segment(seg_va, seg_size, is_user, acc, &new_iobuf );\r
+\r
+ // success - move to another segment\r
+ if (!rc) {\r
+ rdc -= seg_size;\r
+ seg_va += seg_size;\r
+ InsertTailList( &iobuf_p->seg_que, &new_iobuf->link );\r
+ iobuf_p->seg_num++;\r
+ // round the segment size to the next page boundary \r
+ delta = (seg_va + seg_size) & (page_size - 1);\r
+ if (delta) {\r
+ seg_size -= delta;\r
+ seg_size += page_size;\r
+ }\r
+ if (seg_size > rdc)\r
+ seg_size = rdc;\r
+ continue;\r
+ }\r
+\r
+		// failure - the buffer is too large: shrink it and try once more\r
+		if (rc == -ENOMEM) {\r
+			// nothing left to shrink - the system is simply out of memory\r
+			if (seg_size <= MIN_IOBUF_SEGMENT_SIZE)\r
+				break;\r
+			// halve the size\r
+			seg_size >>= 1;\r
+ // round the segment size to the next page boundary \r
+ delta = (seg_va + seg_size) & (page_size - 1);\r
+ if (delta) {\r
+ seg_size -= delta;\r
+ seg_size += page_size;\r
+ }\r
+ if (seg_size > rdc)\r
+ seg_size = rdc;\r
+ continue;\r
+ }\r
+\r
+ // got unrecoverable error\r
+ break;\r
+ }\r
+\r
+	// on failure, release whatever has been registered; on success, account the locked pages\r
+ if (rc) \r
+ iobuf_deregister( iobuf_p );\r
+ else \r
+ iobuf_p->nr_pages += ADDRESS_AND_SIZE_TO_SPAN_PAGES( va, size );\r
+\r
+ return rc;\r
+}\r
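+\r
+/* A worked example of the registration loop above (sizes are illustrative only):\r
+	for a 16 MB user buffer, if allocating an MDL for the whole 16 MB fails with\r
+	-ENOMEM, the loop halves seg_size to 8 MB and then to 4 MB (never going below\r
+	MIN_IOBUF_SEGMENT_SIZE); once a size succeeds it keeps registering page-aligned\r
+	segments of that size, queueing each on seg_que, until rdc drops to zero. */\r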
+\r
+\r
+static void __iobuf_copy(\r
+ IN OUT mt_iobuf_t *dst_iobuf_p,\r
+ IN mt_iobuf_t *src_iobuf_p\r
+ )\r
+{\r
+ int i;\r
+ mt_iobuf_seg_t *iobuf_seg_p;\r
+ \r
+ *dst_iobuf_p = *src_iobuf_p;\r
+ InitializeListHead( &dst_iobuf_p->seg_que );\r
+ for (i=0; i<src_iobuf_p->seg_num; ++i) {\r
+ iobuf_seg_p = (mt_iobuf_seg_t *)(PVOID)RemoveHeadList( &src_iobuf_p->seg_que );\r
+ InsertTailList( &dst_iobuf_p->seg_que, &iobuf_seg_p->link );\r
+ }\r
+}\r
+\r
+/* If the buffer being registered overlaps an already registered buffer, \r
+	a race can occur between the HCA, which may be writing to the previously registered\r
+	buffer, and the probing functions (MmProbeAndLockPages, MmSecureVirtualMemory)\r
+	used by the memory registration algorithm.\r
+	To prevent this race we maintain reference counters for the physical pages being registered, \r
+	and register every physical page FOR WRITE ACCESS only once. */\r
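+\r
+/* For example (illustrative numbers only): if pages 0-3 of a 10-page buffer are\r
+	already present in the cache and pages 4-9 are not, the function below ends up\r
+	registering pages 0-3 as a read-only subregion and pages 4-9 as a subregion\r
+	with IB_AC_LOCAL_WRITE, so no physical page becomes writable through two MDLs. */\r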
+\r
+int iobuf_register_with_cash(\r
+ IN u64 vaddr,\r
+ IN u64 size,\r
+ IN int is_user,\r
+ IN OUT ib_access_t *acc_p,\r
+ IN OUT mt_iobuf_t *iobuf_p)\r
+{\r
+ int rc, pa_in;\r
+ mt_iobuf_t sec_iobuf;\r
+ int i, page_in , page_out, page_in_total;\r
+ int nr_pages;\r
+ char *subregion_start, *va;\r
+ u64 subregion_size;\r
+	u64 rdc;			// remaining data counter - how much is still left to lock\r
+	u64 delta;			// the size of the last, partially filled page of the segment\r
+ ib_access_t acc;\r
+\r
+ down(&g_pa_mutex);\r
+\r
+ // register memory for read access to bring pages into the memory\r
+ rc = iobuf_register( vaddr, size, is_user, 0, iobuf_p);\r
+\r
+ // on error or read access - exit\r
+ if (rc || !(*acc_p & IB_AC_LOCAL_WRITE))\r
+ goto exit;\r
+\r
+ // re-register buffer with the correct access rights\r
+ iobuf_init( (u64)vaddr, size, is_user, &sec_iobuf );\r
+ nr_pages = ADDRESS_AND_SIZE_TO_SPAN_PAGES( vaddr, size );\r
+ subregion_start = va = (char*)(ULONG_PTR)vaddr;\r
+ rdc = size;\r
+ pa_in = page_in = page_in_total = page_out = 0;\r
+\r
+ for (i=0; i<nr_pages; ++i, va+=PAGE_SIZE) {\r
+ // check whether a phys page is to be registered\r
+ PHYSICAL_ADDRESS pa = MmGetPhysicalAddress(va);\r
+ pa_in = pa_is_registered(pa.QuadPart);\r
+ if (pa_in) {\r
+ ++page_in;\r
+ ++page_in_total;\r
+ }\r
+ else\r
+ ++page_out;\r
+\r
+		// check whether we have reached the end of a subregion with uniform rights w.r.t. the cache\r
+		if (page_in && page_out) {\r
+			// prepare to register the subregion that just ended\r
+ if (pa_in) { // SUBREGION WITH WRITE ACCESS\r
+ acc = IB_AC_LOCAL_WRITE;\r
+ subregion_size = (u64)page_out * PAGE_SIZE;\r
+ page_out = 0;\r
+ }\r
+ else { // SUBREGION WITH READ ACCESS\r
+ acc = 0;\r
+ subregion_size = (u64)page_in * PAGE_SIZE;\r
+ page_in = 0;\r
+ }\r
+ \r
+ // round the subregion size to the page boundary \r
+ delta = (ULONG_PTR)(subregion_start + subregion_size) & (PAGE_SIZE - 1);\r
+ subregion_size -= delta;\r
+ if (subregion_size > rdc)\r
+ subregion_size = rdc;\r
+\r
+ // register the subregion\r
+ rc = iobuf_register( (ULONG_PTR)subregion_start, subregion_size, is_user, acc, &sec_iobuf);\r
+ if (rc)\r
+ goto cleanup;\r
+\r
+			// prepare for the next iteration\r
+ rdc -= subregion_size;\r
+ subregion_start +=subregion_size;\r
+ }\r
+ }\r
+\r
+	// prepare to register the last subregion\r
+ if (pa_in) { // SUBREGION WITH READ ACCESS\r
+ acc = 0;\r
+ subregion_size = (u64)page_in * PAGE_SIZE;\r
+ }\r
+ else { // SUBREGION WITH WRITE ACCESS\r
+ acc = IB_AC_LOCAL_WRITE;\r
+ subregion_size = (u64)page_out * PAGE_SIZE;\r
+ }\r
+ \r
+ // round the subregion size to the page boundary \r
+ delta = (ULONG_PTR)(subregion_start + subregion_size) & (PAGE_SIZE - 1);\r
+ subregion_size -= delta;\r
+ if (subregion_size > rdc)\r
+ subregion_size = rdc;\r
+ \r
+ // register the subregion\r
+ rc = iobuf_register( (ULONG_PTR)subregion_start, subregion_size, is_user, acc, &sec_iobuf);\r
+ if (rc)\r
+ goto cleanup;\r
+\r
+	// cache the physical pages\r
+ rc = pa_register(iobuf_p);\r
+ if (rc)\r
+ goto err_pa_reg;\r
+\r
+ // replace the iobuf\r
+ iobuf_deregister( iobuf_p );\r
+ sec_iobuf.is_cashed = TRUE;\r
+ __iobuf_copy( iobuf_p, &sec_iobuf );\r
+ \r
+	// the buffer overlaps an already registered buffer - adjust the access rights \r
+ if (page_in_total)\r
+ *acc_p = MTHCA_ACCESS_REMOTE_READ;\r
+\r
+ goto exit;\r
+ \r
+err_pa_reg: \r
+ iobuf_deregister( &sec_iobuf );\r
+cleanup:\r
+ iobuf_deregister( iobuf_p );\r
+exit: \r
+ up(&g_pa_mutex);\r
+ return rc;\r
+}\r
+\r
+static void deregister_segment(mt_iobuf_seg_t * iobuf_seg_p)\r
+{\r
+ MmUnlockPages( iobuf_seg_p->mdl_p ); // unlock the buffer \r
+ IoFreeMdl( iobuf_seg_p->mdl_p ); // free MDL\r
+ ExFreePool(iobuf_seg_p);\r
+}\r
+\r
+void iobuf_deregister(mt_iobuf_t *iobuf_p)\r
+{\r
+ mt_iobuf_seg_t *iobuf_seg_p; // pointer to current segment object\r
+\r
+ ASSERT(KeGetCurrentIrql() <= DISPATCH_LEVEL);\r
+\r
+ // release segments\r
+ while (!IsListEmpty( &iobuf_p->seg_que )) {\r
+ iobuf_seg_p = (mt_iobuf_seg_t *)(PVOID)RemoveTailList( &iobuf_p->seg_que );\r
+ deregister_segment(iobuf_seg_p);\r
+ iobuf_p->seg_num--;\r
+ }\r
+ ASSERT(iobuf_p->seg_num == 0);\r
+}\r
+\r
+void iobuf_deregister_with_cash(mt_iobuf_t *iobuf_p)\r
+{\r
+ ASSERT(KeGetCurrentIrql() < DISPATCH_LEVEL);\r
+\r
+ down(&g_pa_mutex);\r
+ if (iobuf_p->is_cashed)\r
+ pa_deregister(iobuf_p);\r
+ iobuf_deregister(iobuf_p);\r
+ up(&g_pa_mutex);\r
+}\r
+\r
+void iobuf_iter_init(\r
+ IN mt_iobuf_t *iobuf_p, \r
+ IN OUT mt_iobuf_iter_t *iterator_p)\r
+{\r
+ iterator_p->seg_p = iobuf_p->seg_que.Flink;\r
+ iterator_p->pfn_ix = 0;\r
+}\r
+\r
+// The function returns the physical addresses of whole pages, including the first one.\r
+// To get the physical address of the buffer itself, add the buffer's offset within\r
+// its first page to the first returned physical address.\r
+// Returns: the number of entries filled in page_tbl_p,\r
+// or 0 when the end of the list has been reached.\r
+uint32_t iobuf_get_tpt_seg(\r
+ IN mt_iobuf_t *iobuf_p, \r
+ IN OUT mt_iobuf_iter_t *iterator_p,\r
+ IN uint32_t n_pages_in, \r
+ IN OUT uint64_t *page_tbl_p )\r
+{\r
+ uint32_t i=0; // has to be initialized here for a premature exit\r
+ mt_iobuf_seg_t *seg_p; // pointer to current segment object \r
+ PPFN_NUMBER pfn_p; \r
+ uint32_t pfn_ix; // index of PFN in PFN array of the current segment\r
+ uint64_t *pa_buf_p = page_tbl_p;\r
+\r
+	// prepare for the loop\r
+	seg_p = iterator_p->seg_p;	// current segment, as saved in the iterator\r
+ pfn_ix= iterator_p->pfn_ix;\r
+\r
+	// check whether we are at the end of the list\r
+ if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que)\r
+ goto exit;\r
+ pfn_p = MmGetMdlPfnArray( seg_p->mdl_p ) + pfn_ix;\r
+\r
+ // pass along all the PFN arrays\r
+ for (; i < n_pages_in; i++, pa_buf_p++) {\r
+ // convert PFN to the physical address\r
+ *pa_buf_p = (uint64_t)*pfn_p++ << PAGE_SHIFT;\r
+ \r
+ // get to the next PFN \r
+ if (++pfn_ix >= seg_p->nr_pages) {\r
+ seg_p = (mt_iobuf_seg_t*)seg_p->link.Flink;\r
+ pfn_ix = 0;\r
+ if ((PVOID)seg_p == (PVOID)&iobuf_p->seg_que) {\r
+ i++;\r
+ break;\r
+ }\r
+ pfn_p = MmGetMdlPfnArray( seg_p->mdl_p );\r
+ }\r
+ }\r
+\r
+exit:\r
+ iterator_p->seg_p = seg_p;\r
+ iterator_p->pfn_ix = pfn_ix;\r
+ return i;\r
+}\r
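+\r
+/* A minimal sketch of walking a registered iobuf with the iterator above; the\r
+   helper name, the chunk size of 64 and the way the table is consumed are\r
+   assumptions of the sketch, not part of the driver API. */\r
+#if 0\r
+static void walk_iobuf_pages( mt_iobuf_t *iobuf_p )\r
+{\r
+	mt_iobuf_iter_t iter;\r
+	uint64_t page_tbl[64];\r
+	uint32_t n;\r
+\r
+	iobuf_iter_init( iobuf_p, &iter );\r
+	while ((n = iobuf_get_tpt_seg( iobuf_p, &iter, 64, page_tbl )) != 0) {\r
+		/* page_tbl[0..n-1] now hold page-aligned physical addresses; adding\r
+		   (iobuf_p->va & (PAGE_SIZE - 1)) to the first one gives the physical\r
+		   address of the buffer itself */\r
+	}\r
+}\r
+#endif\r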
+\r
+\r
+\r
+\r
\r
// try register the buffer\r
iobuf_p = &mr->iobuf;\r
- iobuf_init( (u64)vaddr, length, um_call, iobuf_p);\r
+ iobuf_init( (ULONG_PTR)vaddr, length, um_call, iobuf_p);\r
ib_acc = (acc & ~MTHCA_ACCESS_REMOTE_READ) ? IB_AC_LOCAL_WRITE : 0;\r
- err = iobuf_register_with_cash( (u64)vaddr, length, um_call, \r
+ err = iobuf_register_with_cash( (ULONG_PTR)vaddr, length, um_call, \r
&ib_acc, iobuf_p );\r
if (err)\r
goto err_reg_mem;\r
-/*
- * Copyright (c) 2005 Topspin Communications. All rights reserved.
- * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * $Id$
- */
-
-#include <mt_l2w.h>
-#include "mlnx_uvp.h"
-#include "mlnx_uvp_doorbell.h"
-#include "mthca_wqe.h"
-#include "mlnx_ual_data.h"
-
-#if defined(EVENT_TRACING)
-#include "mlnx_uvp_qp.tmh"
-#endif
-
-static const uint8_t mthca_opcode[] = {
- MTHCA_OPCODE_RDMA_WRITE,
- MTHCA_OPCODE_RDMA_WRITE_IMM,
- MTHCA_OPCODE_SEND,
- MTHCA_OPCODE_SEND_IMM,
- MTHCA_OPCODE_RDMA_READ,
- MTHCA_OPCODE_ATOMIC_CS,
- MTHCA_OPCODE_ATOMIC_FA
-};
-
-static enum mthca_wr_opcode conv_ibal_wr_opcode(struct _ib_send_wr *wr)
-{
- enum mthca_wr_opcode opcode = -1; //= wr->wr_type;
-
- switch (wr->wr_type) {
- case WR_SEND:
- opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? MTHCA_OPCODE_SEND_IMM : MTHCA_OPCODE_SEND;
- break;
- case WR_RDMA_WRITE:
- opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? MTHCA_OPCODE_RDMA_WRITE_IMM : MTHCA_OPCODE_RDMA_WRITE;
- break;
- case WR_RDMA_READ: opcode = MTHCA_OPCODE_RDMA_READ; break;
- case WR_COMPARE_SWAP: opcode = MTHCA_OPCODE_ATOMIC_CS; break;
- case WR_FETCH_ADD: opcode = MTHCA_OPCODE_ATOMIC_FA; break;
- default: opcode = MTHCA_OPCODE_INVALID;break;
- }
- return opcode;
-}
-
-
-static void dump_wqe(uint32_t print_lvl, uint32_t *wqe_ptr , struct mthca_qp *qp_ptr)
-{
- net32_t *wqe = wqe_ptr;
-
- (void) wqe; /* avoid warning if mthca_dbg compiled away... */
- UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents QPN 0x%06x \n",qp_ptr->ibv_qp.qp_num));
- UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",0
- , cl_ntoh32(wqe[0]), cl_ntoh32(wqe[1]), cl_ntoh32(wqe[2]), cl_ntoh32(wqe[3])));
- UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",4
- , cl_ntoh32(wqe[4]), cl_ntoh32(wqe[5]), cl_ntoh32(wqe[6]), cl_ntoh32(wqe[7])));
- UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",8
- , cl_ntoh32(wqe[8]), cl_ntoh32(wqe[9]), cl_ntoh32(wqe[10]), cl_ntoh32(wqe[11])));
- UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",12
- , cl_ntoh32(wqe[12]), cl_ntoh32(wqe[13]), cl_ntoh32(wqe[14]), cl_ntoh32(wqe[15])));
-
-}
-static void *get_recv_wqe(struct mthca_qp *qp, int n)
-{
- return qp->buf + (n << qp->rq.wqe_shift);
-}
-
-static void *get_send_wqe(struct mthca_qp *qp, int n)
-{
- void *wqe_addr = qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);
- UVP_PRINT(TRACE_LEVEL_INFORMATION,UVP_DBG_QP,
- ("wqe %p, qp_buf %p, offset %#x, index %d, shift %d \n",
- wqe_addr, qp->buf, qp->send_wqe_offset, n,
- qp->sq.wqe_shift));
-
- return wqe_addr;
-}
-
-void mthca_init_qp_indices(struct mthca_qp *qp)
-{
- qp->sq.next_ind = 0;
- qp->sq.last_comp = qp->sq.max - 1;
- qp->sq.head = 0;
- qp->sq.tail = 0;
- qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
-
- qp->rq.next_ind = 0;
- qp->rq.last_comp = qp->rq.max - 1;
- qp->rq.head = 0;
- qp->rq.tail = 0;
- qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
-}
-
-static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)
-{
- unsigned cur;
-
- cur = wq->head - wq->tail;
- if ((int)(cur + nreq) < wq->max)
- return 0;
-
- cl_spinlock_acquire(&cq->lock);
- cur = wq->head - wq->tail;
- cl_spinlock_release(&cq->lock);
-
- return (int)(cur + nreq) >= wq->max;
-}
-
-
-int mthca_tavor_post_send(struct ibv_qp *ibqp, struct _ib_send_wr *wr,
- struct _ib_send_wr **bad_wr)
-{
- struct mthca_qp *qp = to_mqp(ibqp);
- uint8_t *wqe;
- uint8_t *prev_wqe;
- int ret = 0;
- int nreq;
- int i;
- int size;
- int size0 = 0;
- uint32_t f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;
- int ind;
- int op0 = 0;
- enum ib_wr_opcode opcode;
-
- UVP_ENTER(UVP_DBG_QP);
- cl_spinlock_acquire(&qp->sq.lock);
-
- /* XXX check that state is OK to post send */
-
- ind = qp->sq.next_ind;
-
- if(ibqp->state == IBV_QPS_RESET) {
- ret = -EBUSY;
- if (bad_wr)
- *bad_wr = wr;
- goto err_busy;
- }
-
- for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
-
- if (mthca_wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x full (%u head, %u tail,"
- " %d max, %d nreq)\n", ibqp->qp_num,
- qp->sq.head, qp->sq.tail,
- qp->sq.max, nreq));
- ret = -ENOMEM;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- wqe = get_send_wqe(qp, ind);
- prev_wqe = qp->sq.last;
- qp->sq.last = wqe;
- opcode = conv_ibal_wr_opcode(wr);
- if (opcode == MTHCA_OPCODE_INVALID) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x opcode invalid\n",ibqp->qp_num));
- ret = -EINVAL;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
-
- ((struct mthca_next_seg *) wqe)->nda_op = 0;
- ((struct mthca_next_seg *) wqe)->ee_nds = 0;
- ((struct mthca_next_seg *) wqe)->flags =
- ((wr->send_opt & IB_SEND_OPT_SIGNALED) ?
- cl_hton32(MTHCA_NEXT_CQ_UPDATE) : 0) |
- ((wr->send_opt & IB_SEND_OPT_SOLICITED) ?
- cl_hton32(MTHCA_NEXT_SOLICIT) : 0) |
- cl_hton32(1);
- if (opcode == MTHCA_OPCODE_SEND_IMM||
- opcode == MTHCA_OPCODE_RDMA_WRITE_IMM)
- ((struct mthca_next_seg *) wqe)->imm = wr->immediate_data;
-
- wqe += sizeof (struct mthca_next_seg);
- size = sizeof (struct mthca_next_seg) / 16;
-
-
- switch (ibqp->qp_type) {
- case IB_QPT_RELIABLE_CONN:
- switch (opcode) {
- case MTHCA_OPCODE_ATOMIC_CS:
- case MTHCA_OPCODE_ATOMIC_FA:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
-
- wqe += sizeof (struct mthca_raddr_seg);
-
- if (opcode == MTHCA_OPCODE_ATOMIC_CS) {
- ((struct mthca_atomic_seg *) wqe)->swap_add =
- cl_hton64(wr->remote_ops.atomic2);
- ((struct mthca_atomic_seg *) wqe)->compare =
- cl_hton64(wr->remote_ops.atomic1);
- } else {
- ((struct mthca_atomic_seg *) wqe)->swap_add =
- cl_hton64(wr->remote_ops.atomic1);
- ((struct mthca_atomic_seg *) wqe)->compare = 0;
- }
-
- wqe += sizeof (struct mthca_atomic_seg);
- size += (sizeof (struct mthca_raddr_seg) +
- sizeof (struct mthca_atomic_seg)) / 16;
- break;
-
- case MTHCA_OPCODE_RDMA_WRITE:
- case MTHCA_OPCODE_RDMA_WRITE_IMM:
- case MTHCA_OPCODE_RDMA_READ:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
- wqe += sizeof (struct mthca_raddr_seg);
- size += sizeof (struct mthca_raddr_seg) / 16;
- break;
-
- default:
- /* No extra segments required for sends */
- break;
- }
-
- break;
-
- case IB_QPT_UNRELIABLE_CONN:
- switch (opcode) {
- case MTHCA_OPCODE_RDMA_WRITE:
- case MTHCA_OPCODE_RDMA_WRITE_IMM:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
- wqe += sizeof (struct mthca_raddr_seg);
- size += sizeof (struct mthca_raddr_seg) / 16;
- break;
-
- default:
- /* No extra segments required for sends */
- break;
- }
-
- break;
-
- case IB_QPT_UNRELIABLE_DGRM:
- {
- struct mthca_ah *ah = ((struct mthca_ah *)wr->dgrm.ud.h_av);
- ((struct mthca_tavor_ud_seg *) wqe)->lkey =
- cl_hton32(ah->key);
- ((struct mthca_tavor_ud_seg *) wqe)->av_addr =
- cl_hton64((uint64_t)ah->av);
- ((struct mthca_tavor_ud_seg *) wqe)->dqpn = wr->dgrm.ud.remote_qp;
- ((struct mthca_tavor_ud_seg *) wqe)->qkey = wr->dgrm.ud.remote_qkey;
-
- wqe += sizeof (struct mthca_tavor_ud_seg);
- size += sizeof (struct mthca_tavor_ud_seg) / 16;
- break;
- }
-
- default:
- break;
- }
-
- if ((int)(int)wr->num_ds > qp->sq.max_gs) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x too many gathers\n",ibqp->qp_num));
- ret = -ERANGE;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-//TODO sleybo:
- if (wr->send_opt & IB_SEND_OPT_INLINE) {
- if (wr->num_ds) {
- struct mthca_inline_seg *seg = (struct mthca_inline_seg *)wqe;
- uint32_t s = 0;
-
- wqe += sizeof *seg;
- for (i = 0; i < (int)wr->num_ds; ++i) {
- struct _ib_local_ds *sge = &wr->ds_array[i];
-
- s += sge->length;
-
- if (s > (uint32_t)qp->max_inline_data) {
- ret = -1;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- memcpy(wqe, (void *) (ULONG_PTR) sge->vaddr,
- sge->length);
- wqe += sge->length;
- }
-
- seg->byte_count = cl_hton32(MTHCA_INLINE_SEG | s);
- size += align(s + sizeof *seg, 16) / 16;
- }
- } else {
- for (i = 0; i < (int)wr->num_ds; ++i) {
- ((struct mthca_data_seg *) wqe)->byte_count =
- cl_hton32(wr->ds_array[i].length);
- ((struct mthca_data_seg *) wqe)->lkey =
- cl_hton32(wr->ds_array[i].lkey);
- ((struct mthca_data_seg *) wqe)->addr =
- cl_hton64(wr->ds_array[i].vaddr);
- wqe += sizeof (struct mthca_data_seg);
- size += sizeof (struct mthca_data_seg) / 16;
- }
- }
-
- qp->wrid[ind + qp->rq.max] = wr->wr_id;
-
- ((struct mthca_next_seg *) prev_wqe)->nda_op =
- cl_hton32(((ind << qp->sq.wqe_shift) +
- qp->send_wqe_offset) |opcode);
-
- wmb();
-
- ((struct mthca_next_seg *) prev_wqe)->ee_nds =
- cl_hton32((size0 ? 0 : MTHCA_NEXT_DBD) | size |
- ((wr->send_opt& IB_SEND_OPT_FENCE) ?
- MTHCA_NEXT_FENCE : 0));
-
- if (!size0) {
- size0 = size;
- op0 = opcode;
- }
-
- dump_wqe( TRACE_LEVEL_VERBOSE, (uint32_t*)qp->sq.last,qp);
-
- ++ind;
- if (unlikely(ind >= qp->sq.max))
- ind -= qp->sq.max;
-
- }
-
-out:
- if (likely(nreq)) {
- uint32_t doorbell[2];
-
- doorbell[0] = cl_hton32(((qp->sq.next_ind << qp->sq.wqe_shift) +
- qp->send_wqe_offset) | f0 | op0);
- doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);
-
- wmb();
-
- mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);
- }
-
- qp->sq.next_ind = ind;
- qp->sq.head += nreq;
-
-err_busy:
- cl_spinlock_release(&qp->sq.lock);
-
- UVP_EXIT(UVP_DBG_QP);
- return ret;
-}
-
-
-int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct _ib_recv_wr *wr,
- struct _ib_recv_wr **bad_wr)
-{
- struct mthca_qp *qp = to_mqp(ibqp);
- uint32_t doorbell[2];
- int ret = 0;
- int nreq;
- int i;
- int size;
- int size0 = 0;
- int ind;
- uint8_t *wqe;
- uint8_t *prev_wqe;
-
- UVP_ENTER(UVP_DBG_QP);
-
- cl_spinlock_acquire(&qp->rq.lock);
-
- /* XXX check that state is OK to post receive */
-
- ind = qp->rq.next_ind;
- if(ibqp->state == IBV_QPS_RESET) {
- ret = -EBUSY;
- if (bad_wr)
- *bad_wr = wr;
- goto err_busy;
- }
-
- for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
- if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
- nreq = 0;
-
- doorbell[0] = cl_hton32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
- doorbell[1] = cl_hton32(ibqp->qp_num << 8); //TODO sleybo: add qpn to qp struct
-
- /*
- * Make sure that descriptors are written
- * before doorbell is rung.
- */
- mb();
-
- mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_RECV_DOORBELL);
-
- qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
- size0 = 0;
- }
-
- if (mthca_wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
- UVP_PRINT(TRACE_LEVEL_ERROR,UVP_DBG_QP,("RQ %06x full (%u head, %u tail,"
- " %d max, %d nreq)\n", ibqp->qp_num,
- qp->rq.head, qp->rq.tail,
- qp->rq.max, nreq));
- ret = -ENOMEM;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- wqe = get_recv_wqe(qp, ind);
- prev_wqe = qp->rq.last;
- qp->rq.last = wqe;
-
- ((struct mthca_next_seg *) wqe)->nda_op = 0;
- ((struct mthca_next_seg *) wqe)->ee_nds =
- cl_hton32(MTHCA_NEXT_DBD);
- ((struct mthca_next_seg *) wqe)->flags =
- cl_hton32(MTHCA_NEXT_CQ_UPDATE);
-
- wqe += sizeof (struct mthca_next_seg);
- size = sizeof (struct mthca_next_seg) / 16;
-
- if (unlikely((int)wr->num_ds > qp->rq.max_gs)) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x too many gathers\n",ibqp->qp_num));
- ret = -ERANGE;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- for (i = 0; i < (int)wr->num_ds; ++i) {
- ((struct mthca_data_seg *) wqe)->byte_count =
- cl_hton32(wr->ds_array[i].length);
- ((struct mthca_data_seg *) wqe)->lkey =
- cl_hton32(wr->ds_array[i].lkey);
- ((struct mthca_data_seg *) wqe)->addr =
- cl_hton64(wr->ds_array[i].vaddr);
- wqe += sizeof (struct mthca_data_seg);
- size += sizeof (struct mthca_data_seg) / 16;
- }
-
- qp->wrid[ind] = wr->wr_id;
-
- ((struct mthca_next_seg *) prev_wqe)->nda_op =
- cl_hton32((ind << qp->rq.wqe_shift) | 1);
- wmb();
- ((struct mthca_next_seg *) prev_wqe)->ee_nds =
- cl_hton32(MTHCA_NEXT_DBD | size);
-
- if (!size0)
- size0 = size;
-
- ++ind;
- if (unlikely(ind >= qp->rq.max))
- ind -= qp->rq.max;
- }
-
-out:
- if (likely(nreq)) {
- doorbell[0] = cl_hton32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);
- doorbell[1] = cl_hton32((ibqp->qp_num << 8) | (nreq & 255));
-
- /*
- * Make sure that descriptors are written before
- * doorbell is rung.
- */
- mb();
-
- mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_RECV_DOORBELL);
- }
-
- qp->rq.next_ind = ind;
- qp->rq.head += nreq;
-
-err_busy:
- cl_spinlock_release(&qp->rq.lock);
- UVP_EXIT(UVP_DBG_QP);
- return ret;
-}
-
-int mthca_arbel_post_send(struct ibv_qp *ibqp, struct _ib_send_wr *wr,
- struct _ib_send_wr **bad_wr)
-{
- struct mthca_qp *qp = to_mqp(ibqp);
- uint32_t doorbell[2];
- uint8_t *wqe;
- uint8_t *prev_wqe;
- int ret = 0;
- int nreq;
- int i;
- int size;
- int size0 = 0;
- uint32_t f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;
- int ind;
- uint8_t op0 = 0;
- enum ib_wr_opcode opcode;
-
- UVP_ENTER(UVP_DBG_QP);
-
- cl_spinlock_acquire(&qp->sq.lock);
-
- /* XXX check that state is OK to post send */
-
- ind = qp->sq.head & (qp->sq.max - 1);
- if(ibqp->state == IBV_QPS_RESET) {
- ret = -EBUSY;
- if (bad_wr)
- *bad_wr = wr;
- goto err_busy;
- }
-
- for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
- if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
- nreq = 0;
-
- doorbell[0] = cl_hton32((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
- ((qp->sq.head & 0xffff) << 8) | f0 | op0);
- doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);
- qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
- size0 = 0;
- f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;
-
- /*
- * Make sure that descriptors are written before
- * doorbell record.
- */
- wmb();
- *qp->sq.db = cl_hton32(qp->sq.head & 0xffff);
-
- /*
- * Make sure doorbell record is written before we
- * write MMIO send doorbell.
- */
- wmb();
- mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);
-
- }
-
- if (mthca_wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
- UVP_PRINT(TRACE_LEVEL_ERROR,UVP_DBG_QP,("SQ %06x full (%u head, %u tail,"
- " %d max, %d nreq)\n", ibqp->qp_num,
- qp->sq.head, qp->sq.tail,
- qp->sq.max, nreq));
- ret = -ENOMEM;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- wqe = get_send_wqe(qp, ind);
- prev_wqe = qp->sq.last;
- qp->sq.last = wqe;
- opcode = conv_ibal_wr_opcode(wr);
-
- ((struct mthca_next_seg *) wqe)->flags =
- ((wr->send_opt & IB_SEND_OPT_SIGNALED) ?
- cl_hton32(MTHCA_NEXT_CQ_UPDATE) : 0) |
- ((wr->send_opt & IB_SEND_OPT_SOLICITED) ?
- cl_hton32(MTHCA_NEXT_SOLICIT) : 0) |
- cl_hton32(1);
- if (opcode == MTHCA_OPCODE_SEND_IMM||
- opcode == MTHCA_OPCODE_RDMA_WRITE_IMM)
- ((struct mthca_next_seg *) wqe)->imm = wr->immediate_data;
-
- wqe += sizeof (struct mthca_next_seg);
- size = sizeof (struct mthca_next_seg) / 16;
-
- switch (ibqp->qp_type) {
- case IB_QPT_RELIABLE_CONN:
- switch (opcode) {
- case MTHCA_OPCODE_ATOMIC_CS:
- case MTHCA_OPCODE_ATOMIC_FA:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
-
- wqe += sizeof (struct mthca_raddr_seg);
-
- if (opcode == MTHCA_OPCODE_ATOMIC_CS) {
- ((struct mthca_atomic_seg *) wqe)->swap_add =
- cl_hton64(wr->remote_ops.atomic2);
- ((struct mthca_atomic_seg *) wqe)->compare =
- cl_hton64(wr->remote_ops.atomic1);
- } else {
- ((struct mthca_atomic_seg *) wqe)->swap_add =
- cl_hton64(wr->remote_ops.atomic1);
- ((struct mthca_atomic_seg *) wqe)->compare = 0;
- }
-
- wqe += sizeof (struct mthca_atomic_seg);
- size += (sizeof (struct mthca_raddr_seg) +
- sizeof (struct mthca_atomic_seg)) / 16;
- break;
-
- case MTHCA_OPCODE_RDMA_READ:
- case MTHCA_OPCODE_RDMA_WRITE:
- case MTHCA_OPCODE_RDMA_WRITE_IMM:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
- wqe += sizeof (struct mthca_raddr_seg);
- size += sizeof (struct mthca_raddr_seg) / 16;
- break;
-
- default:
- /* No extra segments required for sends */
- break;
- }
-
- break;
-
- case IB_QPT_UNRELIABLE_CONN:
- switch (opcode) {
- case MTHCA_OPCODE_RDMA_WRITE:
- case MTHCA_OPCODE_RDMA_WRITE_IMM:
- ((struct mthca_raddr_seg *) wqe)->raddr =
- cl_hton64(wr->remote_ops.vaddr);
- ((struct mthca_raddr_seg *) wqe)->rkey =
- wr->remote_ops.rkey;
- ((struct mthca_raddr_seg *) wqe)->reserved = 0;
- wqe += sizeof (struct mthca_raddr_seg);
- size += sizeof (struct mthca_raddr_seg) / 16;
- break;
-
- default:
- /* No extra segments required for sends */
- break;
- }
-
- break;
-
- case IB_QPT_UNRELIABLE_DGRM:
- {
- struct mthca_ah *ah = ((struct mthca_ah *)wr->dgrm.ud.h_av);
- memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,
- ah->av, sizeof ( struct mthca_av));
- ((struct mthca_arbel_ud_seg *) wqe)->dqpn = wr->dgrm.ud.remote_qp;
- ((struct mthca_arbel_ud_seg *) wqe)->qkey = wr->dgrm.ud.remote_qkey;
-
-
- wqe += sizeof (struct mthca_arbel_ud_seg);
- size += sizeof (struct mthca_arbel_ud_seg) / 16;
- break;
- }
-
- default:
- break;
- }
-
- if ((int)wr->num_ds > qp->sq.max_gs) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x full too many gathers\n",ibqp->qp_num));
- ret = -ERANGE;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- if (wr->send_opt & IB_SEND_OPT_INLINE) {
- if (wr->num_ds) {
- struct mthca_inline_seg *seg = (struct mthca_inline_seg *)wqe;
- uint32_t s = 0;
-
- wqe += sizeof *seg;
- for (i = 0; i < (int)wr->num_ds; ++i) {
- struct _ib_local_ds *sge = &wr->ds_array[i];
-
- s += sge->length;
-
- if (s > (uint32_t)qp->max_inline_data) {
- ret = -E2BIG;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- memcpy(wqe, (void *) (uintptr_t) sge->vaddr,
- sge->length);
- wqe += sge->length;
- }
-
- seg->byte_count = cl_hton32(MTHCA_INLINE_SEG | s);
- size += align(s + sizeof *seg, 16) / 16;
- }
- } else {
-
- for (i = 0; i < (int)wr->num_ds; ++i) {
- ((struct mthca_data_seg *) wqe)->byte_count =
- cl_hton32(wr->ds_array[i].length);
- ((struct mthca_data_seg *) wqe)->lkey =
- cl_hton32(wr->ds_array[i].lkey);
- ((struct mthca_data_seg *) wqe)->addr =
- cl_hton64(wr->ds_array[i].vaddr);
- wqe += sizeof (struct mthca_data_seg);
- size += sizeof (struct mthca_data_seg) / 16;
- }
-//TODO do this also in kernel
-// size += wr->num_ds * (sizeof *seg / 16);
- }
-
- qp->wrid[ind + qp->rq.max] = wr->wr_id;
-
- if (opcode == MTHCA_OPCODE_INVALID) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x opcode invalid\n",ibqp->qp_num));
- ret = -EINVAL;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- ((struct mthca_next_seg *) prev_wqe)->nda_op =
- cl_hton32(((ind << qp->sq.wqe_shift) +
- qp->send_wqe_offset) |
- opcode);
- wmb();
- ((struct mthca_next_seg *) prev_wqe)->ee_nds =
- cl_hton32(MTHCA_NEXT_DBD | size |
- ((wr->send_opt & IB_SEND_OPT_FENCE) ?
- MTHCA_NEXT_FENCE : 0));
-
- if (!size0) {
- size0 = size;
- op0 = opcode;
- }
-
- ++ind;
- if (unlikely(ind >= qp->sq.max))
- ind -= qp->sq.max;
- }
-
-out:
- if (likely(nreq)) {
- doorbell[0] = cl_hton32((nreq << 24) |
- ((qp->sq.head & 0xffff) << 8) | f0 | op0);
- doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);
-
- qp->sq.head += nreq;
-
- /*
- * Make sure that descriptors are written before
- * doorbell record.
- */
- wmb();
- *qp->sq.db = cl_hton32(qp->sq.head & 0xffff);
-
- /*
- * Make sure doorbell record is written before we
- * write MMIO send doorbell.
- */
- wmb();
- mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);
- }
-
-err_busy:
- cl_spinlock_release(&qp->sq.lock);
-
- UVP_EXIT(UVP_DBG_QP);
-
- return ret;
-}
-
-int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct _ib_recv_wr *wr,
- struct _ib_recv_wr **bad_wr)
-{
- struct mthca_qp *qp = to_mqp(ibqp);
- int ret = 0;
- int nreq;
- int ind;
- int i;
- uint8_t *wqe;
-
- UVP_ENTER(UVP_DBG_QP);
-
- cl_spinlock_acquire(&qp->rq.lock);
-
- /* XXX check that state is OK to post receive */
-
- ind = qp->rq.head & (qp->rq.max - 1);
- if(ibqp->state == IBV_QPS_RESET) {
- ret = -EBUSY;
- if (bad_wr)
- *bad_wr = wr;
- goto err_busy;
- }
- for (nreq = 0; wr; ++nreq, wr = wr->p_next) {
- if (mthca_wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {//TODO sleybo: check the cq
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x full (%u head, %u tail,"
- " %d max, %d nreq)\n", ibqp->qp_num,
- qp->rq.head, qp->rq.tail,
- qp->rq.max, nreq));
- ret = -ENOMEM;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- wqe = get_recv_wqe(qp, ind);
-
- ((struct mthca_next_seg *) wqe)->flags = 0;
-
- wqe += sizeof (struct mthca_next_seg);
-
- if (unlikely((int)wr->num_ds > qp->rq.max_gs)) {
- UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x full too many scatter\n",ibqp->qp_num));
- ret = -ERANGE;
- if (bad_wr)
- *bad_wr = wr;
- goto out;
- }
-
- for (i = 0; i < (int)wr->num_ds; ++i) {
- ((struct mthca_data_seg *) wqe)->byte_count =
- cl_hton32(wr->ds_array[i].length);
- ((struct mthca_data_seg *) wqe)->lkey =
- cl_hton32(wr->ds_array[i].lkey);
- ((struct mthca_data_seg *) wqe)->addr =
- cl_hton64(wr->ds_array[i].vaddr);
- wqe += sizeof (struct mthca_data_seg);
- }
-
- if (i < qp->rq.max_gs) {
- ((struct mthca_data_seg *) wqe)->byte_count = 0;
- ((struct mthca_data_seg *) wqe)->lkey = cl_hton32(MTHCA_INVAL_LKEY);
- ((struct mthca_data_seg *) wqe)->addr = 0;
- }
-
- qp->wrid[ind] = wr->wr_id;
-
- ++ind;
- if (unlikely(ind >= qp->rq.max))
- ind -= qp->rq.max;
- }
-out:
- if (likely(nreq)) {
- qp->rq.head += nreq;
-
- /*
- * Make sure that descriptors are written before
- * doorbell record.
- */
- mb();
- *qp->rq.db = cl_hton32(qp->rq.head & 0xffff);
- }
-
-err_busy:
- cl_spinlock_release(&qp->rq.lock);
-
- UVP_EXIT(UVP_DBG_QP);
-
- return ret;
-}
-
-int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
- ib_qp_type_t type, struct mthca_qp *qp)
-{
- int size;
- int max_sq_sge;
-
- qp->rq.max_gs = cap->max_recv_sge;
- qp->sq.max_gs = cap->max_send_sge;
- max_sq_sge = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),
- sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);
- if (max_sq_sge < (int)cap->max_send_sge)
- max_sq_sge = cap->max_send_sge;
-
- qp->wrid = cl_malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));
- if (!qp->wrid)
- return -1;
-
- size = sizeof (struct mthca_next_seg) +
- qp->rq.max_gs * sizeof (struct mthca_data_seg);
-
- for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
- qp->rq.wqe_shift++)
- ; /* nothing */
-
- size = max_sq_sge * sizeof (struct mthca_data_seg);
- switch (type) {
- case IB_QPT_UNRELIABLE_DGRM:
- size += mthca_is_memfree(pd->context) ?
- sizeof (struct mthca_arbel_ud_seg) :
- sizeof (struct mthca_tavor_ud_seg);
- break;
-
- case IB_QPT_UNRELIABLE_CONN:
- size += sizeof (struct mthca_raddr_seg);
- break;
-
- case IB_QPT_RELIABLE_CONN:
- size += sizeof (struct mthca_raddr_seg);
- /*
- * An atomic op will require an atomic segment, a
- * remote address segment and one scatter entry.
- */
- if (size < (sizeof (struct mthca_atomic_seg) +
- sizeof (struct mthca_raddr_seg) +
- sizeof (struct mthca_data_seg)))
- size = (sizeof (struct mthca_atomic_seg) +
- sizeof (struct mthca_raddr_seg) +
- sizeof (struct mthca_data_seg));
- break;
-
- default:
- break;
- }
-
- /* Make sure that we have enough space for a bind request */
- if (size < sizeof (struct mthca_bind_seg))
- size = sizeof (struct mthca_bind_seg);
-
- size += sizeof (struct mthca_next_seg);
-
- for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
- qp->sq.wqe_shift++)
- ; /* nothing */
-
- qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,
- 1 << qp->sq.wqe_shift);
-
- qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);
-
- if (posix_memalign(&qp->buf, g_page_size,
- align(qp->buf_size, g_page_size))) {
- cl_free(qp->wrid);
- return -1;
- }
-
- memset(qp->buf, 0, qp->buf_size);
-
- if (mthca_is_memfree(pd->context)) {
- struct mthca_next_seg *next;
- struct mthca_data_seg *scatter;
- int i;
- uint32_t sz;
-
- sz = cl_hton32((sizeof (struct mthca_next_seg) +
- qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);
-
- for (i = 0; i < qp->rq.max; ++i) {
- next = get_recv_wqe(qp, i);
- next->nda_op = cl_hton32(((i + 1) & (qp->rq.max - 1)) <<
- qp->rq.wqe_shift);
- next->ee_nds = sz;
-
- for (scatter = (void *) (next + 1);
- (void *) scatter < (void *) ((char *)next + (uint32_t)(1 << qp->rq.wqe_shift));
- ++scatter)
- scatter->lkey = cl_hton32(MTHCA_INVAL_LKEY);
- }
-
- for (i = 0; i < qp->sq.max; ++i) {
- next = get_send_wqe(qp, i);
- next->nda_op = cl_hton32((((i + 1) & (qp->sq.max - 1)) <<
- qp->sq.wqe_shift) +
- qp->send_wqe_offset);
- }
- }
-
- qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
- qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
-
- return 0;
-}
-
-struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)
-{
- int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
-
- if (ctx->qp_table[tind].refcnt)
- return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
- else
- return NULL;
-}
-
-int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)
-{
- int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
- int ret = 0;
-
- WaitForSingleObject( ctx->qp_table_mutex, INFINITE );
-
- if (!ctx->qp_table[tind].refcnt) {
- ctx->qp_table[tind].table = cl_malloc(
- (ctx->qp_table_mask + 1) * sizeof (struct mthca_qp *));
- if (!ctx->qp_table[tind].table) {
- ret = -1;
- goto out;
- }
- }
- ++ctx->qp_table[tind].refcnt;
- ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
-
-out:
- ReleaseMutex( ctx->qp_table_mutex );
- return ret;
-}
-
-void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)
-{
- int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
-
- WaitForSingleObject( ctx->qp_table_mutex, INFINITE );
-
- if (!--ctx->qp_table[tind].refcnt)
- cl_free(ctx->qp_table[tind].table);
- else
- ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
-
- ReleaseMutex( ctx->qp_table_mutex );
-}
-
-int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,
- int index, int *dbd, uint32_t *new_wqe)
-{
- struct mthca_next_seg *next;
-
- /*
- * For SRQs, all WQEs generate a CQE, so we're always at the
- * end of the doorbell chain.
- */
- if (qp->ibv_qp.srq) {
- *new_wqe = 0;
- return 0;
- }
-
- if (is_send)
- next = get_send_wqe(qp, index);
- else
- next = get_recv_wqe(qp, index);
-
- *dbd = !!(next->ee_nds & cl_hton32(MTHCA_NEXT_DBD));
- if (next->ee_nds & cl_hton32(0x3f))
- *new_wqe = (next->nda_op & cl_hton32(~0x3f)) |
- (next->ee_nds & cl_hton32(0x3f));
- else
- *new_wqe = 0;
-
- return 0;
-}
-
+/*\r
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.\r
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
+ *\r
+ * This software is available to you under a choice of one of two\r
+ * licenses. You may choose to be licensed under the terms of the GNU\r
+ * General Public License (GPL) Version 2, available from the file\r
+ * COPYING in the main directory of this source tree, or the\r
+ * OpenIB.org BSD license below:\r
+ *\r
+ * Redistribution and use in source and binary forms, with or\r
+ * without modification, are permitted provided that the following\r
+ * conditions are met:\r
+ *\r
+ * - Redistributions of source code must retain the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer.\r
+ *\r
+ * - Redistributions in binary form must reproduce the above\r
+ * copyright notice, this list of conditions and the following\r
+ * disclaimer in the documentation and/or other materials\r
+ * provided with the distribution.\r
+ *\r
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,\r
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF\r
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\r
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS\r
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN\r
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\r
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
+ * SOFTWARE.\r
+ *\r
+ * $Id$\r
+ */\r
+\r
+#include <mt_l2w.h>\r
+#include "mlnx_uvp.h"\r
+#include "mlnx_uvp_doorbell.h"\r
+#include "mthca_wqe.h"\r
+#include "mlnx_ual_data.h"\r
+\r
+#if defined(EVENT_TRACING)\r
+#include "mlnx_uvp_qp.tmh"\r
+#endif\r
+\r
+static const uint8_t mthca_opcode[] = {\r
+ MTHCA_OPCODE_RDMA_WRITE,\r
+ MTHCA_OPCODE_RDMA_WRITE_IMM,\r
+ MTHCA_OPCODE_SEND,\r
+ MTHCA_OPCODE_SEND_IMM,\r
+ MTHCA_OPCODE_RDMA_READ,\r
+ MTHCA_OPCODE_ATOMIC_CS,\r
+ MTHCA_OPCODE_ATOMIC_FA\r
+};\r
+\r
+static enum mthca_wr_opcode conv_ibal_wr_opcode(struct _ib_send_wr *wr)\r
+{\r
+	enum mthca_wr_opcode opcode = MTHCA_OPCODE_INVALID;\r
+\r
+ switch (wr->wr_type) {\r
+ case WR_SEND: \r
+ opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? MTHCA_OPCODE_SEND_IMM : MTHCA_OPCODE_SEND;\r
+ break;\r
+ case WR_RDMA_WRITE: \r
+ opcode = (wr->send_opt & IB_SEND_OPT_IMMEDIATE) ? MTHCA_OPCODE_RDMA_WRITE_IMM : MTHCA_OPCODE_RDMA_WRITE;\r
+ break;\r
+ case WR_RDMA_READ: opcode = MTHCA_OPCODE_RDMA_READ; break;\r
+ case WR_COMPARE_SWAP: opcode = MTHCA_OPCODE_ATOMIC_CS; break;\r
+ case WR_FETCH_ADD: opcode = MTHCA_OPCODE_ATOMIC_FA; break;\r
+ default: opcode = MTHCA_OPCODE_INVALID;break;\r
+ }\r
+ return opcode;\r
+}\r
+\r
+\r
+static void dump_wqe(uint32_t print_lvl, uint32_t *wqe_ptr , struct mthca_qp *qp_ptr)\r
+{\r
+ net32_t *wqe = wqe_ptr;\r
+\r
+ (void) wqe; /* avoid warning if mthca_dbg compiled away... */\r
+ UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents QPN 0x%06x \n",qp_ptr->ibv_qp.qp_num));\r
+ UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",0\r
+ , cl_ntoh32(wqe[0]), cl_ntoh32(wqe[1]), cl_ntoh32(wqe[2]), cl_ntoh32(wqe[3])));\r
+ UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",4\r
+ , cl_ntoh32(wqe[4]), cl_ntoh32(wqe[5]), cl_ntoh32(wqe[6]), cl_ntoh32(wqe[7])));\r
+ UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",8\r
+ , cl_ntoh32(wqe[8]), cl_ntoh32(wqe[9]), cl_ntoh32(wqe[10]), cl_ntoh32(wqe[11])));\r
+ UVP_PRINT(print_lvl,UVP_DBG_QP,("WQE contents [%02x] %08x %08x %08x %08x \n",12\r
+ , cl_ntoh32(wqe[12]), cl_ntoh32(wqe[13]), cl_ntoh32(wqe[14]), cl_ntoh32(wqe[15])));\r
+\r
+}\r
+static void *get_recv_wqe(struct mthca_qp *qp, int n)\r
+{\r
+ return qp->buf + (n << qp->rq.wqe_shift);\r
+}\r
+\r
+static void *get_send_wqe(struct mthca_qp *qp, int n)\r
+{\r
+ void *wqe_addr = qp->buf + qp->send_wqe_offset + (n << qp->sq.wqe_shift);\r
+ UVP_PRINT(TRACE_LEVEL_INFORMATION,UVP_DBG_QP,\r
+ ("wqe %p, qp_buf %p, offset %#x, index %d, shift %d \n",\r
+ wqe_addr, qp->buf, qp->send_wqe_offset, n, \r
+ qp->sq.wqe_shift));\r
+ \r
+ return wqe_addr;\r
+}\r
+\r
+void mthca_init_qp_indices(struct mthca_qp *qp)\r
+{\r
+ qp->sq.next_ind = 0;\r
+ qp->sq.last_comp = qp->sq.max - 1;\r
+ qp->sq.head = 0;\r
+ qp->sq.tail = 0;\r
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);\r
+\r
+ qp->rq.next_ind = 0;\r
+ qp->rq.last_comp = qp->rq.max - 1;\r
+ qp->rq.head = 0;\r
+ qp->rq.tail = 0;\r
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);\r
+}\r
+\r
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq, struct mthca_cq *cq)\r
+{\r
+ unsigned cur;\r
+\r
+ cur = wq->head - wq->tail;\r
+ if ((int)(cur + nreq) < wq->max)\r
+ return 0;\r
+\r
+ cl_spinlock_acquire(&cq->lock);\r
+ cur = wq->head - wq->tail;\r
+ cl_spinlock_release(&cq->lock);\r
+\r
+ return (int)(cur + nreq) >= wq->max;\r
+}\r
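+\r
+/* A note on the arithmetic above: head and tail are free-running unsigned\r
+   counters, so the subtraction stays correct across wrap-around - e.g. with\r
+   head == 5 and tail == 0xfffffffe the difference is still the 7 outstanding\r
+   WQEs. The second read under the CQ lock re-checks the occupancy after any\r
+   completions that may have raced with the caller. */\r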
+\r
+\r
+int mthca_tavor_post_send(struct ibv_qp *ibqp, struct _ib_send_wr *wr,\r
+ struct _ib_send_wr **bad_wr)\r
+{\r
+ struct mthca_qp *qp = to_mqp(ibqp);\r
+ uint8_t *wqe;\r
+ uint8_t *prev_wqe;\r
+ int ret = 0;\r
+ int nreq;\r
+ int i;\r
+ int size;\r
+ int size0 = 0;\r
+ uint32_t f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;\r
+ int ind;\r
+ int op0 = 0;\r
+ enum ib_wr_opcode opcode;\r
+ \r
+ UVP_ENTER(UVP_DBG_QP);\r
+ cl_spinlock_acquire(&qp->sq.lock);\r
+\r
+ /* XXX check that state is OK to post send */\r
+\r
+ ind = qp->sq.next_ind;\r
+\r
+ if(ibqp->state == IBV_QPS_RESET) {\r
+ ret = -EBUSY;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto err_busy;\r
+ }\r
+\r
+ for (nreq = 0; wr; ++nreq, wr = wr->p_next) {\r
+\r
+ if (mthca_wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x full (%u head, %u tail,"\r
+ " %d max, %d nreq)\n", ibqp->qp_num,\r
+ qp->sq.head, qp->sq.tail,\r
+ qp->sq.max, nreq));\r
+ ret = -ENOMEM;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ wqe = get_send_wqe(qp, ind);\r
+ prev_wqe = qp->sq.last;\r
+ qp->sq.last = wqe;\r
+ opcode = conv_ibal_wr_opcode(wr);\r
+ if (opcode == MTHCA_OPCODE_INVALID) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x opcode invalid\n",ibqp->qp_num));\r
+ ret = -EINVAL;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+\r
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;\r
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;\r
+ ((struct mthca_next_seg *) wqe)->flags =\r
+ ((wr->send_opt & IB_SEND_OPT_SIGNALED) ?\r
+ cl_hton32(MTHCA_NEXT_CQ_UPDATE) : 0) |\r
+ ((wr->send_opt & IB_SEND_OPT_SOLICITED) ?\r
+ cl_hton32(MTHCA_NEXT_SOLICIT) : 0) |\r
+ cl_hton32(1);\r
+ if (opcode == MTHCA_OPCODE_SEND_IMM||\r
+ opcode == MTHCA_OPCODE_RDMA_WRITE_IMM)\r
+ ((struct mthca_next_seg *) wqe)->imm = wr->immediate_data;\r
+\r
+ wqe += sizeof (struct mthca_next_seg);\r
+ size = sizeof (struct mthca_next_seg) / 16;\r
+\r
+\r
+ switch (ibqp->qp_type) {\r
+ case IB_QPT_RELIABLE_CONN:\r
+ switch (opcode) {\r
+ case MTHCA_OPCODE_ATOMIC_CS:\r
+ case MTHCA_OPCODE_ATOMIC_FA:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+\r
+ if (opcode == MTHCA_OPCODE_ATOMIC_CS) {\r
+ ((struct mthca_atomic_seg *) wqe)->swap_add =\r
+ cl_hton64(wr->remote_ops.atomic2);\r
+ ((struct mthca_atomic_seg *) wqe)->compare =\r
+ cl_hton64(wr->remote_ops.atomic1);\r
+ } else {\r
+ ((struct mthca_atomic_seg *) wqe)->swap_add =\r
+ cl_hton64(wr->remote_ops.atomic1);\r
+ ((struct mthca_atomic_seg *) wqe)->compare = 0;\r
+ }\r
+\r
+ wqe += sizeof (struct mthca_atomic_seg);\r
+ size += (sizeof (struct mthca_raddr_seg) +\r
+ sizeof (struct mthca_atomic_seg)) / 16;\r
+ break;\r
+\r
+ case MTHCA_OPCODE_RDMA_WRITE:\r
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:\r
+ case MTHCA_OPCODE_RDMA_READ:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+ size += sizeof (struct mthca_raddr_seg) / 16;\r
+ break;\r
+\r
+ default:\r
+ /* No extra segments required for sends */\r
+ break;\r
+ }\r
+\r
+ break;\r
+\r
+ case IB_QPT_UNRELIABLE_CONN:\r
+ switch (opcode) {\r
+ case MTHCA_OPCODE_RDMA_WRITE:\r
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+ size += sizeof (struct mthca_raddr_seg) / 16;\r
+ break;\r
+\r
+ default:\r
+ /* No extra segments required for sends */\r
+ break;\r
+ }\r
+\r
+ break;\r
+\r
+ case IB_QPT_UNRELIABLE_DGRM:\r
+ {\r
+ struct mthca_ah *ah = ((struct mthca_ah *)wr->dgrm.ud.h_av);\r
+ ((struct mthca_tavor_ud_seg *) wqe)->lkey =\r
+ cl_hton32(ah->key);\r
+ ((struct mthca_tavor_ud_seg *) wqe)->av_addr =\r
+ cl_hton64((ULONG_PTR)ah->av);\r
+ ((struct mthca_tavor_ud_seg *) wqe)->dqpn = wr->dgrm.ud.remote_qp;\r
+ ((struct mthca_tavor_ud_seg *) wqe)->qkey = wr->dgrm.ud.remote_qkey;\r
+\r
+ wqe += sizeof (struct mthca_tavor_ud_seg);\r
+ size += sizeof (struct mthca_tavor_ud_seg) / 16;\r
+ break;\r
+ }\r
+\r
+ default:\r
+ break;\r
+ }\r
+\r
+		if ((int)wr->num_ds > qp->sq.max_gs) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x too many gathers\n",ibqp->qp_num));\r
+ ret = -ERANGE;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+//TODO sleybo:\r
+ if (wr->send_opt & IB_SEND_OPT_INLINE) {\r
+ if (wr->num_ds) {\r
+ struct mthca_inline_seg *seg = (struct mthca_inline_seg *)wqe;\r
+ uint32_t s = 0;\r
+\r
+ wqe += sizeof *seg;\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ struct _ib_local_ds *sge = &wr->ds_array[i];\r
+\r
+ s += sge->length;\r
+\r
+ if (s > (uint32_t)qp->max_inline_data) {\r
+ ret = -1;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ memcpy(wqe, (void *) (ULONG_PTR) sge->vaddr,\r
+ sge->length);\r
+ wqe += sge->length;\r
+ }\r
+\r
+ seg->byte_count = cl_hton32(MTHCA_INLINE_SEG | s);\r
+ size += align(s + sizeof *seg, 16) / 16;\r
+ }\r
+ } else {\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ ((struct mthca_data_seg *) wqe)->byte_count =\r
+ cl_hton32(wr->ds_array[i].length);\r
+ ((struct mthca_data_seg *) wqe)->lkey =\r
+ cl_hton32(wr->ds_array[i].lkey);\r
+ ((struct mthca_data_seg *) wqe)->addr =\r
+ cl_hton64(wr->ds_array[i].vaddr);\r
+ wqe += sizeof (struct mthca_data_seg);\r
+ size += sizeof (struct mthca_data_seg) / 16;\r
+ }\r
+ }\r
+\r
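+ /* Send wr_ids are stored after the rq.max receive entries in the shared wrid array. */\r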
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;\r
+\r
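+ /*\r
+ * Chain the previous WQE to this one: publish its nda_op (next WQE\r
+ * offset plus opcode) first, then, after the write barrier, its\r
+ * ee_nds with the segment size and doorbell bit.\r
+ */\r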
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =\r
+ cl_hton32(((ind << qp->sq.wqe_shift) +\r
+ qp->send_wqe_offset) | opcode);\r
+ \r
+ wmb();\r
+ \r
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =\r
+ cl_hton32((size0 ? 0 : MTHCA_NEXT_DBD) | size |\r
+ ((wr->send_opt & IB_SEND_OPT_FENCE) ?\r
+ MTHCA_NEXT_FENCE : 0));\r
+\r
+ if (!size0) {\r
+ size0 = size;\r
+ op0 = opcode;\r
+ }\r
+ \r
+ dump_wqe( TRACE_LEVEL_VERBOSE, (uint32_t*)qp->sq.last, qp);\r
+ \r
+ ++ind;\r
+ if (unlikely(ind >= qp->sq.max))\r
+ ind -= qp->sq.max;\r
+\r
+ }\r
+\r
+out:\r
+ if (likely(nreq)) {\r
+ uint32_t doorbell[2];\r
+\r
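+ /*\r
+ * doorbell[0] describes the first WQE posted in this call (offset,\r
+ * fence flag and opcode); doorbell[1] carries the QP number and the\r
+ * size of that first WQE.\r
+ */\r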
+ doorbell[0] = cl_hton32(((qp->sq.next_ind << qp->sq.wqe_shift) +\r
+ qp->send_wqe_offset) | f0 | op0);\r
+ doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);\r
+\r
+ wmb();\r
+\r
+ mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);\r
+ }\r
+\r
+ qp->sq.next_ind = ind;\r
+ qp->sq.head += nreq;\r
+\r
+err_busy:\r
+ cl_spinlock_release(&qp->sq.lock);\r
+ \r
+ UVP_EXIT(UVP_DBG_QP);\r
+ return ret;\r
+}\r
+\r
+\r
+int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct _ib_recv_wr *wr,\r
+ struct _ib_recv_wr **bad_wr)\r
+{\r
+ struct mthca_qp *qp = to_mqp(ibqp);\r
+ uint32_t doorbell[2];\r
+ int ret = 0;\r
+ int nreq;\r
+ int i;\r
+ int size;\r
+ int size0 = 0;\r
+ int ind;\r
+ uint8_t *wqe;\r
+ uint8_t *prev_wqe;\r
+ \r
+ UVP_ENTER(UVP_DBG_QP);\r
+ \r
+ cl_spinlock_acquire(&qp->rq.lock);\r
+\r
+ /* XXX check that state is OK to post receive */\r
+ \r
+ ind = qp->rq.next_ind;\r
+ if(ibqp->state == IBV_QPS_RESET) {\r
+ ret = -EBUSY;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto err_busy;\r
+ }\r
+ \r
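+ /*\r
+ * The receive doorbell carries only an 8-bit WQE count (note the\r
+ * "nreq & 255" in the final doorbell below), so ring an intermediate\r
+ * doorbell and restart the chain after every\r
+ * MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests.\r
+ */\r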
+ for (nreq = 0; wr; ++nreq, wr = wr->p_next) {\r
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {\r
+ nreq = 0;\r
+\r
+ doorbell[0] = cl_hton32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);\r
+ doorbell[1] = cl_hton32(ibqp->qp_num << 8); //TODO sleybo: add qpn to qp struct \r
+\r
+ /*\r
+ * Make sure that descriptors are written\r
+ * before doorbell is rung.\r
+ */\r
+ mb();\r
+\r
+ mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_RECV_DOORBELL);\r
+\r
+ /* Advance next_ind past the WQEs just handed to the HCA. */\r
+ qp->rq.next_ind = ind;\r
+ qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;\r
+ size0 = 0;\r
+ }\r
+\r
+ if (mthca_wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR,UVP_DBG_QP,("RQ %06x full (%u head, %u tail,"\r
+ " %d max, %d nreq)\n", ibqp->qp_num,\r
+ qp->rq.head, qp->rq.tail,\r
+ qp->rq.max, nreq));\r
+ ret = -ENOMEM;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ wqe = get_recv_wqe(qp, ind);\r
+ prev_wqe = qp->rq.last;\r
+ qp->rq.last = wqe;\r
+\r
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;\r
+ ((struct mthca_next_seg *) wqe)->ee_nds =\r
+ cl_hton32(MTHCA_NEXT_DBD);\r
+ ((struct mthca_next_seg *) wqe)->flags =\r
+ cl_hton32(MTHCA_NEXT_CQ_UPDATE);\r
+\r
+ wqe += sizeof (struct mthca_next_seg);\r
+ size = sizeof (struct mthca_next_seg) / 16;\r
+\r
+ if (unlikely((int)wr->num_ds > qp->rq.max_gs)) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x too many gathers\n",ibqp->qp_num));\r
+ ret = -ERANGE;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ ((struct mthca_data_seg *) wqe)->byte_count =\r
+ cl_hton32(wr->ds_array[i].length);\r
+ ((struct mthca_data_seg *) wqe)->lkey =\r
+ cl_hton32(wr->ds_array[i].lkey);\r
+ ((struct mthca_data_seg *) wqe)->addr =\r
+ cl_hton64(wr->ds_array[i].vaddr);\r
+ wqe += sizeof (struct mthca_data_seg);\r
+ size += sizeof (struct mthca_data_seg) / 16;\r
+ }\r
+\r
+ qp->wrid[ind] = wr->wr_id;\r
+\r
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =\r
+ cl_hton32((ind << qp->rq.wqe_shift) | 1);\r
+ wmb();\r
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =\r
+ cl_hton32(MTHCA_NEXT_DBD | size);\r
+\r
+ if (!size0)\r
+ size0 = size;\r
+\r
+ ++ind;\r
+ if (unlikely(ind >= qp->rq.max))\r
+ ind -= qp->rq.max;\r
+ }\r
+\r
+out:\r
+ if (likely(nreq)) {\r
+ doorbell[0] = cl_hton32((qp->rq.next_ind << qp->rq.wqe_shift) | size0);\r
+ doorbell[1] = cl_hton32((ibqp->qp_num << 8) | (nreq & 255));\r
+\r
+ /*\r
+ * Make sure that descriptors are written before\r
+ * doorbell is rung.\r
+ */\r
+ mb();\r
+\r
+ mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_RECV_DOORBELL);\r
+ }\r
+\r
+ qp->rq.next_ind = ind;\r
+ qp->rq.head += nreq;\r
+\r
+err_busy:\r
+ cl_spinlock_release(&qp->rq.lock);\r
+ UVP_EXIT(UVP_DBG_QP);\r
+ return ret;\r
+}\r
+\r
+int mthca_arbel_post_send(struct ibv_qp *ibqp, struct _ib_send_wr *wr,\r
+ struct _ib_send_wr **bad_wr)\r
+{\r
+ struct mthca_qp *qp = to_mqp(ibqp);\r
+ uint32_t doorbell[2];\r
+ uint8_t *wqe;\r
+ uint8_t *prev_wqe;\r
+ int ret = 0;\r
+ int nreq; \r
+ int i;\r
+ int size;\r
+ int size0 = 0;\r
+ uint32_t f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;\r
+ int ind;\r
+ uint8_t op0 = 0;\r
+ enum ib_wr_opcode opcode;\r
+ \r
+ UVP_ENTER(UVP_DBG_QP);\r
+ \r
+ cl_spinlock_acquire(&qp->sq.lock);\r
+\r
+ /* XXX check that state is OK to post send */\r
+\r
+ ind = qp->sq.head & (qp->sq.max - 1);\r
+ if(ibqp->state == IBV_QPS_RESET) {\r
+ ret = -EBUSY;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto err_busy;\r
+ }\r
+\r
+ for (nreq = 0; wr; ++nreq, wr = wr->p_next) {\r
+ if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {\r
+ nreq = 0;\r
+\r
+ doorbell[0] = cl_hton32((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |\r
+ ((qp->sq.head & 0xffff) << 8) | f0 | op0);\r
+ doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);\r
+ qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;\r
+ size0 = 0;\r
+ f0 = unlikely(wr->send_opt & IB_SEND_OPT_FENCE) ? MTHCA_SEND_DOORBELL_FENCE : 0;\r
+\r
+ /*\r
+ * Make sure that descriptors are written before\r
+ * doorbell record.\r
+ */\r
+ wmb();\r
+ *qp->sq.db = cl_hton32(qp->sq.head & 0xffff);\r
+\r
+ /*\r
+ * Make sure doorbell record is written before we\r
+ * write MMIO send doorbell.\r
+ */\r
+ wmb();\r
+ mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);\r
+\r
+ }\r
+\r
+ if (mthca_wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR,UVP_DBG_QP,("SQ %06x full (%u head, %u tail,"\r
+ " %d max, %d nreq)\n", ibqp->qp_num,\r
+ qp->sq.head, qp->sq.tail,\r
+ qp->sq.max, nreq)); \r
+ ret = -ENOMEM;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ wqe = get_send_wqe(qp, ind);\r
+ prev_wqe = qp->sq.last;\r
+ qp->sq.last = wqe;\r
+ opcode = conv_ibal_wr_opcode(wr);\r
+\r
+ ((struct mthca_next_seg *) wqe)->flags =\r
+ ((wr->send_opt & IB_SEND_OPT_SIGNALED) ?\r
+ cl_hton32(MTHCA_NEXT_CQ_UPDATE) : 0) |\r
+ ((wr->send_opt & IB_SEND_OPT_SOLICITED) ?\r
+ cl_hton32(MTHCA_NEXT_SOLICIT) : 0) |\r
+ cl_hton32(1);\r
+ if (opcode == MTHCA_OPCODE_SEND_IMM ||\r
+ opcode == MTHCA_OPCODE_RDMA_WRITE_IMM)\r
+ ((struct mthca_next_seg *) wqe)->imm = wr->immediate_data;\r
+\r
+ wqe += sizeof (struct mthca_next_seg);\r
+ size = sizeof (struct mthca_next_seg) / 16;\r
+\r
+ switch (ibqp->qp_type) {\r
+ case IB_QPT_RELIABLE_CONN:\r
+ switch (opcode) {\r
+ case MTHCA_OPCODE_ATOMIC_CS:\r
+ case MTHCA_OPCODE_ATOMIC_FA:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+\r
+ if (opcode == MTHCA_OPCODE_ATOMIC_CS) {\r
+ ((struct mthca_atomic_seg *) wqe)->swap_add =\r
+ cl_hton64(wr->remote_ops.atomic2);\r
+ ((struct mthca_atomic_seg *) wqe)->compare =\r
+ cl_hton64(wr->remote_ops.atomic1);\r
+ } else {\r
+ ((struct mthca_atomic_seg *) wqe)->swap_add =\r
+ cl_hton64(wr->remote_ops.atomic1);\r
+ ((struct mthca_atomic_seg *) wqe)->compare = 0;\r
+ }\r
+\r
+ wqe += sizeof (struct mthca_atomic_seg);\r
+ size += (sizeof (struct mthca_raddr_seg) +\r
+ sizeof (struct mthca_atomic_seg)) / 16;\r
+ break;\r
+\r
+ case MTHCA_OPCODE_RDMA_READ:\r
+ case MTHCA_OPCODE_RDMA_WRITE:\r
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+ size += sizeof (struct mthca_raddr_seg) / 16;\r
+ break;\r
+\r
+ default:\r
+ /* No extra segments required for sends */\r
+ break;\r
+ }\r
+\r
+ break;\r
+\r
+ case IB_QPT_UNRELIABLE_CONN:\r
+ switch (opcode) {\r
+ case MTHCA_OPCODE_RDMA_WRITE:\r
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:\r
+ ((struct mthca_raddr_seg *) wqe)->raddr =\r
+ cl_hton64(wr->remote_ops.vaddr);\r
+ ((struct mthca_raddr_seg *) wqe)->rkey =\r
+ wr->remote_ops.rkey;\r
+ ((struct mthca_raddr_seg *) wqe)->reserved = 0;\r
+ wqe += sizeof (struct mthca_raddr_seg);\r
+ size += sizeof (struct mthca_raddr_seg) / 16;\r
+ break;\r
+\r
+ default:\r
+ /* No extra segments required for sends */\r
+ break;\r
+ }\r
+\r
+ break;\r
+\r
+ case IB_QPT_UNRELIABLE_DGRM:\r
+ {\r
+ struct mthca_ah *ah = ((struct mthca_ah *)wr->dgrm.ud.h_av);\r
+ memcpy(((struct mthca_arbel_ud_seg *) wqe)->av,\r
+ ah->av, sizeof ( struct mthca_av));\r
+ ((struct mthca_arbel_ud_seg *) wqe)->dqpn = wr->dgrm.ud.remote_qp;\r
+ ((struct mthca_arbel_ud_seg *) wqe)->qkey = wr->dgrm.ud.remote_qkey;\r
+\r
+\r
+ wqe += sizeof (struct mthca_arbel_ud_seg);\r
+ size += sizeof (struct mthca_arbel_ud_seg) / 16;\r
+ break;\r
+ }\r
+\r
+ default:\r
+ break;\r
+ }\r
+\r
+ if ((int)wr->num_ds > qp->sq.max_gs) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x full too many gathers\n",ibqp->qp_num));\r
+ ret = -ERANGE;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ if (wr->send_opt & IB_SEND_OPT_INLINE) {\r
+ if (wr->num_ds) {\r
+ struct mthca_inline_seg *seg = (struct mthca_inline_seg *)wqe;\r
+ uint32_t s = 0;\r
+\r
+ wqe += sizeof *seg;\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ struct _ib_local_ds *sge = &wr->ds_array[i];\r
+\r
+ s += sge->length;\r
+\r
+ if (s > (uint32_t)qp->max_inline_data) {\r
+ ret = -E2BIG;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ memcpy(wqe, (void *) (uintptr_t) sge->vaddr,\r
+ sge->length);\r
+ wqe += sge->length;\r
+ }\r
+\r
+ seg->byte_count = cl_hton32(MTHCA_INLINE_SEG | s);\r
+ size += align(s + sizeof *seg, 16) / 16;\r
+ }\r
+ } else {\r
+\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ ((struct mthca_data_seg *) wqe)->byte_count =\r
+ cl_hton32(wr->ds_array[i].length);\r
+ ((struct mthca_data_seg *) wqe)->lkey =\r
+ cl_hton32(wr->ds_array[i].lkey);\r
+ ((struct mthca_data_seg *) wqe)->addr =\r
+ cl_hton64(wr->ds_array[i].vaddr);\r
+ wqe += sizeof (struct mthca_data_seg);\r
+ size += sizeof (struct mthca_data_seg) / 16;\r
+ }\r
+//TODO do this also in kernel\r
+// size += wr->num_ds * (sizeof *seg / 16);\r
+ }\r
+\r
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;\r
+\r
+ if (opcode == MTHCA_OPCODE_INVALID) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("SQ %06x opcode invalid\n",ibqp->qp_num));\r
+ ret = -EINVAL;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =\r
+ cl_hton32(((ind << qp->sq.wqe_shift) +\r
+ qp->send_wqe_offset) |\r
+ opcode);\r
+ wmb();\r
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =\r
+ cl_hton32(MTHCA_NEXT_DBD | size |\r
+ ((wr->send_opt & IB_SEND_OPT_FENCE) ?\r
+ MTHCA_NEXT_FENCE : 0));\r
+\r
+ if (!size0) {\r
+ size0 = size;\r
+ op0 = opcode;\r
+ }\r
+\r
+ ++ind;\r
+ if (unlikely(ind >= qp->sq.max))\r
+ ind -= qp->sq.max;\r
+ }\r
+\r
+out:\r
+ if (likely(nreq)) {\r
+ doorbell[0] = cl_hton32((nreq << 24) |\r
+ ((qp->sq.head & 0xffff) << 8) | f0 | op0);\r
+ doorbell[1] = cl_hton32((ibqp->qp_num << 8) | size0);\r
+\r
+ qp->sq.head += nreq;\r
+\r
+ /*\r
+ * Make sure that descriptors are written before\r
+ * doorbell record.\r
+ */\r
+ wmb();\r
+ *qp->sq.db = cl_hton32(qp->sq.head & 0xffff);\r
+\r
+ /*\r
+ * Make sure doorbell record is written before we\r
+ * write MMIO send doorbell.\r
+ */\r
+ wmb();\r
+ mthca_write64(doorbell, to_mctx(ibqp->pd->context), MTHCA_SEND_DOORBELL);\r
+ }\r
+\r
+err_busy:\r
+ cl_spinlock_release(&qp->sq.lock);\r
+\r
+ UVP_EXIT(UVP_DBG_QP);\r
+ \r
+ return ret;\r
+}\r
+\r
+int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct _ib_recv_wr *wr,\r
+ struct _ib_recv_wr **bad_wr)\r
+{\r
+ struct mthca_qp *qp = to_mqp(ibqp);\r
+ int ret = 0;\r
+ int nreq;\r
+ int ind;\r
+ int i;\r
+ uint8_t *wqe;\r
+ \r
+ UVP_ENTER(UVP_DBG_QP);\r
+ \r
+ cl_spinlock_acquire(&qp->rq.lock);\r
+\r
+ /* XXX check that state is OK to post receive */\r
+\r
+ ind = qp->rq.head & (qp->rq.max - 1);\r
+ if(ibqp->state == IBV_QPS_RESET) {\r
+ ret = -EBUSY;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto err_busy;\r
+ }\r
+ for (nreq = 0; wr; ++nreq, wr = wr->p_next) {\r
+ if (mthca_wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {//TODO sleybo: check the cq\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x full (%u head, %u tail,"\r
+ " %d max, %d nreq)\n", ibqp->qp_num,\r
+ qp->rq.head, qp->rq.tail,\r
+ qp->rq.max, nreq));\r
+ ret = -ENOMEM;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ wqe = get_recv_wqe(qp, ind);\r
+\r
+ ((struct mthca_next_seg *) wqe)->flags = 0;\r
+\r
+ wqe += sizeof (struct mthca_next_seg);\r
+\r
+ if (unlikely((int)wr->num_ds > qp->rq.max_gs)) {\r
+ UVP_PRINT(TRACE_LEVEL_ERROR ,UVP_DBG_QP ,("RQ %06x full too many scatter\n",ibqp->qp_num));\r
+ ret = -ERANGE;\r
+ if (bad_wr)\r
+ *bad_wr = wr;\r
+ goto out;\r
+ }\r
+\r
+ for (i = 0; i < (int)wr->num_ds; ++i) {\r
+ ((struct mthca_data_seg *) wqe)->byte_count =\r
+ cl_hton32(wr->ds_array[i].length);\r
+ ((struct mthca_data_seg *) wqe)->lkey =\r
+ cl_hton32(wr->ds_array[i].lkey);\r
+ ((struct mthca_data_seg *) wqe)->addr =\r
+ cl_hton64(wr->ds_array[i].vaddr);\r
+ wqe += sizeof (struct mthca_data_seg);\r
+ }\r
+\r
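+ /* Terminate a short scatter list with a zero-length entry carrying the invalid L_Key. */\r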
+ if (i < qp->rq.max_gs) {\r
+ ((struct mthca_data_seg *) wqe)->byte_count = 0;\r
+ ((struct mthca_data_seg *) wqe)->lkey = cl_hton32(MTHCA_INVAL_LKEY);\r
+ ((struct mthca_data_seg *) wqe)->addr = 0;\r
+ }\r
+\r
+ qp->wrid[ind] = wr->wr_id;\r
+\r
+ ++ind;\r
+ if (unlikely(ind >= qp->rq.max))\r
+ ind -= qp->rq.max;\r
+ }\r
+out:\r
+ if (likely(nreq)) {\r
+ qp->rq.head += nreq;\r
+\r
+ /*\r
+ * Make sure that descriptors are written before\r
+ * doorbell record.\r
+ */\r
+ mb();\r
+ *qp->rq.db = cl_hton32(qp->rq.head & 0xffff);\r
+ }\r
+\r
+err_busy:\r
+ cl_spinlock_release(&qp->rq.lock);\r
+ \r
+ UVP_EXIT(UVP_DBG_QP);\r
+ \r
+ return ret;\r
+}\r
+\r
+int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,\r
+ ib_qp_type_t type, struct mthca_qp *qp)\r
+{\r
+ int size;\r
+ int max_sq_sge;\r
+\r
+ qp->rq.max_gs = cap->max_recv_sge;\r
+ qp->sq.max_gs = cap->max_send_sge;\r
+ max_sq_sge = align(cap->max_inline_data + sizeof (struct mthca_inline_seg),\r
+ sizeof (struct mthca_data_seg)) / sizeof (struct mthca_data_seg);\r
+ if (max_sq_sge < (int)cap->max_send_sge)\r
+ max_sq_sge = cap->max_send_sge;\r
+\r
+ qp->wrid = cl_malloc((qp->rq.max + qp->sq.max) * sizeof (uint64_t));\r
+ if (!qp->wrid)\r
+ return -1;\r
+\r
+ size = sizeof (struct mthca_next_seg) +\r
+ qp->rq.max_gs * sizeof (struct mthca_data_seg);\r
+\r
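+ /*\r
+ * Round the receive WQE stride up to the next power of two (minimum\r
+ * 64 bytes), recorded as a shift; the send queue below is sized the\r
+ * same way.\r
+ */\r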
+ for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;\r
+ qp->rq.wqe_shift++)\r
+ ; /* nothing */\r
+\r
+ size = max_sq_sge * sizeof (struct mthca_data_seg);\r
+ switch (type) {\r
+ case IB_QPT_UNRELIABLE_DGRM:\r
+ size += mthca_is_memfree(pd->context) ?\r
+ sizeof (struct mthca_arbel_ud_seg) :\r
+ sizeof (struct mthca_tavor_ud_seg);\r
+ break;\r
+\r
+ case IB_QPT_UNRELIABLE_CONN:\r
+ size += sizeof (struct mthca_raddr_seg);\r
+ break;\r
+\r
+ case IB_QPT_RELIABLE_CONN:\r
+ size += sizeof (struct mthca_raddr_seg);\r
+ /*\r
+ * An atomic op will require an atomic segment, a\r
+ * remote address segment and one scatter entry.\r
+ */\r
+ if (size < (sizeof (struct mthca_atomic_seg) +\r
+ sizeof (struct mthca_raddr_seg) +\r
+ sizeof (struct mthca_data_seg)))\r
+ size = (sizeof (struct mthca_atomic_seg) +\r
+ sizeof (struct mthca_raddr_seg) +\r
+ sizeof (struct mthca_data_seg));\r
+ break;\r
+\r
+ default:\r
+ break;\r
+ }\r
+\r
+ /* Make sure that we have enough space for a bind request */\r
+ if (size < sizeof (struct mthca_bind_seg))\r
+ size = sizeof (struct mthca_bind_seg);\r
+\r
+ size += sizeof (struct mthca_next_seg);\r
+\r
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;\r
+ qp->sq.wqe_shift++)\r
+ ; /* nothing */\r
+\r
+ qp->send_wqe_offset = align(qp->rq.max << qp->rq.wqe_shift,\r
+ 1 << qp->sq.wqe_shift);\r
+\r
+ qp->buf_size = qp->send_wqe_offset + (qp->sq.max << qp->sq.wqe_shift);\r
+\r
+ if (posix_memalign(&qp->buf, g_page_size,\r
+ align(qp->buf_size, g_page_size))) {\r
+ cl_free(qp->wrid);\r
+ return -1;\r
+ }\r
+\r
+ memset(qp->buf, 0, qp->buf_size);\r
+\r
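+ /*\r
+ * For mem-free HCAs, pre-link every receive WQE to the next one and\r
+ * mark all scatter entries with the invalid L_Key, then pre-link the\r
+ * send WQEs as well.\r
+ */\r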
+ if (mthca_is_memfree(pd->context)) {\r
+ struct mthca_next_seg *next;\r
+ struct mthca_data_seg *scatter;\r
+ int i;\r
+ uint32_t sz;\r
+\r
+ sz = cl_hton32((sizeof (struct mthca_next_seg) +\r
+ qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16);\r
+\r
+ for (i = 0; i < qp->rq.max; ++i) {\r
+ next = get_recv_wqe(qp, i);\r
+ next->nda_op = cl_hton32(((i + 1) & (qp->rq.max - 1)) <<\r
+ qp->rq.wqe_shift);\r
+ next->ee_nds = sz;\r
+\r
+ for (scatter = (void *) (next + 1);\r
+ (void *) scatter < (void *) ((char *)next + (uint32_t)(1 << qp->rq.wqe_shift));\r
+ ++scatter)\r
+ scatter->lkey = cl_hton32(MTHCA_INVAL_LKEY);\r
+ }\r
+\r
+ for (i = 0; i < qp->sq.max; ++i) {\r
+ next = get_send_wqe(qp, i);\r
+ next->nda_op = cl_hton32((((i + 1) & (qp->sq.max - 1)) <<\r
+ qp->sq.wqe_shift) +\r
+ qp->send_wqe_offset);\r
+ }\r
+ }\r
+\r
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);\r
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);\r
+\r
+ return 0;\r
+}\r
+\r
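+/*\r
+ * The QP table is a two-level lookup keyed by QP number: the low bits\r
+ * select a bucket whose lazily allocated table holds the per-QP\r
+ * pointers. qp_table_mutex protects the refcounted buckets.\r
+ */\r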
+struct mthca_qp *mthca_find_qp(struct mthca_context *ctx, uint32_t qpn)\r
+{\r
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;\r
+\r
+ if (ctx->qp_table[tind].refcnt)\r
+ return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];\r
+ else\r
+ return NULL;\r
+}\r
+\r
+int mthca_store_qp(struct mthca_context *ctx, uint32_t qpn, struct mthca_qp *qp)\r
+{\r
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;\r
+ int ret = 0;\r
+\r
+ WaitForSingleObject( ctx->qp_table_mutex, INFINITE );\r
+\r
+ if (!ctx->qp_table[tind].refcnt) {\r
+ ctx->qp_table[tind].table = cl_malloc(\r
+ (ctx->qp_table_mask + 1) * sizeof (struct mthca_qp *));\r
+ if (!ctx->qp_table[tind].table) {\r
+ ret = -1;\r
+ goto out;\r
+ }\r
+ }\r
+ ++ctx->qp_table[tind].refcnt;\r
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;\r
+\r
+out:\r
+ ReleaseMutex( ctx->qp_table_mutex );\r
+ return ret;\r
+}\r
+\r
+void mthca_clear_qp(struct mthca_context *ctx, uint32_t qpn)\r
+{\r
+ int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;\r
+\r
+ WaitForSingleObject( ctx->qp_table_mutex, INFINITE );\r
+\r
+ if (!--ctx->qp_table[tind].refcnt)\r
+ cl_free(ctx->qp_table[tind].table);\r
+ else\r
+ ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;\r
+ \r
+ ReleaseMutex( ctx->qp_table_mutex );\r
+}\r
+\r
+int mthca_free_err_wqe(struct mthca_qp *qp, int is_send,\r
+ int index, int *dbd, uint32_t *new_wqe)\r
+{\r
+ struct mthca_next_seg *next;\r
+\r
+ /*\r
+ * For SRQs, all WQEs generate a CQE, so we're always at the\r
+ * end of the doorbell chain.\r
+ */\r
+ if (qp->ibv_qp.srq) {\r
+ *new_wqe = 0;\r
+ return 0;\r
+ }\r
+\r
+ if (is_send)\r
+ next = get_send_wqe(qp, index);\r
+ else\r
+ next = get_recv_wqe(qp, index);\r
+\r
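+ /*\r
+ * Report whether this WQE owned a doorbell (DBD bit) and, if it links\r
+ * to a non-empty next WQE, hand back that link so the caller can\r
+ * continue along the descriptor chain.\r
+ */\r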
+ *dbd = !!(next->ee_nds & cl_hton32(MTHCA_NEXT_DBD));\r
+ if (next->ee_nds & cl_hton32(0x3f))\r
+ *new_wqe = (next->nda_op & cl_hton32(~0x3f)) |\r
+ (next->ee_nds & cl_hton32(0x3f));\r
+ else\r
+ *new_wqe = 0;\r
+\r
+ return 0;\r
+}\r
+\r
/*\r
* Copyright (c) 2005 SilverStorm Technologies. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
*/\r
#pragma warning( disable:4232 )\r
\r
+/*\r
+ * Enable warnings about pointer sign extension.\r
+ */\r
+#pragma warning( default:4826 )\r
+\r
/* For DECLSPEC_EXPORT and DECLSPEC_IMPORT */\r
#include <ntdef.h>\r
\r
/*\r
* Copyright (c) 2005 SilverStorm Technologies. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
#include <windows.h>\r
#endif // !defined( _WINDOWS_ )\r
\r
+/*\r
+ * Enable warnings about pointer sign extension.\r
+ */\r
+#pragma warning( default:4826 )\r
+\r
#if defined( _DEBUG ) || DBG\r
#define _DEBUG_\r
#else\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
vnic_adapter_t *p_primary_adapter; \r
#endif\r
\r
- vnic_adapter_t * __ptr64 p_adapter = (vnic_adapter_t * __ptr64)p_pnp_rec->pnp_context;\r
+ vnic_adapter_t * p_adapter = (vnic_adapter_t *)p_pnp_rec->pnp_context;\r
\r
VNIC_ENTER( VNIC_DBG_PNP );\r
\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
pIo->wrq.p_next = NULL;\r
pIo->wrq.wr_type = WR_SEND;\r
pIo->wrq.send_opt = IB_SEND_OPT_SIGNALED;\r
- pIo->wrq.wr_id = (uint64_t)(pIo);\r
+ pIo->wrq.wr_id = (ULONG_PTR)pIo;\r
pIo->wrq.num_ds = 1;\r
pIo->wrq.ds_array = &pControl->sendIo.dsList;\r
pIo->wrq.ds_array[0].length = sizeof(Inic_ControlPacket_t);\r
pIo->wrq.ds_array[0].lkey = pControl->region.lkey;\r
- pIo->wrq.ds_array[0].vaddr = (uint64_t)(pkt++);\r
+ pIo->wrq.ds_array[0].vaddr = (ULONG_PTR)pkt++;\r
\r
for (i = 0; i < pConfig->numRecvs; i++ )\r
{\r
pIo->pViport = pViport;\r
pIo->pRoutine = control_recvComplete;\r
\r
- pIo->r_wrq.wr_id = (uint64_t)(pIo);\r
+ pIo->r_wrq.wr_id = (ULONG_PTR)pIo;\r
pIo->r_wrq.p_next = NULL;\r
pIo->r_wrq.num_ds = 1;\r
pIo->r_wrq.ds_array = &pControl->pRecvIos[i].dsList;\r
pIo->r_wrq.ds_array[0].length = sizeof(Inic_ControlPacket_t);\r
- pIo->r_wrq.ds_array[0].vaddr = (uint64_t)(pkt++);\r
+ pIo->r_wrq.ds_array[0].vaddr = (ULONG_PTR)pkt++;\r
pIo->r_wrq.ds_array[0].lkey = pControl->region.lkey;\r
\r
if ( ibqp_postRecv( &pControl->qp, pIo ) != IB_SUCCESS )\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
_data_kickTimer_stop(\r
IN Data_t *pData );\r
\r
-#define LOCAL_IO(x) PTR64((x))\r
-\r
#define INBOUND_COPY\r
\r
#ifdef VNIC_STATISTIC\r
pRdmaIo->io.pRoutine = NULL;\r
pRdmaIo->io.wrq.p_next = NULL;\r
pRdmaIo->io.wrq.wr_type = WR_RDMA_WRITE;\r
- pRdmaIo->io.wrq.wr_id = PTR64( pRdmaIo );\r
+ pRdmaIo->io.wrq.wr_id = (ULONG_PTR)pRdmaIo;\r
pRdmaIo->io.wrq.num_ds = 1;\r
pRdmaIo->io.wrq.ds_array = pRdmaIo->dsList;\r
pRdmaIo->dsList[0].lkey = pData->region.lkey;\r
pSendIo->io.pRoutine = NULL;\r
pSendIo->io.wrq.p_next = NULL;\r
pSendIo->io.wrq.wr_type = WR_SEND;\r
- pSendIo->io.wrq.wr_id = PTR64( pSendIo );\r
+ pSendIo->io.wrq.wr_id = (ULONG_PTR)pSendIo;\r
pSendIo->io.wrq.num_ds = 1;\r
pSendIo->io.wrq.ds_array = &pSendIo->dsList;\r
\r
pSendIo->io.wrq.send_opt = IB_SEND_OPT_SIGNALED;\r
\r
pSendIo->dsList.length = 0;\r
- pSendIo->dsList.vaddr = PTR64( pRegionData );\r
+ pSendIo->dsList.vaddr = (ULONG_PTR)pRegionData;\r
pSendIo->dsList.lkey = pData->region.lkey;\r
\r
for ( i = 0; i < pData->p_conf->numRecvs; i++ )\r
{\r
pRecvIo[i].io.pViport = pData->p_viport;\r
pRecvIo[i].io.pRoutine = _data_receivedKick;\r
- pRecvIo[i].io.r_wrq.wr_id = PTR64( &pRecvIo[i].io );\r
+ pRecvIo[i].io.r_wrq.wr_id = (ULONG_PTR)&pRecvIo[i].io;\r
pRecvIo[i].io.r_wrq.p_next = NULL;\r
pRecvIo[i].io.r_wrq.num_ds = 1;\r
pRecvIo[i].io.r_wrq.ds_array = &pRecvIo[i].dsList;\r
pRecvIo[i].dsList.length = 4;\r
- pRecvIo[i].dsList.vaddr = PTR64( pRegionData );\r
+ pRecvIo[i].dsList.vaddr = (ULONG_PTR)pRegionData;\r
pRecvIo[i].dsList.lkey = pData->region.lkey;\r
\r
InitializeListHead( &pRecvIo[i].io.listPtrs );\r
pRdmaIo->io.pRoutine = _data_xmitComplete;\r
pRdmaIo->io.wrq.p_next = NULL;\r
pRdmaIo->io.wrq.wr_type = WR_RDMA_WRITE;\r
- pRdmaIo->io.wrq.wr_id = PTR64(pRdmaIo);\r
+ pRdmaIo->io.wrq.wr_id = (ULONG_PTR)pRdmaIo;\r
pRdmaIo->io.wrq.num_ds = MAX_NUM_SGE; // will set actual number when transmit\r
pRdmaIo->io.wrq.ds_array = pRdmaIo->dsList;\r
pRdmaIo->p_trailer = (ViportTrailer_t *)&pRdmaIo->data[0];\r
}\r
\r
pXmitPool->rdmaRKey = pData->region.rkey;\r
- pXmitPool->rdmaAddr = PTR64( pXmitPool->bufPool );\r
+ pXmitPool->rdmaAddr = (ULONG_PTR)pXmitPool->bufPool;\r
\r
data_postRecvs( pData );\r
\r
pBpe = &p_recvPool->bufPool[index];\r
\r
pBpe->rKey = pRdmaDest->region.rkey;\r
- pBpe->remoteAddr = hton64( PTR64( pRdmaDest->data ) );\r
+ pBpe->remoteAddr = hton64( (ULONG_PTR)pRdmaDest->data );\r
pBpe->valid = (uint32_t)(pRdmaDest - &p_recvPool->pRecvBufs[0]) + 1;\r
++p_recvPool->numFreeBufs;\r
\r
rdmaAddr = p_recvPool->eiocRdmaAddr + offset;\r
\r
pWrq->ds_array->length = sz;\r
- pWrq->ds_array->vaddr = PTR64((uint8_t *)p_recvPool->bufPool + offset);\r
+ pWrq->ds_array->vaddr = (ULONG_PTR)((uint8_t *)p_recvPool->bufPool + offset);\r
pWrq->remote_ops.vaddr = rdmaAddr;\r
\r
if ( ibqp_postSend( &pData->qp, &pData->freeBufsIo.io ) != IB_SUCCESS )\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
IN viport_t *p_viport,\r
IN OUT IbRegion_t *pRegion,\r
IN ib_pd_handle_t hPd,\r
- IN void* __ptr64 vaddr,\r
+ IN void* vaddr,\r
IN uint64_t len,\r
IN ib_access_t access_ctrl )\r
{\r
else\r
{\r
pRegion->len = len;\r
- pRegion->virtAddress = (uint64_t)( vaddr );\r
+ pRegion->virtAddress = (ULONG_PTR)vaddr;\r
}\r
VNIC_EXIT ( VNIC_DBG_IB );\r
return ib_status;\r
_ibqp_detach_cb(\r
IN ib_cm_drep_rec_t *p_drep_rec )\r
{\r
- IbQp_t *pQp = (IbQp_t * __ptr64 )p_drep_rec->qp_context;\r
+ IbQp_t *pQp = (IbQp_t *)p_drep_rec->qp_context;\r
VNIC_ENTER( VNIC_DBG_IB );\r
CL_ASSERT( p_drep_rec );\r
\r
_ibqp_rej_cb(\r
IN ib_cm_rej_rec_t *p_rej_rec )\r
{\r
- IbQp_t *pQp = (IbQp_t * __ptr64 )p_rej_rec->qp_context;\r
+ IbQp_t *pQp = (IbQp_t *)p_rej_rec->qp_context;\r
CL_ASSERT(p_rej_rec );\r
\r
InterlockedExchange( &pQp->qpState, IB_DETACHED );\r
{\r
ib_api_status_t ib_status = IB_SUCCESS;\r
ib_cm_drep_t cm_drep;\r
- IbQp_t *pQp = (IbQp_t * __ptr64 )p_dreq_rec->qp_context;\r
+ IbQp_t *pQp = (IbQp_t *)p_dreq_rec->qp_context;\r
\r
VNIC_ENTER( VNIC_DBG_IB );\r
CL_ASSERT( p_dreq_rec );\r
\r
VNIC_ENTER( VNIC_DBG_IB );\r
\r
- pQp = (IbQp_t * __ptr64 )p_cm_rep->qp_context;\r
+ pQp = (IbQp_t *)p_cm_rep->qp_context;\r
p_viport = pQp->pViport;\r
\r
ASSERT( pQp->qpState == IB_ATTACHING );\r
switch ( pEventRecord->code )\r
{\r
case IB_AE_PORT_DOWN:\r
- p_adapter = ( vnic_adapter_t * __ptr64)pEventRecord->context;\r
+ p_adapter = (vnic_adapter_t *)pEventRecord->context;\r
\r
if( p_adapter &&\r
p_adapter->p_currentPath->pViport &&\r
case IB_AE_WQ_REQ_ERROR:\r
case IB_AE_WQ_ACCESS_ERROR:\r
\r
- p_viport = ((IbQp_t * __ptr64 )pEventRecord->context)->pViport;\r
+ p_viport = ((IbQp_t *)pEventRecord->context)->pViport;\r
\r
if( p_viport && !p_viport->errored )\r
{\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
IN struct _viport *p_viport,\r
OUT IbRegion_t *pRegion,\r
IN ib_pd_handle_t hPd,\r
- IN void* __ptr64 vaddr,\r
+ IN void* vaddr,\r
IN uint64_t len,\r
IN ib_access_t access_ctrl );\r
\r
/*\r
* Copyright (c) 2007 QLogic Corporation. All rights reserved.\r
+ * Portions Copyright (c) 2008 Microsoft Corporation. All rights reserved.\r
*\r
* This software is available to you under the OpenIB.org BSD license\r
* below:\r
#define MAXU32 MAXULONG\r
#define MAXU64 ((uint64_t)(~0))\r
\r
-#define PTR64(what) ((uint64_t)(void * __ptr64)(what))\r
-\r
#ifndef min\r
#define min(a,b) ((a)<(b)?(a):(b))\r
#endif\r