2 * Copyright (c) 2005 SilverStorm Technologies. All rights reserved.
\r
4 * This software is available to you under the OpenIB.org BSD license
\r
7 * Redistribution and use in source and binary forms, with or
\r
8 * without modification, are permitted provided that the following
\r
9 * conditions are met:
\r
11 * - Redistributions of source code must retain the above
\r
12 * copyright notice, this list of conditions and the following
\r
15 * - Redistributions in binary form must reproduce the above
\r
16 * copyright notice, this list of conditions and the following
\r
17 * disclaimer in the documentation and/or other materials
\r
18 * provided with the distribution.
\r
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
\r
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
\r
22 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
\r
23 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
\r
24 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
\r
25 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
\r
26 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
\r
32 #include "ibspdll.h"
\r
35 /* Function: IBSPRegisterMemory
\r
37 * Registers buffer memory
\r
43 IN DWORD dwBufferLength,
\r
47 struct ibsp_socket_info *socket_info = (struct ibsp_socket_info *)s;
\r
48 ib_access_t access_ctrl;
\r
49 struct memory_node *node;
\r
51 IBSP_ENTER( IBSP_DBG_MEM );
\r
53 fzprint(("%s():%d:0x%x:0x%x: socket=0x%p \n", __FUNCTION__,
\r
54 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), s));
\r
56 if( lpBuffer == NULL )
\r
58 IBSP_ERROR_EXIT( ("invalid buffer %p\n", lpBuffer) );
\r
59 *lpErrno = WSAEFAULT;
\r
63 if( dwBufferLength > socket_info->socket_options.max_msg_size )
\r
65 IBSP_ERROR_EXIT( ("invalid buffer length %d\n", dwBufferLength) );
\r
66 *lpErrno = WSAEFAULT;
\r
77 access_ctrl = IB_AC_LOCAL_WRITE;
\r
81 access_ctrl = IB_AC_LOCAL_WRITE;
\r
85 IBSP_ERROR_EXIT( ("invalid flags %x\n", dwFlags) );
\r
86 *lpErrno = WSAEINVAL;
\r
90 node = ibsp_reg_mem( socket_info, socket_info->hca_pd,
\r
91 lpBuffer, dwBufferLength, access_ctrl, lpErrno );
\r
93 fzprint(("%s():%d:0x%x:0x%x: registering MEM from %p to %p, len %d, handle %p\n",
\r
94 __FUNCTION__, __LINE__, GetCurrentProcessId(), GetCurrentThreadId(),
\r
95 lpBuffer, (unsigned char *)lpBuffer + dwBufferLength, dwBufferLength, node));
\r
101 ("ibsp_reg_mem failed (pd=%p)\n", socket_info->hca_pd) );
\r
102 *lpErrno = WSAENOBUFS;
\r
106 IBSP_TRACE_EXIT( IBSP_DBG_MEM, ("returning node %p\n", node) );
\r
110 IBSP_EXIT( IBSP_DBG_MEM );
\r
112 return (HANDLE) node;
\r
115 /* Function: IBSPDeregisterMemory
\r
117 * This is our provider's DeregisterMemory function.
\r
120 IBSPDeregisterMemory(
\r
123 OUT LPINT lpErrno )
\r
125 struct memory_node *node = handle;
\r
126 struct ibsp_socket_info *socket_info = (struct ibsp_socket_info *)s;
\r
129 IBSP_ENTER( IBSP_DBG_MEM );
\r
131 fzprint(("%s():%d:0x%x:0x%x: handle=0x%p socket=0x%p \n", __FUNCTION__,
\r
132 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), handle, s));
\r
134 if( s == INVALID_SOCKET )
\r
136 IBSP_ERROR( ("invalid socket handle %x\n", s) );
\r
137 *lpErrno = WSAENOTSOCK;
\r
138 return SOCKET_ERROR;
\r
141 ret = ibsp_dereg_mem( socket_info, node, lpErrno );
\r
143 fzprint(("%s():%d:0x%x:0x%x: unregistering MEM %p, mr_num=%d, ret=%d\n",
\r
145 __LINE__, GetCurrentProcessId(),
\r
146 GetCurrentThreadId(), node, g_ibsp.mr_num, ret));
\r
148 IBSP_EXIT( IBSP_DBG_MEM );
\r
152 /* Function: IBSPRegisterRdmaMemory
\r
154 * This is our provider's RegisterRdmaMemory function.
\r
157 IBSPRegisterRdmaMemory(
\r
160 IN DWORD dwBufferLength,
\r
162 OUT LPVOID lpRdmaBufferDescriptor,
\r
163 IN OUT LPDWORD lpdwDescriptorLength,
\r
164 OUT LPINT lpErrno )
\r
166 struct memory_node *node;
\r
167 struct rdma_memory_desc *desc;
\r
168 struct ibsp_socket_info *socket_info = (struct ibsp_socket_info *)s;
\r
169 ib_access_t access_ctrl;
\r
170 struct ibsp_hca *hca;
\r
172 IBSP_ENTER( IBSP_DBG_MEM );
\r
174 fzprint(("%s():%d:0x%x:0x%x: socket=0x%p \n", __FUNCTION__,
\r
175 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), s));
\r
177 if( *lpdwDescriptorLength < sizeof(struct rdma_memory_desc) )
\r
179 /* This is the probe from the switch to learn the length of the descriptor. */
\r
181 ("invalid descriptor length %d (usually not an error)\n",
\r
182 *lpdwDescriptorLength) );
\r
183 *lpdwDescriptorLength = sizeof(struct rdma_memory_desc);
\r
184 *lpErrno = WSAEFAULT;
\r
185 return SOCKET_ERROR;
\r
188 if( lpBuffer == NULL )
\r
190 IBSP_ERROR_EXIT( ("invalid buffer %p\n", lpBuffer) );
\r
191 *lpErrno = WSAEFAULT;
\r
192 return SOCKET_ERROR;
\r
195 if( dwBufferLength > socket_info->socket_options.max_msg_size )
\r
197 IBSP_ERROR_EXIT( ("invalid buffer length %d\n", dwBufferLength) );
\r
198 *lpErrno = WSAEFAULT;
\r
199 return SOCKET_ERROR;
\r
202 access_ctrl = IB_AC_LOCAL_WRITE;
\r
207 access_ctrl |= IB_AC_RDMA_READ;
\r
211 access_ctrl |= IB_AC_RDMA_WRITE;
\r
214 case MEM_READWRITE:
\r
215 access_ctrl |= (IB_AC_RDMA_READ | IB_AC_RDMA_WRITE);
\r
219 IBSP_ERROR_EXIT( ("invalid flags %x\n", dwFlags) );
\r
220 *lpErrno = WSAEINVAL;
\r
221 return SOCKET_ERROR;
\r
224 hca = socket_info->port->hca;
\r
226 /** TODO: Fix locking so we don't dereference node outside of mutex. */
\r
227 node = ibsp_reg_mem( socket_info, hca->pd,
\r
228 lpBuffer, dwBufferLength, access_ctrl, lpErrno );
\r
232 IBSP_ERROR_EXIT( ("ibsp_reg_mem failed %d\n", *lpErrno) );
\r
233 *lpErrno = WSAENOBUFS;
\r
234 return SOCKET_ERROR;
\r
237 desc = lpRdmaBufferDescriptor;
\r
239 desc->iova = (uint64_t) (uintptr_t) lpBuffer;
\r
240 desc->lkey = node->p_reg->lkey;
\r
241 desc->rkey = node->p_reg->rkey;
\r
246 IBSP_TRACE1( IBSP_DBG_MEM,
\r
247 ("Socket %p registered RDMA MEM at %p, len %d, for access %d, "
\r
248 "returning handle %p, rkey %08x\n",
\r
249 s, lpBuffer, dwBufferLength, dwFlags, node, desc->rkey));
\r
251 IBSP_EXIT( IBSP_DBG_MEM );
\r
256 /* Function: IBSPDeregisterRdmaMemory
\r
258 * This is our provider's DeregisterRdmaMemory function.
\r
261 IBSPDeregisterRdmaMemory(
\r
263 IN LPVOID lpRdmaBufferDescriptor,
\r
264 IN DWORD dwDescriptorLength,
\r
265 OUT LPINT lpErrno )
\r
267 struct rdma_memory_desc *desc;
\r
268 struct ibsp_socket_info *socket_info = (struct ibsp_socket_info *)s;
\r
271 IBSP_ENTER( IBSP_DBG_MEM );
\r
273 fzprint(("%s():%d:0x%x:0x%x: socket=0x%p \n", __FUNCTION__,
\r
274 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), s));
\r
276 if( s == INVALID_SOCKET )
\r
278 /* Seen in real life with overlap/client test.
\r
279 * The switch closes a socket then calls this. Why? */
\r
280 IBSP_ERROR_EXIT( ("invalid socket handle %x\n", s) );
\r
281 *lpErrno = WSAENOTSOCK;
\r
282 return SOCKET_ERROR;
\r
285 CL_ASSERT( lpRdmaBufferDescriptor );
\r
287 if( dwDescriptorLength < sizeof(struct rdma_memory_desc) )
\r
290 ("invalid descriptor length %d)\n", dwDescriptorLength) );
\r
291 *lpErrno = WSAEINVAL;
\r
292 return SOCKET_ERROR;
\r
295 desc = lpRdmaBufferDescriptor;
\r
297 ret = ibsp_dereg_mem( socket_info, desc->node, lpErrno );
\r
299 fzprint(("%s():%d:0x%x:0x%x: Unregistering RDMA MEM %p\n",
\r
300 __FUNCTION__, __LINE__, GetCurrentProcessId(),
\r
301 GetCurrentThreadId(), desc->node));
\r
303 IBSP_EXIT( IBSP_DBG_MEM );
\r
309 * Do a RDMA read or write operation since the code for both is very close.
\r
314 IN LPWSABUFEX lpBuffers,
\r
315 IN DWORD dwBufferCount,
\r
316 IN LPVOID lpTargetBufferDescriptor,
\r
317 IN DWORD dwTargetDescriptorLength,
\r
318 IN DWORD dwTargetBufferOffset,
\r
319 IN LPWSAOVERLAPPED lpOverlapped,
\r
320 IN ib_wr_type_t wr_type,
\r
321 OUT LPINT lpErrno )
\r
323 struct ibsp_socket_info *socket_info = (struct ibsp_socket_info *)s;
\r
324 ib_api_status_t status;
\r
325 struct rdma_memory_desc *desc; /* remote descriptor */
\r
327 ib_send_wr_t send_wr;
\r
328 ib_local_ds_t local_ds[QP_ATTRIB_SQ_SGE];
\r
331 IBSP_ENTER( IBSP_DBG_IO );
\r
333 CL_ASSERT( wr_type == WR_RDMA_WRITE || wr_type == WR_RDMA_READ );
\r
335 cl_spinlock_acquire( &socket_info->mutex );
\r
336 switch( socket_info->socket_state )
\r
338 case IBSP_CONNECTED:
\r
339 case IBSP_DISCONNECTED:
\r
343 cl_spinlock_release( &socket_info->mutex );
\r
345 ("Socket is not in connected socket_state state=%s\n",
\r
346 IBSP_SOCKET_STATE_STR( socket_info->socket_state )) );
\r
347 *lpErrno = WSAENOTCONN;
\r
348 return SOCKET_ERROR;
\r
350 cl_spinlock_release( &socket_info->mutex );
\r
352 if( socket_info->qp_error )
\r
355 ("QP is in error state %d\n", socket_info->qp_error) );
\r
356 *lpErrno = socket_info->qp_error;
\r
357 return SOCKET_ERROR;
\r
360 /* This function only handles requests with at most QP_ATTRIB_SQ_SGE buffers. */
\r
361 if( dwBufferCount > QP_ATTRIB_SQ_SGE )
\r
363 CL_ASSERT( dwBufferCount <= QP_ATTRIB_SQ_SGE );
\r
364 /* TODO - support splitting large requests into multiple RDMA operations. */
\r
366 ("dwBufferCount is greater than %d\n", QP_ATTRIB_SQ_SGE) );
\r
367 *lpErrno = WSAEINVAL;
\r
368 return SOCKET_ERROR;
\r
371 if( dwTargetDescriptorLength != sizeof(struct rdma_memory_desc) )
\r
374 ("invalid descriptor length %d)\n", dwTargetDescriptorLength) );
\r
375 *lpErrno = WSAEINVAL;
\r
376 return SOCKET_ERROR;
\r
379 desc = lpTargetBufferDescriptor;
\r
381 /* The send lock is only used to serialize posting. */
\r
382 cl_spinlock_acquire( &socket_info->send_lock );
\r
383 if( socket_info->send_cnt == QP_ATTRIB_SQ_DEPTH )
\r
385 /* TODO: queue requests. */
\r
386 cl_spinlock_release( &socket_info->send_lock );
\r
387 IBSP_ERROR_EXIT( ("not enough wr on the free list\n") );
\r
388 *lpErrno = WSAENETDOWN;
\r
389 return SOCKET_ERROR;
\r
392 wr = &socket_info->send_wr[socket_info->send_idx];
\r
394 wr->lpOverlapped = lpOverlapped;
\r
395 wr->socket_info = socket_info;
\r
397 /* Format the send work request and post. */
\r
398 send_wr.p_next = NULL;
\r
399 send_wr.wr_id = (uint64_t)(void* __ptr64)wr;
\r
400 send_wr.wr_type = wr_type;
\r
401 send_wr.send_opt = 0;
\r
402 send_wr.num_ds = dwBufferCount;
\r
403 send_wr.ds_array = local_ds;
\r
405 send_wr.remote_ops.vaddr = desc->iova + dwTargetBufferOffset;
\r
406 send_wr.remote_ops.rkey = desc->rkey;
\r
408 lpOverlapped->InternalHigh = 0;
\r
409 for( ds_idx = 0; ds_idx < dwBufferCount; ds_idx++ )
\r
411 local_ds[ds_idx].vaddr = (uint64_t)(void* __ptr64)lpBuffers[ds_idx].buf;
\r
412 local_ds[ds_idx].length = lpBuffers[ds_idx].len;
\r
413 local_ds[ds_idx].lkey =
\r
414 ((struct memory_node*)lpBuffers[ds_idx].handle)->p_reg->lkey;
\r
416 lpOverlapped->InternalHigh += lpBuffers[ds_idx].len;
\r
419 if( wr_type == WR_RDMA_READ )
\r
422 * Next send must be fenced since it could indicate that this
\r
423 * RDMA READ is complete.
\r
425 socket_info->send_opt = IB_SEND_OPT_FENCE;
\r
427 else if( lpOverlapped->InternalHigh <= socket_info->max_inline )
\r
429 send_wr.send_opt |= IB_SEND_OPT_INLINE;
\r
433 * We must set this now, because the operation could complete
\r
434 * before ib_post_send returns.
\r
436 lpOverlapped->Internal = WSS_OPERATION_IN_PROGRESS;
\r
438 cl_atomic_inc( &socket_info->send_cnt );
\r
441 if( lpOverlapped->hEvent == 0 )
\r
443 cl_atomic_inc( &g_ibsp.overlap_h0_count );
\r
447 cl_atomic_inc( &g_ibsp.overlap_h1_count );
\r
448 cl_atomic_inc( &g_ibsp.overlap_h1_comp_count );
\r
451 fzprint(("%s():%d:0x%x:0x%x: ov=0x%p h0_cnt=%d h1_cnt=%d\n",
\r
452 __FUNCTION__, __LINE__, GetCurrentProcessId(),
\r
453 GetCurrentThreadId(), lpOverlapped,
\r
454 g_ibsp.overlap_h0_count, g_ibsp.overlap_h1_count));
\r
458 status = ib_post_send( socket_info->qp, &send_wr, NULL );
\r
460 if( status == IB_SUCCESS )
\r
462 /* Update the index and wrap as needed */
\r
463 #if QP_ATTRIB_SQ_DEPTH == 256 || QP_ATTRIB_SQ_DEPTH == 128 || \
\r
464 QP_ATTRIB_SQ_DEPTH == 64 || QP_ATTRIB_SQ_DEPTH == 32 || \
\r
465 QP_ATTRIB_SQ_DEPTH == 16 || QP_ATTRIB_SQ_DEPTH == 8
\r
466 socket_info->send_idx++;
\r
467 socket_info->send_idx &= (QP_ATTRIB_SQ_DEPTH - 1);
\r
469 if( ++socket_info->send_idx == QP_ATTRIB_SQ_DEPTH )
\r
470 socket_info->send_idx = 0;
\r
473 *lpErrno = WSA_IO_PENDING;
\r
475 IBSP_TRACE1( IBSP_DBG_IO,
\r
476 ("Posted RDMA: socket=%p, ov=%p, type=%d, local=%p, len=%d, "
\r
477 "dest=%016I64x, rkey=%08x\n",
\r
478 s, lpOverlapped, wr_type, lpBuffers[0].buf, lpBuffers[0].len,
\r
479 send_wr.remote_ops.vaddr, send_wr.remote_ops.rkey) );
\r
481 fzprint(("posted RDMA %p, len=%d, op=%d, mr handle=%p\n",
\r
482 lpOverlapped, lpBuffers[0].len, wr_type, node));
\r
487 ("ib_post_send returned %s\n", ib_get_err_str( status )) );
\r
491 if( lpOverlapped->hEvent == 0 )
\r
493 cl_atomic_dec( &g_ibsp.overlap_h0_count );
\r
497 cl_atomic_dec( &g_ibsp.overlap_h1_count );
\r
498 cl_atomic_dec( &g_ibsp.overlap_h1_comp_count );
\r
501 memset( wr, 0x44, sizeof(struct _wr) );
\r
503 cl_atomic_dec( &socket_info->send_cnt );
\r
505 *lpErrno = ibal_to_wsa_error( status );
\r
508 cl_spinlock_release( &socket_info->send_lock );
\r
510 /* We never complete the operation here. */
\r
511 IBSP_EXIT( IBSP_DBG_IO );
\r
512 return SOCKET_ERROR;
\r
516 /* Function: IBSPRdmaWrite
\r
518 This is our provider's RdmaWrite function. When an app calls WSAIoctl
\r
519 to request the function pointer to RdmaWrite, we return a pointer to this
\r
520 function, which the application then calls directly through that pointer.
\r
525 IN LPWSABUFEX lpBuffers,
\r
526 IN DWORD dwBufferCount,
\r
527 IN LPVOID lpTargetBufferDescriptor,
\r
528 IN DWORD dwTargetDescriptorLength,
\r
529 IN DWORD dwTargetBufferOffset,
\r
530 OUT LPDWORD lpdwNumberOfBytesWritten,
\r
532 IN LPWSAOVERLAPPED lpOverlapped,
\r
533 IN LPWSAOVERLAPPED_COMPLETION_ROUTINE lpCompletionRoutine,
\r
534 IN LPWSATHREADID lpThreadId,
\r
535 OUT LPINT lpErrno )
\r
539 IBSP_ENTER( IBSP_DBG_IO );
\r
541 UNUSED_PARAM( lpThreadId );
\r
542 UNUSED_PARAM( lpCompletionRoutine );
\r
543 UNUSED_PARAM( lpdwNumberOfBytesWritten );
\r
545 if( s == INVALID_SOCKET )
\r
547 IBSP_ERROR_EXIT( ("invalid socket handle %x\n", s) );
\r
548 *lpErrno = WSAENOTSOCK;
\r
549 return SOCKET_ERROR;
\r
552 fzprint(("%s():%d:0x%x:0x%x: socket=0x%p overlapped=0x%p\n", __FUNCTION__,
\r
553 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), s, lpOverlapped));
\r
555 /* Store the flags for reporting back in IBSPGetOverlappedResult */
\r
556 lpOverlapped->Offset = dwFlags;
\r
558 ret = do_rdma_op( s, lpBuffers, dwBufferCount, lpTargetBufferDescriptor,
\r
559 dwTargetDescriptorLength, dwTargetBufferOffset,
\r
560 lpOverlapped, WR_RDMA_WRITE, lpErrno );
\r
562 IBSP_EXIT( IBSP_DBG_IO );
\r
568 /* Function: IBSPRdmaRead
\r
570 This is our provider's RdmaRead function. When an app calls WSAIoctl
\r
571 to request the function pointer to RdmaRead, we return a pointer to this
\r
572 function, which the application then calls directly through that pointer.
\r
577 IN LPWSABUFEX lpBuffers,
\r
578 IN DWORD dwBufferCount,
\r
579 IN LPVOID lpTargetBufferDescriptor,
\r
580 IN DWORD dwTargetDescriptorLength,
\r
581 IN DWORD dwTargetBufferOffset,
\r
582 OUT LPDWORD lpdwNumberOfBytesRead,
\r
584 IN LPWSAOVERLAPPED lpOverlapped,
\r
585 IN LPWSAOVERLAPPED_COMPLETION_ROUTINE lpCompletionRoutine,
\r
586 IN LPWSATHREADID lpThreadId,
\r
587 OUT LPINT lpErrno )
\r
591 IBSP_ENTER( IBSP_DBG_IO );
\r
593 UNUSED_PARAM( lpThreadId );
\r
594 UNUSED_PARAM( lpCompletionRoutine );
\r
595 UNUSED_PARAM( lpdwNumberOfBytesRead );
\r
597 fzprint(("%s():%d:0x%x:0x%x: socket=0x%p overlapped=0x%p \n", __FUNCTION__,
\r
598 __LINE__, GetCurrentProcessId(), GetCurrentThreadId(), s, lpOverlapped));
\r
600 /* Store the flags for reporting back in IBSPGetOverlappedResult */
\r
601 lpOverlapped->Offset = dwFlags;
\r
603 ret = do_rdma_op( s, lpBuffers, dwBufferCount, lpTargetBufferDescriptor,
\r
604 dwTargetDescriptorLength, dwTargetBufferOffset,
\r
605 lpOverlapped, WR_RDMA_READ, lpErrno );
\r
607 IBSP_EXIT( IBSP_DBG_IO );
\r
613 /* Function: IBSPMemoryRegistrationCacheCallback
\r
615 * This is our provider's MemoryRegistrationCacheCallback
\r
616 * function. When an app calls WSAIoctl to request the function
\r
617 * pointer to MemoryRegistrationCacheCallback, we return a pointer to
\r
618 * this function and this function is called by application directly
\r
619 * using the function pointer.
\r
622 IBSPMemoryRegistrationCacheCallback(
\r
623 IN LPVOID lpvAddress,
\r
625 OUT LPINT lpErrno )
\r
627 cl_list_item_t *p_item;
\r
629 IBSP_ENTER( IBSP_DBG_MEM );
\r
631 UNUSED_PARAM( lpErrno );
\r
633 cl_spinlock_acquire( &g_ibsp.hca_mutex );
\r
634 for( p_item = cl_qlist_head( &g_ibsp.hca_list );
\r
635 p_item != cl_qlist_end( &g_ibsp.hca_list );
\r
636 p_item = cl_qlist_next( p_item ) )
\r
638 ibsp_hca_flush_mr_cache(
\r
639 PARENT_STRUCT( p_item, struct ibsp_hca, item ), lpvAddress, Size );
\r
641 cl_spinlock_release( &g_ibsp.hca_mutex );
\r
643 IBSP_EXIT( IBSP_DBG_MEM );
\r