/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <gpxe/if_arp.h>
#include <gpxe/iobuf.h>
#include <gpxe/netdevice.h>
#include <gpxe/infiniband.h>
#include <gpxe/ipoib.h>

/** Number of IPoIB data send work queue entries */
#define IPOIB_DATA_NUM_SEND_WQES 2

/** Number of IPoIB data receive work queue entries */
#define IPOIB_DATA_NUM_RECV_WQES 4

/** Number of IPoIB data completion entries */
#define IPOIB_DATA_NUM_CQES 8

/** Number of IPoIB metadata send work queue entries */
#define IPOIB_META_NUM_SEND_WQES 2

/** Number of IPoIB metadata receive work queue entries */
#define IPOIB_META_NUM_RECV_WQES 2

/** Number of IPoIB metadata completion entries */
#define IPOIB_META_NUM_CQES 8

/**
 * An IPoIB queue set
 *
 * Groups a completion queue with the queue pair that reports into it,
 * together with the bookkeeping used to keep the receive ring topped up.
 */
struct ipoib_queue_set {
	/** Completion queue */
	struct ib_completion_queue *cq;
	/** Queue pair */
	struct ib_queue_pair *qp;
	/** Receive work queue fill level */
	unsigned int recv_fill;
	/** Receive work queue maximum fill level */
	unsigned int recv_max_fill;
};

66 /** An IPoIB device */
69 struct net_device *netdev;
70 /** Underlying Infiniband device */
71 struct ib_device *ibdev;
73 struct ipoib_queue_set data;
75 struct ipoib_queue_set meta;
77 struct ib_gid broadcast_gid;
79 unsigned int broadcast_lid;
81 unsigned long data_qkey;
82 /** Attached to multicast group
84 * This flag indicates whether or not we have attached our
85 * data queue pair to the broadcast multicast GID.
87 int broadcast_attached;
91 * IPoIB path cache entry
93 * This serves a similar role to the ARP cache for Ethernet. (ARP
94 * *is* used on IPoIB; we have two caches to maintain.)
96 struct ipoib_cached_path {
97 /** Destination GID */
99 /** Destination LID */
107 /** Number of IPoIB path cache entries */
108 #define IPOIB_NUM_CACHED_PATHS 2
110 /** IPoIB path cache */
111 static struct ipoib_cached_path ipoib_path_cache[IPOIB_NUM_CACHED_PATHS];
113 /** Oldest IPoIB path cache entry index */
114 static unsigned int ipoib_path_cache_idx = 0;
/** TID half used to identify get path record replies */
#define IPOIB_TID_GET_PATH_REC 0x11111111UL

/** TID half used to identify multicast member record replies */
#define IPOIB_TID_MC_MEMBER_REC 0x22222222UL

/** IPoIB metadata TID
 *
 * Incremented for every metadata request sent; stored in the other
 * half of the MAD transaction identifier.
 */
static uint32_t ipoib_meta_tid = 0;

125 /** IPv4 broadcast GID */
126 static const struct ib_gid ipv4_broadcast_gid = {
127 { { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
128 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff } }
/** Maximum time we will wait for the broadcast join to succeed */
#define IPOIB_JOIN_MAX_DELAY_MS 1000

/****************************************************************************
 *
 * IPoIB link layer
 *
 ****************************************************************************
 */

/** Broadcast QPN used in IPoIB MAC addresses
 *
 * This is a guaranteed invalid real QPN
 */
#define IPOIB_BROADCAST_QPN 0xffffffffUL

147 /** Broadcast IPoIB address */
148 static struct ipoib_mac ipoib_broadcast = {
149 .qpn = ntohl ( IPOIB_BROADCAST_QPN ),
153 * Add IPoIB link-layer header
155 * @v iobuf I/O buffer
156 * @v ll_dest Link-layer destination address
157 * @v ll_source Source link-layer address
158 * @v net_proto Network-layer protocol, in network-byte order
159 * @ret rc Return status code
161 static int ipoib_push ( struct io_buffer *iobuf, const void *ll_dest,
162 const void *ll_source __unused, uint16_t net_proto ) {
163 struct ipoib_hdr *ipoib_hdr =
164 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
166 /* Build IPoIB header */
167 memcpy ( &ipoib_hdr->pseudo.peer, ll_dest,
168 sizeof ( ipoib_hdr->pseudo.peer ) );
169 ipoib_hdr->real.proto = net_proto;
170 ipoib_hdr->real.reserved = 0;
176 * Remove IPoIB link-layer header
178 * @v iobuf I/O buffer
179 * @ret ll_dest Link-layer destination address
180 * @ret ll_source Source link-layer address
181 * @ret net_proto Network-layer protocol, in network-byte order
182 * @ret rc Return status code
184 static int ipoib_pull ( struct io_buffer *iobuf, const void **ll_dest,
185 const void **ll_source, uint16_t *net_proto ) {
186 struct ipoib_hdr *ipoib_hdr = iobuf->data;
189 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
190 DBG ( "IPoIB packet too short for link-layer header\n" );
191 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
195 /* Strip off IPoIB header */
196 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
198 /* Fill in required fields */
199 *ll_dest = &ipoib_broadcast; /* Doesn't really exist in packet */
200 *ll_source = &ipoib_hdr->pseudo.peer;
201 *net_proto = ipoib_hdr->real.proto;
207 * Transcribe IPoIB address
209 * @v ll_addr Link-layer address
210 * @ret string Link-layer address in human-readable format
212 const char * ipoib_ntoa ( const void *ll_addr ) {
214 const struct ipoib_mac *mac = ll_addr;
216 snprintf ( buf, sizeof ( buf ), "%08lx:%08lx:%08lx:%08lx:%08lx",
217 htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
218 htonl ( mac->gid.u.dwords[1] ),
219 htonl ( mac->gid.u.dwords[2] ),
220 htonl ( mac->gid.u.dwords[3] ) );
225 * Hash multicast address
227 * @v af Address family
228 * @v net_addr Network-layer address
229 * @v ll_addr Link-layer address to fill in
230 * @ret rc Return status code
232 static int ipoib_mc_hash ( unsigned int af __unused,
233 const void *net_addr __unused,
234 void *ll_addr __unused ) {
239 /** IPoIB protocol */
240 struct ll_protocol ipoib_protocol __ll_protocol = {
242 .ll_proto = htons ( ARPHRD_INFINIBAND ),
243 .ll_addr_len = IPOIB_ALEN,
244 .ll_header_len = IPOIB_HLEN,
245 .ll_broadcast = ( uint8_t * ) &ipoib_broadcast,
249 .mc_hash = ipoib_mc_hash,
/****************************************************************************
 *
 * IPoIB network device
 *
 ****************************************************************************
 */

262 * @v ipoib IPoIB device
265 static void ipoib_destroy_qset ( struct ipoib_device *ipoib,
266 struct ipoib_queue_set *qset ) {
267 struct ib_device *ibdev = ipoib->ibdev;
270 ib_destroy_qp ( ibdev, qset->qp );
272 ib_destroy_cq ( ibdev, qset->cq );
273 memset ( qset, 0, sizeof ( *qset ) );
279 * @v ipoib IPoIB device
281 * @v num_cqes Number of completion queue entries
282 * @v num_send_wqes Number of send work queue entries
283 * @v complete_send Send completion handler
284 * @v num_recv_wqes Number of receive work queue entries
285 * @v complete_recv Receive completion handler
287 * @ret rc Return status code
289 static int ipoib_create_qset ( struct ipoib_device *ipoib,
290 struct ipoib_queue_set *qset,
291 unsigned int num_cqes,
292 unsigned int num_send_wqes,
293 ib_completer_t complete_send,
294 unsigned int num_recv_wqes,
295 ib_completer_t complete_recv,
296 unsigned long qkey ) {
297 struct ib_device *ibdev = ipoib->ibdev;
301 assert ( qset->cq == NULL );
302 assert ( qset->qp == NULL );
304 /* Store queue parameters */
305 qset->recv_max_fill = num_recv_wqes;
307 /* Allocate completion queue */
308 qset->cq = ib_create_cq ( ibdev, num_cqes, complete_send,
311 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
317 /* Allocate queue pair */
318 qset->qp = ib_create_qp ( ibdev, num_send_wqes, qset->cq,
319 num_recv_wqes, qset->cq, qkey );
321 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
326 ib_qp_set_ownerdata ( qset->qp, ipoib->netdev );
331 ipoib_destroy_qset ( ipoib, qset );
336 * Find path cache entry by GID
339 * @ret entry Path cache entry, or NULL
341 static struct ipoib_cached_path *
342 ipoib_find_cached_path ( struct ib_gid *gid ) {
343 struct ipoib_cached_path *path;
346 for ( i = 0 ; i < IPOIB_NUM_CACHED_PATHS ; i++ ) {
347 path = &ipoib_path_cache[i];
348 if ( memcmp ( &path->gid, gid, sizeof ( *gid ) ) == 0 )
351 DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx cache miss\n",
352 htonl ( gid->u.dwords[0] ), htonl ( gid->u.dwords[1] ),
353 htonl ( gid->u.dwords[2] ), htonl ( gid->u.dwords[3] ) );
358 * Transmit path record request
360 * @v ipoib IPoIB device
361 * @v gid Destination GID
362 * @ret rc Return status code
364 static int ipoib_get_path_record ( struct ipoib_device *ipoib,
365 struct ib_gid *gid ) {
366 struct ib_device *ibdev = ipoib->ibdev;
367 struct io_buffer *iobuf;
368 struct ib_mad_path_record *path_record;
369 struct ib_address_vector av;
372 /* Allocate I/O buffer */
373 iobuf = alloc_iob ( sizeof ( *path_record ) );
376 iob_put ( iobuf, sizeof ( *path_record ) );
377 path_record = iobuf->data;
378 memset ( path_record, 0, sizeof ( *path_record ) );
380 /* Construct path record request */
381 path_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
382 path_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
383 path_record->mad_hdr.class_version = 2;
384 path_record->mad_hdr.method = IB_MGMT_METHOD_GET;
385 path_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_PATH_REC );
386 path_record->mad_hdr.tid[0] = IPOIB_TID_GET_PATH_REC;
387 path_record->mad_hdr.tid[1] = ipoib_meta_tid++;
388 path_record->sa_hdr.comp_mask[1] =
389 htonl ( IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID );
390 memcpy ( &path_record->dgid, gid, sizeof ( path_record->dgid ) );
391 memcpy ( &path_record->sgid, &ibdev->port_gid,
392 sizeof ( path_record->sgid ) );
394 /* Construct address vector */
395 memset ( &av, 0, sizeof ( av ) );
396 av.dlid = ibdev->sm_lid;
397 av.dest_qp = IB_SA_QPN;
398 av.qkey = IB_GLOBAL_QKEY;
400 /* Post send request */
401 if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
403 DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
404 ipoib, strerror ( rc ) );
413 * Transmit multicast group membership request
415 * @v ipoib IPoIB device
416 * @v gid Multicast GID
417 * @v join Join (rather than leave) group
418 * @ret rc Return status code
420 static int ipoib_mc_member_record ( struct ipoib_device *ipoib,
421 struct ib_gid *gid, int join ) {
422 struct ib_device *ibdev = ipoib->ibdev;
423 struct io_buffer *iobuf;
424 struct ib_mad_mc_member_record *mc_member_record;
425 struct ib_address_vector av;
428 /* Allocate I/O buffer */
429 iobuf = alloc_iob ( sizeof ( *mc_member_record ) );
432 iob_put ( iobuf, sizeof ( *mc_member_record ) );
433 mc_member_record = iobuf->data;
434 memset ( mc_member_record, 0, sizeof ( *mc_member_record ) );
436 /* Construct path record request */
437 mc_member_record->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
438 mc_member_record->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
439 mc_member_record->mad_hdr.class_version = 2;
440 mc_member_record->mad_hdr.method =
441 ( join ? IB_MGMT_METHOD_SET : IB_MGMT_METHOD_DELETE );
442 mc_member_record->mad_hdr.attr_id = htons ( IB_SA_ATTR_MC_MEMBER_REC );
443 mc_member_record->mad_hdr.tid[0] = IPOIB_TID_MC_MEMBER_REC;
444 mc_member_record->mad_hdr.tid[1] = ipoib_meta_tid++;
445 mc_member_record->sa_hdr.comp_mask[1] =
446 htonl ( IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
447 IB_SA_MCMEMBER_REC_JOIN_STATE );
448 mc_member_record->scope__join_state = 1;
449 memcpy ( &mc_member_record->mgid, gid,
450 sizeof ( mc_member_record->mgid ) );
451 memcpy ( &mc_member_record->port_gid, &ibdev->port_gid,
452 sizeof ( mc_member_record->port_gid ) );
454 /* Construct address vector */
455 memset ( &av, 0, sizeof ( av ) );
456 av.dlid = ibdev->sm_lid;
457 av.dest_qp = IB_SA_QPN;
458 av.qkey = IB_GLOBAL_QKEY;
460 /* Post send request */
461 if ( ( rc = ib_post_send ( ibdev, ipoib->meta.qp, &av,
463 DBGC ( ipoib, "IPoIB %p could not send get path record: %s\n",
464 ipoib, strerror ( rc ) );
473 * Transmit packet via IPoIB network device
475 * @v netdev Network device
476 * @v iobuf I/O buffer
477 * @ret rc Return status code
479 static int ipoib_transmit ( struct net_device *netdev,
480 struct io_buffer *iobuf ) {
481 struct ipoib_device *ipoib = netdev->priv;
482 struct ib_device *ibdev = ipoib->ibdev;
483 struct ipoib_pseudo_hdr *ipoib_pshdr = iobuf->data;
484 struct ib_address_vector av;
486 struct ipoib_cached_path *path;
490 if ( iob_len ( iobuf ) < sizeof ( *ipoib_pshdr ) ) {
491 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
494 iob_pull ( iobuf, ( sizeof ( *ipoib_pshdr ) ) );
496 /* Attempting transmission while link is down will put the
497 * queue pair into an error state, so don't try it.
499 if ( ! ibdev->link_up )
502 /* Construct address vector */
503 memset ( &av, 0, sizeof ( av ) );
504 av.qkey = IB_GLOBAL_QKEY;
506 if ( ipoib_pshdr->peer.qpn == htonl ( IPOIB_BROADCAST_QPN ) ) {
507 /* Broadcast address */
508 av.dest_qp = IB_BROADCAST_QPN;
509 av.dlid = ipoib->broadcast_lid;
510 gid = &ipoib->broadcast_gid;
512 /* Unicast - look in path cache */
513 path = ipoib_find_cached_path ( &ipoib_pshdr->peer.gid );
515 /* No path entry - get path record */
516 rc = ipoib_get_path_record ( ipoib,
517 &ipoib_pshdr->peer.gid );
518 netdev_tx_complete ( netdev, iobuf );
521 av.dest_qp = ntohl ( ipoib_pshdr->peer.qpn );
522 av.dlid = path->dlid;
523 av.rate = path->rate;
525 gid = &ipoib_pshdr->peer.gid;
527 memcpy ( &av.gid, gid, sizeof ( av.gid ) );
529 return ib_post_send ( ibdev, ipoib->data.qp, &av, iobuf );
533 * Handle IPoIB data send completion
535 * @v ibdev Infiniband device
537 * @v completion Completion
538 * @v iobuf I/O buffer
540 static void ipoib_data_complete_send ( struct ib_device *ibdev __unused,
541 struct ib_queue_pair *qp,
542 struct ib_completion *completion,
543 struct io_buffer *iobuf ) {
544 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
546 netdev_tx_complete_err ( netdev, iobuf,
547 ( completion->syndrome ? -EIO : 0 ) );
551 * Handle IPoIB data receive completion
553 * @v ibdev Infiniband device
555 * @v completion Completion
556 * @v iobuf I/O buffer
558 static void ipoib_data_complete_recv ( struct ib_device *ibdev __unused,
559 struct ib_queue_pair *qp,
560 struct ib_completion *completion,
561 struct io_buffer *iobuf ) {
562 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
563 struct ipoib_device *ipoib = netdev->priv;
564 struct ipoib_pseudo_hdr *ipoib_pshdr;
566 if ( completion->syndrome ) {
567 netdev_rx_err ( netdev, iobuf, -EIO );
571 iob_put ( iobuf, completion->len );
572 if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
573 DBGC ( ipoib, "IPoIB %p received data packet too short to "
574 "contain GRH\n", ipoib );
575 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
576 netdev_rx_err ( netdev, iobuf, -EIO );
579 iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
581 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_real_hdr ) ) {
582 DBGC ( ipoib, "IPoIB %p received data packet too short to "
583 "contain IPoIB header\n", ipoib );
584 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
585 netdev_rx_err ( netdev, iobuf, -EIO );
589 ipoib_pshdr = iob_push ( iobuf, sizeof ( *ipoib_pshdr ) );
590 /* FIXME: fill in a MAC address for the sake of AoE! */
592 netdev_rx ( netdev, iobuf );
595 ipoib->data.recv_fill--;
599 * Handle IPoIB metadata send completion
601 * @v ibdev Infiniband device
603 * @v completion Completion
604 * @v iobuf I/O buffer
606 static void ipoib_meta_complete_send ( struct ib_device *ibdev __unused,
607 struct ib_queue_pair *qp,
608 struct ib_completion *completion,
609 struct io_buffer *iobuf ) {
610 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
611 struct ipoib_device *ipoib = netdev->priv;
613 if ( completion->syndrome ) {
614 DBGC ( ipoib, "IPoIB %p metadata TX completion error %x\n",
615 ipoib, completion->syndrome );
621 * Handle received IPoIB path record
623 * @v ipoib IPoIB device
624 * @v path_record Path record
626 static void ipoib_recv_path_record ( struct ipoib_device *ipoib __unused,
627 struct ib_mad_path_record *path_record ) {
628 struct ipoib_cached_path *path;
630 /* Update path cache entry */
631 path = &ipoib_path_cache[ipoib_path_cache_idx];
632 memcpy ( &path->gid, &path_record->dgid, sizeof ( path->gid ) );
633 path->dlid = ntohs ( path_record->dlid );
634 path->sl = ( path_record->reserved__sl & 0x0f );
635 path->rate = ( path_record->rate_selector__rate & 0x3f );
637 DBG ( "IPoIB %08lx:%08lx:%08lx:%08lx dlid %x sl %x rate %x\n",
638 htonl ( path->gid.u.dwords[0] ), htonl ( path->gid.u.dwords[1] ),
639 htonl ( path->gid.u.dwords[2] ), htonl ( path->gid.u.dwords[3] ),
640 path->dlid, path->sl, path->rate );
642 /* Update path cache index */
643 ipoib_path_cache_idx++;
644 if ( ipoib_path_cache_idx == IPOIB_NUM_CACHED_PATHS )
645 ipoib_path_cache_idx = 0;
649 * Handle received IPoIB multicast membership record
651 * @v ipoib IPoIB device
652 * @v mc_member_record Multicast membership record
654 static void ipoib_recv_mc_member_record ( struct ipoib_device *ipoib,
655 struct ib_mad_mc_member_record *mc_member_record ) {
659 /* Record parameters */
660 joined = ( mc_member_record->scope__join_state & 0x0f );
661 ipoib->data_qkey = ntohl ( mc_member_record->qkey );
662 ipoib->broadcast_lid = ntohs ( mc_member_record->mlid );
663 DBGC ( ipoib, "IPoIB %p %s broadcast group: qkey %lx mlid %x\n",
664 ipoib, ( joined ? "joined" : "left" ), ipoib->data_qkey,
665 ipoib->broadcast_lid );
667 /* Update data queue pair qkey */
668 if ( ( rc = ib_modify_qp ( ipoib->ibdev, ipoib->data.qp,
669 IB_MODIFY_QKEY, ipoib->data_qkey ) ) != 0 ){
670 DBGC ( ipoib, "IPoIB %p could not update data qkey: %s\n",
671 ipoib, strerror ( rc ) );
677 * Handle IPoIB metadata receive completion
679 * @v ibdev Infiniband device
681 * @v completion Completion
682 * @v iobuf I/O buffer
684 static void ipoib_meta_complete_recv ( struct ib_device *ibdev __unused,
685 struct ib_queue_pair *qp,
686 struct ib_completion *completion,
687 struct io_buffer *iobuf ) {
688 struct net_device *netdev = ib_qp_get_ownerdata ( qp );
689 struct ipoib_device *ipoib = netdev->priv;
692 if ( completion->syndrome ) {
693 DBGC ( ipoib, "IPoIB %p metadata RX completion error %x\n",
694 ipoib, completion->syndrome );
698 iob_put ( iobuf, completion->len );
699 if ( iob_len ( iobuf ) < sizeof ( struct ib_global_route_header ) ) {
700 DBGC ( ipoib, "IPoIB %p received metadata packet too short "
701 "to contain GRH\n", ipoib );
702 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
705 iob_pull ( iobuf, sizeof ( struct ib_global_route_header ) );
706 if ( iob_len ( iobuf ) < sizeof ( *mad ) ) {
707 DBGC ( ipoib, "IPoIB %p received metadata packet too short "
708 "to contain reply\n", ipoib );
709 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
714 if ( mad->mad_hdr.status != 0 ) {
715 DBGC ( ipoib, "IPoIB %p metadata RX err status %04x\n",
716 ipoib, ntohs ( mad->mad_hdr.status ) );
720 switch ( mad->mad_hdr.tid[0] ) {
721 case IPOIB_TID_GET_PATH_REC:
722 ipoib_recv_path_record ( ipoib, &mad->path_record );
724 case IPOIB_TID_MC_MEMBER_REC:
725 ipoib_recv_mc_member_record ( ipoib, &mad->mc_member_record );
728 DBGC ( ipoib, "IPoIB %p unwanted response:\n",
730 DBGC_HD ( ipoib, mad, sizeof ( *mad ) );
735 ipoib->meta.recv_fill--;
740 * Refill IPoIB receive ring
742 * @v ipoib IPoIB device
744 static void ipoib_refill_recv ( struct ipoib_device *ipoib,
745 struct ipoib_queue_set *qset ) {
746 struct ib_device *ibdev = ipoib->ibdev;
747 struct io_buffer *iobuf;
750 while ( qset->recv_fill < qset->recv_max_fill ) {
751 iobuf = alloc_iob ( IPOIB_PKT_LEN );
754 if ( ( rc = ib_post_recv ( ibdev, qset->qp, iobuf ) ) != 0 ) {
763 * Poll IPoIB network device
765 * @v netdev Network device
767 static void ipoib_poll ( struct net_device *netdev ) {
768 struct ipoib_device *ipoib = netdev->priv;
769 struct ib_device *ibdev = ipoib->ibdev;
771 ib_poll_cq ( ibdev, ipoib->meta.cq );
772 ib_poll_cq ( ibdev, ipoib->data.cq );
773 ipoib_refill_recv ( ipoib, &ipoib->meta );
774 ipoib_refill_recv ( ipoib, &ipoib->data );
778 * Enable/disable interrupts on IPoIB network device
780 * @v netdev Network device
781 * @v enable Interrupts should be enabled
783 static void ipoib_irq ( struct net_device *netdev __unused,
784 int enable __unused ) {
785 /* No implementation */
789 * Join IPv4 broadcast multicast group
791 * @v ipoib IPoIB device
792 * @ret rc Return status code
794 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
798 if ( ! ipoib->data.qp )
801 /* Attach data queue to broadcast multicast GID */
802 assert ( ipoib->broadcast_attached == 0 );
803 if ( ( rc = ib_mcast_attach ( ipoib->ibdev, ipoib->data.qp,
804 &ipoib->broadcast_gid ) ) != 0 ){
805 DBGC ( ipoib, "IPoIB %p could not attach to broadcast GID: "
806 "%s\n", ipoib, strerror ( rc ) );
809 ipoib->broadcast_attached = 1;
811 /* Initiate broadcast group join */
812 if ( ( rc = ipoib_mc_member_record ( ipoib, &ipoib->broadcast_gid,
814 DBGC ( ipoib, "IPoIB %p could not send broadcast join: %s\n",
815 ipoib, strerror ( rc ) );
819 /* We will set link up on the network device when we receive
820 * the broadcast join response.
827 * Leave IPv4 broadcast multicast group
829 * @v ipoib IPoIB device
831 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
833 /* Detach data queue from broadcast multicast GID */
834 if ( ipoib->broadcast_attached ) {
835 assert ( ipoib->data.qp != NULL );
836 ib_mcast_detach ( ipoib->ibdev, ipoib->data.qp,
837 &ipoib->broadcast_gid );
838 ipoib->broadcast_attached = 0;
843 * Open IPoIB network device
845 * @v netdev Network device
846 * @ret rc Return status code
848 static int ipoib_open ( struct net_device *netdev ) {
849 struct ipoib_device *ipoib = netdev->priv;
850 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
853 /* Allocate metadata queue set */
854 if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->meta,
856 IPOIB_META_NUM_SEND_WQES,
857 ipoib_meta_complete_send,
858 IPOIB_META_NUM_RECV_WQES,
859 ipoib_meta_complete_recv,
860 IB_GLOBAL_QKEY ) ) != 0 ) {
861 DBGC ( ipoib, "IPoIB %p could not allocate metadata QP: %s\n",
862 ipoib, strerror ( rc ) );
863 goto err_create_meta_qset;
866 /* Allocate data queue set */
867 if ( ( rc = ipoib_create_qset ( ipoib, &ipoib->data,
869 IPOIB_DATA_NUM_SEND_WQES,
870 ipoib_data_complete_send,
871 IPOIB_DATA_NUM_RECV_WQES,
872 ipoib_data_complete_recv,
873 IB_GLOBAL_QKEY ) ) != 0 ) {
874 DBGC ( ipoib, "IPoIB %p could not allocate data QP: %s\n",
875 ipoib, strerror ( rc ) );
876 goto err_create_data_qset;
879 /* Update MAC address with data QPN */
880 mac->qpn = htonl ( ipoib->data.qp->qpn );
882 /* Fill receive rings */
883 ipoib_refill_recv ( ipoib, &ipoib->meta );
884 ipoib_refill_recv ( ipoib, &ipoib->data );
886 /* Join broadcast group */
887 if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
888 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
889 ipoib, strerror ( rc ) );
890 goto err_join_broadcast;
896 ipoib_destroy_qset ( ipoib, &ipoib->data );
897 err_create_data_qset:
898 ipoib_destroy_qset ( ipoib, &ipoib->meta );
899 err_create_meta_qset:
904 * Close IPoIB network device
906 * @v netdev Network device
908 static void ipoib_close ( struct net_device *netdev ) {
909 struct ipoib_device *ipoib = netdev->priv;
910 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
912 /* Leave broadcast group */
913 ipoib_leave_broadcast_group ( ipoib );
915 /* Remove data QPN from MAC address */
918 /* Tear down the queues */
919 ipoib_destroy_qset ( ipoib, &ipoib->data );
920 ipoib_destroy_qset ( ipoib, &ipoib->meta );
923 /** IPoIB network device operations */
924 static struct net_device_operations ipoib_operations = {
926 .close = ipoib_close,
927 .transmit = ipoib_transmit,
933 * Update IPoIB dynamic Infiniband parameters
935 * @v ipoib IPoIB device
937 * The Infiniband port GID and partition key will change at runtime,
938 * when the link is established (or lost). The MAC address is based
939 * on the port GID, and the broadcast GID is based on the partition
940 * key. This function recalculates these IPoIB device parameters.
942 static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
943 struct ib_device *ibdev = ipoib->ibdev;
944 struct net_device *netdev = ipoib->netdev;
945 struct ipoib_mac *mac;
947 /* Calculate GID portion of MAC address based on port GID */
948 mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
949 memcpy ( &mac->gid, &ibdev->port_gid, sizeof ( mac->gid ) );
951 /* Calculate broadcast GID based on partition key */
952 memcpy ( &ipoib->broadcast_gid, &ipv4_broadcast_gid,
953 sizeof ( ipoib->broadcast_gid ) );
954 ipoib->broadcast_gid.u.words[2] = htons ( ibdev->pkey );
956 /* Set net device link state to reflect Infiniband link state */
957 if ( ibdev->link_up ) {
958 netdev_link_up ( netdev );
960 netdev_link_down ( netdev );
965 * Handle link status change
967 * @v ibdev Infiniband device
969 void ipoib_link_state_changed ( struct ib_device *ibdev ) {
970 struct net_device *netdev = ib_get_ownerdata ( ibdev );
971 struct ipoib_device *ipoib = netdev->priv;
974 /* Leave existing broadcast group */
975 ipoib_leave_broadcast_group ( ipoib );
977 /* Update MAC address and broadcast GID based on new port GID
980 ipoib_set_ib_params ( ipoib );
982 /* Join new broadcast group */
983 if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
984 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
985 "%s\n", ipoib, strerror ( rc ) );
993 * @v ibdev Infiniband device
994 * @ret rc Return status code
996 int ipoib_probe ( struct ib_device *ibdev ) {
997 struct net_device *netdev;
998 struct ipoib_device *ipoib;
1001 /* Allocate network device */
1002 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
1005 netdev_init ( netdev, &ipoib_operations );
1006 ipoib = netdev->priv;
1007 ib_set_ownerdata ( ibdev, netdev );
1008 netdev->dev = ibdev->dev;
1009 memset ( ipoib, 0, sizeof ( *ipoib ) );
1010 ipoib->netdev = netdev;
1011 ipoib->ibdev = ibdev;
1013 /* Calculate as much of the broadcast GID and the MAC address
1014 * as we can. We won't know either of these in full until we
1017 ipoib_set_ib_params ( ipoib );
1019 /* Register network device */
1020 if ( ( rc = register_netdev ( netdev ) ) != 0 )
1021 goto err_register_netdev;
1025 err_register_netdev:
1026 netdev_nullify ( netdev );
1027 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Unregisters the network device recorded as the Infiniband device's
 * owner data and drops the reference to it.
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev = ib_get_ownerdata ( ibdev );

	unregister_netdev ( netdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}