/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/* Declare the licence of this file: GPL version 2 or any later version */
FILE_LICENCE ( GPL2_OR_LATER );
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <gpxe/errortab.h>
#include <gpxe/if_arp.h>
#include <gpxe/iobuf.h>
#include <gpxe/netdevice.h>
#include <gpxe/infiniband.h>
#include <gpxe/ib_pathrec.h>
#include <gpxe/ib_mcast.h>
#include <gpxe/ipoib.h>
/** Number of IPoIB send work queue entries */
#define IPOIB_NUM_SEND_WQES 2

/** Number of IPoIB receive work queue entries */
#define IPOIB_NUM_RECV_WQES 4

/** Number of IPoIB completion entries */
#define IPOIB_NUM_CQES 8
50 /** An IPoIB device */
53 struct net_device *netdev;
54 /** Underlying Infiniband device */
55 struct ib_device *ibdev;
56 /** Completion queue */
57 struct ib_completion_queue *cq;
59 struct ib_queue_pair *qp;
61 struct ipoib_mac broadcast;
62 /** Joined to IPv4 broadcast multicast group
64 * This flag indicates whether or not we have initiated the
65 * join to the IPv4 broadcast multicast group.
68 /** IPv4 broadcast multicast group membership */
69 struct ib_mc_membership broadcast_membership;
72 /** Broadcast IPoIB address */
73 static struct ipoib_mac ipoib_broadcast = {
74 .qpn = htonl ( IB_QPN_BROADCAST ),
75 .gid.u.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
76 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
/** Link status for "broadcast join in progress" */
#define EINPROGRESS_JOINING ( EINPROGRESS | EUNIQ_01 )
82 /** Human-readable message for the link status */
83 struct errortab ipoib_errors[] __errortab = {
84 { EINPROGRESS_JOINING, "Joining" },
/****************************************************************************
 *
 * IPoIB peer cache
 *
 ****************************************************************************
 */
97 * The IPoIB link-layer header is only four bytes long and so does not
98 * have sufficient room to store IPoIB MAC address(es). We therefore
99 * maintain a cache of MAC addresses identified by a single-byte key,
100 * and abuse the spare two bytes within the link-layer header to
101 * communicate these MAC addresses between the link-layer code and the
108 struct ipoib_mac mac;
/** Number of IPoIB peer cache entries
 *
 * Must be a power of two.
 */
#define IPOIB_NUM_CACHED_PEERS 4
117 /** IPoIB peer address cache */
118 static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
120 /** Oldest IPoIB peer cache entry index */
121 static unsigned int ipoib_peer_cache_idx = 1;
124 * Look up cached peer by key
126 * @v key Peer cache key
127 * @ret peer Peer cache entry, or NULL
129 static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) {
130 struct ipoib_peer *peer;
133 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
134 peer = &ipoib_peer_cache[i];
135 if ( peer->key == key )
140 DBG ( "IPoIB warning: peer cache lost track of key %x while "
141 "still in use\n", key );
147 * Store GID and QPN in peer cache
151 * @ret peer Peer cache entry
153 static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) {
154 struct ipoib_peer *peer;
158 /* Look for existing cache entry */
159 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
160 peer = &ipoib_peer_cache[i];
161 if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 )
165 /* No entry found: create a new one */
166 key = ipoib_peer_cache_idx++;
167 peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ];
169 DBG ( "IPoIB peer %x evicted from cache\n", peer->key );
171 memset ( peer, 0, sizeof ( *peer ) );
173 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
174 DBG ( "IPoIB peer %x has MAC %s\n",
175 peer->key, ipoib_ntoa ( &peer->mac ) );
/****************************************************************************
 *
 * IPoIB link layer
 *
 ****************************************************************************
 */
187 * Add IPoIB link-layer header
189 * @v netdev Network device
190 * @v iobuf I/O buffer
191 * @v ll_dest Link-layer destination address
192 * @v ll_source Source link-layer address
193 * @v net_proto Network-layer protocol, in network-byte order
194 * @ret rc Return status code
196 static int ipoib_push ( struct net_device *netdev __unused,
197 struct io_buffer *iobuf, const void *ll_dest,
198 const void *ll_source __unused, uint16_t net_proto ) {
199 struct ipoib_hdr *ipoib_hdr =
200 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
201 const struct ipoib_mac *dest_mac = ll_dest;
202 const struct ipoib_mac *src_mac = ll_source;
203 struct ipoib_peer *dest;
204 struct ipoib_peer *src;
206 /* Add link-layer addresses to cache */
207 dest = ipoib_cache_peer ( dest_mac );
208 src = ipoib_cache_peer ( src_mac );
210 /* Build IPoIB header */
211 ipoib_hdr->proto = net_proto;
212 ipoib_hdr->u.peer.dest = dest->key;
213 ipoib_hdr->u.peer.src = src->key;
219 * Remove IPoIB link-layer header
221 * @v netdev Network device
222 * @v iobuf I/O buffer
223 * @ret ll_dest Link-layer destination address
224 * @ret ll_source Source link-layer address
225 * @ret net_proto Network-layer protocol, in network-byte order
226 * @ret rc Return status code
228 static int ipoib_pull ( struct net_device *netdev,
229 struct io_buffer *iobuf, const void **ll_dest,
230 const void **ll_source, uint16_t *net_proto ) {
231 struct ipoib_device *ipoib = netdev->priv;
232 struct ipoib_hdr *ipoib_hdr = iobuf->data;
233 struct ipoib_peer *dest;
234 struct ipoib_peer *source;
237 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
238 DBG ( "IPoIB packet too short for link-layer header\n" );
239 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
243 /* Strip off IPoIB header */
244 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
246 /* Identify source and destination addresses, and clear
247 * reserved word in IPoIB header
249 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
250 source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src );
251 ipoib_hdr->u.reserved = 0;
253 /* Fill in required fields */
254 *ll_dest = ( dest ? &dest->mac : &ipoib->broadcast );
255 *ll_source = ( source ? &source->mac : &ipoib->broadcast );
256 *net_proto = ipoib_hdr->proto;
262 * Initialise IPoIB link-layer address
264 * @v hw_addr Hardware address
265 * @v ll_addr Link-layer address
267 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
268 const struct ib_gid_half *guid = hw_addr;
269 struct ipoib_mac *mac = ll_addr;
271 memset ( mac, 0, sizeof ( *mac ) );
272 memcpy ( &mac->gid.u.half[1], guid, sizeof ( mac->gid.u.half[1] ) );
276 * Transcribe IPoIB link-layer address
278 * @v ll_addr Link-layer address
279 * @ret string Link-layer address in human-readable format
281 const char * ipoib_ntoa ( const void *ll_addr ) {
283 const struct ipoib_mac *mac = ll_addr;
285 snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x",
286 htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
287 htonl ( mac->gid.u.dwords[1] ),
288 htonl ( mac->gid.u.dwords[2] ),
289 htonl ( mac->gid.u.dwords[3] ) );
294 * Hash multicast address
296 * @v af Address family
297 * @v net_addr Network-layer address
298 * @v ll_addr Link-layer address to fill in
299 * @ret rc Return status code
301 static int ipoib_mc_hash ( unsigned int af __unused,
302 const void *net_addr __unused,
303 void *ll_addr __unused ) {
309 * Generate Mellanox Ethernet-compatible compressed link-layer address
311 * @v ll_addr Link-layer address
312 * @v eth_addr Ethernet-compatible address to fill in
314 static int ipoib_mlx_eth_addr ( const struct ib_gid_half *guid,
315 uint8_t *eth_addr ) {
316 eth_addr[0] = ( ( guid->u.bytes[3] == 2 ) ? 0x00 : 0x02 );
317 eth_addr[1] = guid->u.bytes[1];
318 eth_addr[2] = guid->u.bytes[2];
319 eth_addr[3] = guid->u.bytes[5];
320 eth_addr[4] = guid->u.bytes[6];
321 eth_addr[5] = guid->u.bytes[7];
/** An IPoIB Ethernet-compatible compressed link-layer address generator */
struct ipoib_eth_addr_handler {
	/** GUID byte 1 */
	uint8_t byte1;
	/** GUID byte 2 */
	uint8_t byte2;
	/** Handler */
	int ( * eth_addr ) ( const struct ib_gid_half *guid,
			     uint8_t *eth_addr );
};
336 /** IPoIB Ethernet-compatible compressed link-layer address generators */
337 static struct ipoib_eth_addr_handler ipoib_eth_addr_handlers[] = {
338 { 0x02, 0xc9, ipoib_mlx_eth_addr },
342 * Generate Ethernet-compatible compressed link-layer address
344 * @v ll_addr Link-layer address
345 * @v eth_addr Ethernet-compatible address to fill in
347 static int ipoib_eth_addr ( const void *ll_addr, void *eth_addr ) {
348 const struct ipoib_mac *ipoib_addr = ll_addr;
349 const struct ib_gid_half *guid = &ipoib_addr->gid.u.half[1];
350 struct ipoib_eth_addr_handler *handler;
353 for ( i = 0 ; i < ( sizeof ( ipoib_eth_addr_handlers ) /
354 sizeof ( ipoib_eth_addr_handlers[0] ) ) ; i++ ) {
355 handler = &ipoib_eth_addr_handlers[i];
356 if ( ( handler->byte1 == guid->u.bytes[1] ) &&
357 ( handler->byte2 == guid->u.bytes[2] ) ) {
358 return handler->eth_addr ( guid, eth_addr );
364 /** IPoIB protocol */
365 struct ll_protocol ipoib_protocol __ll_protocol = {
367 .ll_proto = htons ( ARPHRD_INFINIBAND ),
368 .hw_addr_len = sizeof ( struct ib_gid_half ),
369 .ll_addr_len = IPOIB_ALEN,
370 .ll_header_len = IPOIB_HLEN,
373 .init_addr = ipoib_init_addr,
375 .mc_hash = ipoib_mc_hash,
376 .eth_addr = ipoib_eth_addr,
380 * Allocate IPoIB device
382 * @v priv_size Size of driver private data
383 * @ret netdev Network device, or NULL
385 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
386 struct net_device *netdev;
388 netdev = alloc_netdev ( priv_size );
390 netdev->ll_protocol = &ipoib_protocol;
391 netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast;
392 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
/****************************************************************************
 *
 * IPoIB network device
 *
 ****************************************************************************
 */
405 * Transmit packet via IPoIB network device
407 * @v netdev Network device
408 * @v iobuf I/O buffer
409 * @ret rc Return status code
411 static int ipoib_transmit ( struct net_device *netdev,
412 struct io_buffer *iobuf ) {
413 struct ipoib_device *ipoib = netdev->priv;
414 struct ib_device *ibdev = ipoib->ibdev;
415 struct ipoib_hdr *ipoib_hdr;
416 struct ipoib_peer *dest;
417 struct ib_address_vector av;
421 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
422 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
425 ipoib_hdr = iobuf->data;
427 /* Attempting transmission while link is down will put the
428 * queue pair into an error state, so don't try it.
430 if ( ! ib_link_ok ( ibdev ) )
433 /* Identify destination address */
434 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
437 ipoib_hdr->u.reserved = 0;
439 /* Construct address vector */
440 memset ( &av, 0, sizeof ( av ) );
441 av.qpn = ntohl ( dest->mac.qpn );
443 memcpy ( &av.gid, &dest->mac.gid, sizeof ( av.gid ) );
444 if ( ( rc = ib_resolve_path ( ibdev, &av ) ) != 0 ) {
445 /* Path not resolved yet */
449 return ib_post_send ( ibdev, ipoib->qp, &av, iobuf );
453 * Handle IPoIB send completion
455 * @v ibdev Infiniband device
457 * @v iobuf I/O buffer
458 * @v rc Completion status code
460 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
461 struct ib_queue_pair *qp,
462 struct io_buffer *iobuf, int rc ) {
463 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
465 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
469 * Handle IPoIB receive completion
471 * @v ibdev Infiniband device
473 * @v av Address vector, or NULL
474 * @v iobuf I/O buffer
475 * @v rc Completion status code
477 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
478 struct ib_queue_pair *qp,
479 struct ib_address_vector *av,
480 struct io_buffer *iobuf, int rc ) {
481 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
482 struct net_device *netdev = ipoib->netdev;
483 struct ipoib_hdr *ipoib_hdr;
484 struct ipoib_mac ll_src;
485 struct ipoib_peer *src;
488 netdev_rx_err ( netdev, iobuf, rc );
493 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
494 DBGC ( ipoib, "IPoIB %p received packet too short to "
495 "contain IPoIB header\n", ipoib );
496 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
497 netdev_rx_err ( netdev, iobuf, -EIO );
500 ipoib_hdr = iobuf->data;
502 /* Parse source address */
503 if ( av->gid_present ) {
504 ll_src.qpn = htonl ( av->qpn );
505 memcpy ( &ll_src.gid, &av->gid, sizeof ( ll_src.gid ) );
506 src = ipoib_cache_peer ( &ll_src );
507 ipoib_hdr->u.peer.src = src->key;
510 /* Hand off to network layer */
511 netdev_rx ( netdev, iobuf );
514 /** IPoIB completion operations */
515 static struct ib_completion_queue_operations ipoib_cq_op = {
516 .complete_send = ipoib_complete_send,
517 .complete_recv = ipoib_complete_recv,
521 * Poll IPoIB network device
523 * @v netdev Network device
525 static void ipoib_poll ( struct net_device *netdev ) {
526 struct ipoib_device *ipoib = netdev->priv;
527 struct ib_device *ibdev = ipoib->ibdev;
529 ib_poll_eq ( ibdev );
533 * Enable/disable interrupts on IPoIB network device
535 * @v netdev Network device
536 * @v enable Interrupts should be enabled
538 static void ipoib_irq ( struct net_device *netdev __unused,
539 int enable __unused ) {
540 /* No implementation */
544 * Handle IPv4 broadcast multicast group join completion
546 * @v ibdev Infiniband device
548 * @v membership Multicast group membership
550 * @v mad Response MAD (or NULL on error)
552 void ipoib_join_complete ( struct ib_device *ibdev __unused,
553 struct ib_queue_pair *qp __unused,
554 struct ib_mc_membership *membership, int rc,
555 union ib_mad *mad __unused ) {
556 struct ipoib_device *ipoib = container_of ( membership,
557 struct ipoib_device, broadcast_membership );
559 /* Record join status as link status */
560 netdev_link_err ( ipoib->netdev, rc );
564 * Join IPv4 broadcast multicast group
566 * @v ipoib IPoIB device
567 * @ret rc Return status code
569 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
572 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
573 &ipoib->broadcast_membership,
574 &ipoib->broadcast.gid,
575 ipoib_join_complete ) ) != 0 ) {
576 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
577 ipoib, strerror ( rc ) );
580 ipoib->broadcast_joined = 1;
586 * Leave IPv4 broadcast multicast group
588 * @v ipoib IPoIB device
590 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
592 if ( ipoib->broadcast_joined ) {
593 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
594 &ipoib->broadcast_membership );
595 ipoib->broadcast_joined = 0;
600 * Open IPoIB network device
602 * @v netdev Network device
603 * @ret rc Return status code
605 static int ipoib_open ( struct net_device *netdev ) {
606 struct ipoib_device *ipoib = netdev->priv;
607 struct ib_device *ibdev = ipoib->ibdev;
608 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
612 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
613 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
614 ipoib, strerror ( rc ) );
618 /* Allocate completion queue */
619 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
621 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
627 /* Allocate queue pair */
628 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD,
629 IPOIB_NUM_SEND_WQES, ipoib->cq,
630 IPOIB_NUM_RECV_WQES, ipoib->cq );
632 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
637 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
639 /* Update MAC address with QPN */
640 mac->qpn = htonl ( ipoib->qp->qpn );
642 /* Fill receive rings */
643 ib_refill_recv ( ibdev, ipoib->qp );
645 /* Fake a link status change to join the broadcast group */
646 ipoib_link_state_changed ( ibdev );
650 ib_destroy_qp ( ibdev, ipoib->qp );
652 ib_destroy_cq ( ibdev, ipoib->cq );
660 * Close IPoIB network device
662 * @v netdev Network device
664 static void ipoib_close ( struct net_device *netdev ) {
665 struct ipoib_device *ipoib = netdev->priv;
666 struct ib_device *ibdev = ipoib->ibdev;
667 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
669 /* Leave broadcast group */
670 ipoib_leave_broadcast_group ( ipoib );
672 /* Remove QPN from MAC address */
675 /* Tear down the queues */
676 ib_destroy_qp ( ibdev, ipoib->qp );
677 ib_destroy_cq ( ibdev, ipoib->cq );
679 /* Close IB device */
683 /** IPoIB network device operations */
684 static struct net_device_operations ipoib_operations = {
686 .close = ipoib_close,
687 .transmit = ipoib_transmit,
693 * Handle link status change
695 * @v ibdev Infiniband device
697 void ipoib_link_state_changed ( struct ib_device *ibdev ) {
698 struct net_device *netdev = ib_get_ownerdata ( ibdev );
699 struct ipoib_device *ipoib = netdev->priv;
700 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
703 /* Leave existing broadcast group */
704 ipoib_leave_broadcast_group ( ipoib );
706 /* Update MAC address based on potentially-new GID prefix */
707 memcpy ( &mac->gid.u.half[0], &ibdev->gid.u.half[0],
708 sizeof ( mac->gid.u.half[0] ) );
710 /* Update broadcast GID based on potentially-new partition key */
711 ipoib->broadcast.gid.u.words[2] =
712 htons ( ibdev->pkey | IB_PKEY_FULL );
714 /* Set net device link state to reflect Infiniband link state */
715 rc = ib_link_rc ( ibdev );
716 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
718 /* Join new broadcast group */
719 if ( ib_link_ok ( ibdev ) &&
720 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
721 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
722 "%s\n", ipoib, strerror ( rc ) );
723 netdev_link_err ( netdev, rc );
731 * @v ibdev Infiniband device
732 * @ret rc Return status code
734 int ipoib_probe ( struct ib_device *ibdev ) {
735 struct net_device *netdev;
736 struct ipoib_device *ipoib;
739 /* Allocate network device */
740 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
743 netdev_init ( netdev, &ipoib_operations );
744 ipoib = netdev->priv;
745 ib_set_ownerdata ( ibdev, netdev );
746 netdev->dev = ibdev->dev;
747 memset ( ipoib, 0, sizeof ( *ipoib ) );
748 ipoib->netdev = netdev;
749 ipoib->ibdev = ibdev;
751 /* Extract hardware address */
752 memcpy ( netdev->hw_addr, &ibdev->gid.u.half[1],
753 sizeof ( ibdev->gid.u.half[1] ) );
755 /* Set default broadcast address */
756 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
757 sizeof ( ipoib->broadcast ) );
758 netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast );
760 /* Register network device */
761 if ( ( rc = register_netdev ( netdev ) ) != 0 )
762 goto err_register_netdev;
767 netdev_nullify ( netdev );
768 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Unregisters and releases the network device recorded as the
 * Infiniband device's owner data.
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev = ib_get_ownerdata ( ibdev );

	unregister_netdev ( netdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}