/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/* Licence declaration for this file */
FILE_LICENCE ( GPL2_OR_LATER );
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <byteswap.h>
#include <errno.h>
#include <gpxe/errortab.h>
#include <gpxe/if_arp.h>
#include <gpxe/iobuf.h>
#include <gpxe/netdevice.h>
#include <gpxe/infiniband.h>
#include <gpxe/ib_pathrec.h>
#include <gpxe/ib_mcast.h>
#include <gpxe/ipoib.h>
/** Number of IPoIB send work queue entries */
#define IPOIB_NUM_SEND_WQES 2

/** Number of IPoIB receive work queue entries */
#define IPOIB_NUM_RECV_WQES 4

/** Number of IPoIB completion entries */
#define IPOIB_NUM_CQES 8
50 /** An IPoIB device */
53 struct net_device *netdev;
54 /** Underlying Infiniband device */
55 struct ib_device *ibdev;
56 /** Completion queue */
57 struct ib_completion_queue *cq;
59 struct ib_queue_pair *qp;
61 struct ipoib_mac broadcast;
62 /** Joined to IPv4 broadcast multicast group
64 * This flag indicates whether or not we have initiated the
65 * join to the IPv4 broadcast multicast group.
68 /** IPv4 broadcast multicast group membership */
69 struct ib_mc_membership broadcast_membership;
72 /** Broadcast IPoIB address */
73 static struct ipoib_mac ipoib_broadcast = {
74 .flags__qpn = htonl ( IB_QPN_BROADCAST ),
75 .gid.u.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
76 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
79 /** Link status for "broadcast join in progress" */
80 #define EINPROGRESS_JOINING ( EINPROGRESS | EUNIQ_01 )
82 /** Human-readable message for the link status */
83 struct errortab ipoib_errors[] __errortab = {
84 { EINPROGRESS_JOINING, "Joining" },
87 /****************************************************************************
91 ****************************************************************************
97 * The IPoIB link-layer header is only four bytes long and so does not
98 * have sufficient room to store IPoIB MAC address(es). We therefore
99 * maintain a cache of MAC addresses identified by a single-byte key,
100 * and abuse the spare two bytes within the link-layer header to
101 * communicate these MAC addresses between the link-layer code and the
108 struct ipoib_mac mac;
111 /** Number of IPoIB peer cache entries
113 * Must be a power of two.
115 #define IPOIB_NUM_CACHED_PEERS 4
117 /** IPoIB peer address cache */
118 static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
120 /** Oldest IPoIB peer cache entry index */
121 static unsigned int ipoib_peer_cache_idx = 1;
124 * Look up cached peer by key
126 * @v key Peer cache key
127 * @ret peer Peer cache entry, or NULL
129 static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) {
130 struct ipoib_peer *peer;
133 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
134 peer = &ipoib_peer_cache[i];
135 if ( peer->key == key )
140 DBG ( "IPoIB warning: peer cache lost track of key %x while "
141 "still in use\n", key );
147 * Store GID and QPN in peer cache
149 * @v mac Peer MAC address
150 * @ret peer Peer cache entry
152 static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) {
153 struct ipoib_peer *peer;
157 /* Look for existing cache entry */
158 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
159 peer = &ipoib_peer_cache[i];
160 if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 )
164 /* No entry found: create a new one */
165 key = ipoib_peer_cache_idx++;
166 peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ];
168 DBG ( "IPoIB peer %x evicted from cache\n", peer->key );
170 memset ( peer, 0, sizeof ( *peer ) );
172 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
173 DBG ( "IPoIB peer %x has MAC %s\n",
174 peer->key, ipoib_ntoa ( &peer->mac ) );
178 /****************************************************************************
182 ****************************************************************************
186 * Add IPoIB link-layer header
188 * @v netdev Network device
189 * @v iobuf I/O buffer
190 * @v ll_dest Link-layer destination address
191 * @v ll_source Source link-layer address
192 * @v net_proto Network-layer protocol, in network-byte order
193 * @ret rc Return status code
195 static int ipoib_push ( struct net_device *netdev __unused,
196 struct io_buffer *iobuf, const void *ll_dest,
197 const void *ll_source __unused, uint16_t net_proto ) {
198 struct ipoib_hdr *ipoib_hdr =
199 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
200 const struct ipoib_mac *dest_mac = ll_dest;
201 const struct ipoib_mac *src_mac = ll_source;
202 struct ipoib_peer *dest;
203 struct ipoib_peer *src;
205 /* Add link-layer addresses to cache */
206 dest = ipoib_cache_peer ( dest_mac );
207 src = ipoib_cache_peer ( src_mac );
209 /* Build IPoIB header */
210 ipoib_hdr->proto = net_proto;
211 ipoib_hdr->u.peer.dest = dest->key;
212 ipoib_hdr->u.peer.src = src->key;
218 * Remove IPoIB link-layer header
220 * @v netdev Network device
221 * @v iobuf I/O buffer
222 * @ret ll_dest Link-layer destination address
223 * @ret ll_source Source link-layer address
224 * @ret net_proto Network-layer protocol, in network-byte order
225 * @ret rc Return status code
227 static int ipoib_pull ( struct net_device *netdev,
228 struct io_buffer *iobuf, const void **ll_dest,
229 const void **ll_source, uint16_t *net_proto ) {
230 struct ipoib_device *ipoib = netdev->priv;
231 struct ipoib_hdr *ipoib_hdr = iobuf->data;
232 struct ipoib_peer *dest;
233 struct ipoib_peer *source;
236 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
237 DBG ( "IPoIB packet too short for link-layer header\n" );
238 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
242 /* Strip off IPoIB header */
243 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
245 /* Identify source and destination addresses, and clear
246 * reserved word in IPoIB header
248 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
249 source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src );
250 ipoib_hdr->u.reserved = 0;
252 /* Fill in required fields */
253 *ll_dest = ( dest ? &dest->mac : &ipoib->broadcast );
254 *ll_source = ( source ? &source->mac : &ipoib->broadcast );
255 *net_proto = ipoib_hdr->proto;
261 * Initialise IPoIB link-layer address
263 * @v hw_addr Hardware address
264 * @v ll_addr Link-layer address
266 static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) {
267 const struct ib_gid_half *guid = hw_addr;
268 struct ipoib_mac *mac = ll_addr;
270 memset ( mac, 0, sizeof ( *mac ) );
271 memcpy ( &mac->gid.u.half[1], guid, sizeof ( mac->gid.u.half[1] ) );
275 * Transcribe IPoIB link-layer address
277 * @v ll_addr Link-layer address
278 * @ret string Link-layer address in human-readable format
280 const char * ipoib_ntoa ( const void *ll_addr ) {
282 const struct ipoib_mac *mac = ll_addr;
284 snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x",
285 htonl ( mac->flags__qpn ), htonl ( mac->gid.u.dwords[0] ),
286 htonl ( mac->gid.u.dwords[1] ),
287 htonl ( mac->gid.u.dwords[2] ),
288 htonl ( mac->gid.u.dwords[3] ) );
293 * Hash multicast address
295 * @v af Address family
296 * @v net_addr Network-layer address
297 * @v ll_addr Link-layer address to fill in
298 * @ret rc Return status code
300 static int ipoib_mc_hash ( unsigned int af __unused,
301 const void *net_addr __unused,
302 void *ll_addr __unused ) {
308 * Generate Mellanox Ethernet-compatible compressed link-layer address
310 * @v ll_addr Link-layer address
311 * @v eth_addr Ethernet-compatible address to fill in
313 static int ipoib_mlx_eth_addr ( const struct ib_gid_half *guid,
314 uint8_t *eth_addr ) {
315 eth_addr[0] = ( ( guid->u.bytes[3] == 2 ) ? 0x00 : 0x02 );
316 eth_addr[1] = guid->u.bytes[1];
317 eth_addr[2] = guid->u.bytes[2];
318 eth_addr[3] = guid->u.bytes[5];
319 eth_addr[4] = guid->u.bytes[6];
320 eth_addr[5] = guid->u.bytes[7];
/** An IPoIB Ethernet-compatible compressed link-layer address generator */
struct ipoib_eth_addr_handler {
	/** GUID byte 1 */
	uint8_t byte1;
	/** GUID byte 2 */
	uint8_t byte2;
	/** Handler
	 *
	 * @v guid		GUID
	 * @v eth_addr		Ethernet-compatible address to fill in
	 * @ret rc		Return status code
	 */
	int ( * eth_addr ) ( const struct ib_gid_half *guid,
			     uint8_t *eth_addr );
};
335 /** IPoIB Ethernet-compatible compressed link-layer address generators */
336 static struct ipoib_eth_addr_handler ipoib_eth_addr_handlers[] = {
337 { 0x02, 0xc9, ipoib_mlx_eth_addr },
341 * Generate Ethernet-compatible compressed link-layer address
343 * @v ll_addr Link-layer address
344 * @v eth_addr Ethernet-compatible address to fill in
346 static int ipoib_eth_addr ( const void *ll_addr, void *eth_addr ) {
347 const struct ipoib_mac *ipoib_addr = ll_addr;
348 const struct ib_gid_half *guid = &ipoib_addr->gid.u.half[1];
349 struct ipoib_eth_addr_handler *handler;
352 for ( i = 0 ; i < ( sizeof ( ipoib_eth_addr_handlers ) /
353 sizeof ( ipoib_eth_addr_handlers[0] ) ) ; i++ ) {
354 handler = &ipoib_eth_addr_handlers[i];
355 if ( ( handler->byte1 == guid->u.bytes[1] ) &&
356 ( handler->byte2 == guid->u.bytes[2] ) ) {
357 return handler->eth_addr ( guid, eth_addr );
363 /** IPoIB protocol */
364 struct ll_protocol ipoib_protocol __ll_protocol = {
366 .ll_proto = htons ( ARPHRD_INFINIBAND ),
367 .hw_addr_len = sizeof ( struct ib_gid_half ),
368 .ll_addr_len = IPOIB_ALEN,
369 .ll_header_len = IPOIB_HLEN,
372 .init_addr = ipoib_init_addr,
374 .mc_hash = ipoib_mc_hash,
375 .eth_addr = ipoib_eth_addr,
379 * Allocate IPoIB device
381 * @v priv_size Size of driver private data
382 * @ret netdev Network device, or NULL
384 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
385 struct net_device *netdev;
387 netdev = alloc_netdev ( priv_size );
389 netdev->ll_protocol = &ipoib_protocol;
390 netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast;
391 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
396 /****************************************************************************
398 * IPoIB network device
400 ****************************************************************************
404 * Transmit packet via IPoIB network device
406 * @v netdev Network device
407 * @v iobuf I/O buffer
408 * @ret rc Return status code
410 static int ipoib_transmit ( struct net_device *netdev,
411 struct io_buffer *iobuf ) {
412 struct ipoib_device *ipoib = netdev->priv;
413 struct ib_device *ibdev = ipoib->ibdev;
414 struct ipoib_hdr *ipoib_hdr;
415 struct ipoib_peer *dest;
416 struct ib_address_vector av;
420 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
421 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
424 ipoib_hdr = iobuf->data;
426 /* Attempting transmission while link is down will put the
427 * queue pair into an error state, so don't try it.
429 if ( ! ib_link_ok ( ibdev ) )
432 /* Identify destination address */
433 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
436 ipoib_hdr->u.reserved = 0;
438 /* Construct address vector */
439 memset ( &av, 0, sizeof ( av ) );
440 av.qpn = ( ntohl ( dest->mac.flags__qpn ) & IB_QPN_MASK );
442 memcpy ( &av.gid, &dest->mac.gid, sizeof ( av.gid ) );
443 if ( ( rc = ib_resolve_path ( ibdev, &av ) ) != 0 ) {
444 /* Path not resolved yet */
448 return ib_post_send ( ibdev, ipoib->qp, &av, iobuf );
452 * Handle IPoIB send completion
454 * @v ibdev Infiniband device
456 * @v iobuf I/O buffer
457 * @v rc Completion status code
459 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
460 struct ib_queue_pair *qp,
461 struct io_buffer *iobuf, int rc ) {
462 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
464 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
468 * Handle IPoIB receive completion
470 * @v ibdev Infiniband device
472 * @v av Address vector, or NULL
473 * @v iobuf I/O buffer
474 * @v rc Completion status code
476 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
477 struct ib_queue_pair *qp,
478 struct ib_address_vector *av,
479 struct io_buffer *iobuf, int rc ) {
480 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
481 struct net_device *netdev = ipoib->netdev;
482 struct ipoib_hdr *ipoib_hdr;
483 struct ipoib_mac ll_src;
484 struct ipoib_peer *src;
487 netdev_rx_err ( netdev, iobuf, rc );
492 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
493 DBGC ( ipoib, "IPoIB %p received packet too short to "
494 "contain IPoIB header\n", ipoib );
495 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
496 netdev_rx_err ( netdev, iobuf, -EIO );
499 ipoib_hdr = iobuf->data;
501 /* Parse source address */
502 if ( av->gid_present ) {
503 ll_src.flags__qpn = htonl ( av->qpn );
504 memcpy ( &ll_src.gid, &av->gid, sizeof ( ll_src.gid ) );
505 src = ipoib_cache_peer ( &ll_src );
506 ipoib_hdr->u.peer.src = src->key;
509 /* Hand off to network layer */
510 netdev_rx ( netdev, iobuf );
513 /** IPoIB completion operations */
514 static struct ib_completion_queue_operations ipoib_cq_op = {
515 .complete_send = ipoib_complete_send,
516 .complete_recv = ipoib_complete_recv,
520 * Poll IPoIB network device
522 * @v netdev Network device
524 static void ipoib_poll ( struct net_device *netdev ) {
525 struct ipoib_device *ipoib = netdev->priv;
526 struct ib_device *ibdev = ipoib->ibdev;
528 ib_poll_eq ( ibdev );
532 * Enable/disable interrupts on IPoIB network device
534 * @v netdev Network device
535 * @v enable Interrupts should be enabled
537 static void ipoib_irq ( struct net_device *netdev __unused,
538 int enable __unused ) {
539 /* No implementation */
543 * Handle IPv4 broadcast multicast group join completion
545 * @v ibdev Infiniband device
547 * @v membership Multicast group membership
549 * @v mad Response MAD (or NULL on error)
551 void ipoib_join_complete ( struct ib_device *ibdev __unused,
552 struct ib_queue_pair *qp __unused,
553 struct ib_mc_membership *membership, int rc,
554 union ib_mad *mad __unused ) {
555 struct ipoib_device *ipoib = container_of ( membership,
556 struct ipoib_device, broadcast_membership );
558 /* Record join status as link status */
559 netdev_link_err ( ipoib->netdev, rc );
563 * Join IPv4 broadcast multicast group
565 * @v ipoib IPoIB device
566 * @ret rc Return status code
568 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
571 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
572 &ipoib->broadcast_membership,
573 &ipoib->broadcast.gid,
574 ipoib_join_complete ) ) != 0 ) {
575 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
576 ipoib, strerror ( rc ) );
579 ipoib->broadcast_joined = 1;
585 * Leave IPv4 broadcast multicast group
587 * @v ipoib IPoIB device
589 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
591 if ( ipoib->broadcast_joined ) {
592 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
593 &ipoib->broadcast_membership );
594 ipoib->broadcast_joined = 0;
599 * Open IPoIB network device
601 * @v netdev Network device
602 * @ret rc Return status code
604 static int ipoib_open ( struct net_device *netdev ) {
605 struct ipoib_device *ipoib = netdev->priv;
606 struct ib_device *ibdev = ipoib->ibdev;
607 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
611 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
612 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
613 ipoib, strerror ( rc ) );
617 /* Allocate completion queue */
618 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
620 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
626 /* Allocate queue pair */
627 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD,
628 IPOIB_NUM_SEND_WQES, ipoib->cq,
629 IPOIB_NUM_RECV_WQES, ipoib->cq );
631 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
636 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
638 /* Update MAC address with QPN */
639 mac->flags__qpn = htonl ( ipoib->qp->qpn );
641 /* Fill receive rings */
642 ib_refill_recv ( ibdev, ipoib->qp );
644 /* Fake a link status change to join the broadcast group */
645 ipoib_link_state_changed ( ibdev );
649 ib_destroy_qp ( ibdev, ipoib->qp );
651 ib_destroy_cq ( ibdev, ipoib->cq );
659 * Close IPoIB network device
661 * @v netdev Network device
663 static void ipoib_close ( struct net_device *netdev ) {
664 struct ipoib_device *ipoib = netdev->priv;
665 struct ib_device *ibdev = ipoib->ibdev;
666 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
668 /* Leave broadcast group */
669 ipoib_leave_broadcast_group ( ipoib );
671 /* Remove QPN from MAC address */
674 /* Tear down the queues */
675 ib_destroy_qp ( ibdev, ipoib->qp );
676 ib_destroy_cq ( ibdev, ipoib->cq );
678 /* Close IB device */
682 /** IPoIB network device operations */
683 static struct net_device_operations ipoib_operations = {
685 .close = ipoib_close,
686 .transmit = ipoib_transmit,
692 * Handle link status change
694 * @v ibdev Infiniband device
696 void ipoib_link_state_changed ( struct ib_device *ibdev ) {
697 struct net_device *netdev = ib_get_ownerdata ( ibdev );
698 struct ipoib_device *ipoib = netdev->priv;
699 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
702 /* Leave existing broadcast group */
703 ipoib_leave_broadcast_group ( ipoib );
705 /* Update MAC address based on potentially-new GID prefix */
706 memcpy ( &mac->gid.u.half[0], &ibdev->gid.u.half[0],
707 sizeof ( mac->gid.u.half[0] ) );
709 /* Update broadcast GID based on potentially-new partition key */
710 ipoib->broadcast.gid.u.words[2] =
711 htons ( ibdev->pkey | IB_PKEY_FULL );
713 /* Set net device link state to reflect Infiniband link state */
714 rc = ib_link_rc ( ibdev );
715 netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) );
717 /* Join new broadcast group */
718 if ( ib_link_ok ( ibdev ) &&
719 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
720 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
721 "%s\n", ipoib, strerror ( rc ) );
722 netdev_link_err ( netdev, rc );
730 * @v ibdev Infiniband device
731 * @ret rc Return status code
733 int ipoib_probe ( struct ib_device *ibdev ) {
734 struct net_device *netdev;
735 struct ipoib_device *ipoib;
738 /* Allocate network device */
739 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
742 netdev_init ( netdev, &ipoib_operations );
743 ipoib = netdev->priv;
744 ib_set_ownerdata ( ibdev, netdev );
745 netdev->dev = ibdev->dev;
746 memset ( ipoib, 0, sizeof ( *ipoib ) );
747 ipoib->netdev = netdev;
748 ipoib->ibdev = ibdev;
750 /* Extract hardware address */
751 memcpy ( netdev->hw_addr, &ibdev->gid.u.half[1],
752 sizeof ( ibdev->gid.u.half[1] ) );
754 /* Set default broadcast address */
755 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
756 sizeof ( ipoib->broadcast ) );
757 netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast );
759 /* Register network device */
760 if ( ( rc = register_netdev ( netdev ) ) != 0 )
761 goto err_register_netdev;
766 netdev_nullify ( netdev );
767 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *netdev = ib_get_ownerdata ( ibdev );

	/* Unregister, then drop the reference to the network device */
	unregister_netdev ( netdev );
	netdev_nullify ( netdev );
	netdev_put ( netdev );
}