2 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or any later version.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 FILE_LICENCE ( GPL2_OR_LATER );
27 #include <gpxe/if_arp.h>
28 #include <gpxe/iobuf.h>
29 #include <gpxe/netdevice.h>
30 #include <gpxe/infiniband.h>
31 #include <gpxe/ib_pathrec.h>
32 #include <gpxe/ib_mcast.h>
33 #include <gpxe/ipoib.h>
/** Number of work queue entries on the IPoIB send queue */
#define IPOIB_NUM_SEND_WQES 2

/** Number of work queue entries on the IPoIB receive queue */
#define IPOIB_NUM_RECV_WQES 4

/** Number of entries in the IPoIB completion queue */
#define IPOIB_NUM_CQES 8
49 /** An IPoIB device */
52 struct net_device *netdev;
53 /** Underlying Infiniband device */
54 struct ib_device *ibdev;
55 /** Completion queue */
56 struct ib_completion_queue *cq;
58 struct ib_queue_pair *qp;
60 struct ipoib_mac broadcast;
61 /** Joined to multicast group
63 * This flag indicates whether or not we have initiated the
64 * join to the IPv4 multicast group.
69 /** Broadcast IPoIB address */
70 static struct ipoib_mac ipoib_broadcast = {
71 .qpn = htonl ( IB_QPN_BROADCAST ),
72 .gid.u.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
73 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
76 /****************************************************************************
80 ****************************************************************************
86 * The IPoIB link-layer header is only four bytes long and so does not
87 * have sufficient room to store IPoIB MAC address(es). We therefore
88 * maintain a cache of MAC addresses identified by a single-byte key,
89 * and abuse the spare two bytes within the link-layer header to
90 * communicate these MAC addresses between the link-layer code and the
100 /** Number of IPoIB peer cache entries
102 * Must be a power of two.
104 #define IPOIB_NUM_CACHED_PEERS 4
106 /** IPoIB peer address cache */
107 static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
109 /** Oldest IPoIB peer cache entry index */
110 static unsigned int ipoib_peer_cache_idx = 1;
113 * Look up cached peer by key
115 * @v key Peer cache key
116 * @ret peer Peer cache entry, or NULL
118 static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) {
119 struct ipoib_peer *peer;
122 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
123 peer = &ipoib_peer_cache[i];
124 if ( peer->key == key )
129 DBG ( "IPoIB warning: peer cache lost track of key %x while "
130 "still in use\n", key );
136 * Store GID and QPN in peer cache
140 * @ret peer Peer cache entry
142 static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) {
143 struct ipoib_peer *peer;
147 /* Look for existing cache entry */
148 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
149 peer = &ipoib_peer_cache[i];
150 if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 )
154 /* No entry found: create a new one */
155 key = ipoib_peer_cache_idx++;
156 peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ];
158 DBG ( "IPoIB peer %x evicted from cache\n", peer->key );
160 memset ( peer, 0, sizeof ( *peer ) );
162 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
163 DBG ( "IPoIB peer %x has MAC %s\n",
164 peer->key, ipoib_ntoa ( &peer->mac ) );
168 /****************************************************************************
172 ****************************************************************************
176 * Add IPoIB link-layer header
178 * @v netdev Network device
179 * @v iobuf I/O buffer
180 * @v ll_dest Link-layer destination address
181 * @v ll_source Source link-layer address
182 * @v net_proto Network-layer protocol, in network-byte order
183 * @ret rc Return status code
185 static int ipoib_push ( struct net_device *netdev __unused,
186 struct io_buffer *iobuf, const void *ll_dest,
187 const void *ll_source __unused, uint16_t net_proto ) {
188 struct ipoib_hdr *ipoib_hdr =
189 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
190 const struct ipoib_mac *dest_mac = ll_dest;
191 const struct ipoib_mac *src_mac = ll_source;
192 struct ipoib_peer *dest;
193 struct ipoib_peer *src;
195 /* Add link-layer addresses to cache */
196 dest = ipoib_cache_peer ( dest_mac );
197 src = ipoib_cache_peer ( src_mac );
199 /* Build IPoIB header */
200 ipoib_hdr->proto = net_proto;
201 ipoib_hdr->u.peer.dest = dest->key;
202 ipoib_hdr->u.peer.src = src->key;
208 * Remove IPoIB link-layer header
210 * @v netdev Network device
211 * @v iobuf I/O buffer
212 * @ret ll_dest Link-layer destination address
213 * @ret ll_source Source link-layer address
214 * @ret net_proto Network-layer protocol, in network-byte order
215 * @ret rc Return status code
217 static int ipoib_pull ( struct net_device *netdev,
218 struct io_buffer *iobuf, const void **ll_dest,
219 const void **ll_source, uint16_t *net_proto ) {
220 struct ipoib_device *ipoib = netdev->priv;
221 struct ipoib_hdr *ipoib_hdr = iobuf->data;
222 struct ipoib_peer *dest;
223 struct ipoib_peer *source;
226 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
227 DBG ( "IPoIB packet too short for link-layer header\n" );
228 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
232 /* Strip off IPoIB header */
233 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
235 /* Identify source and destination addresses, and clear
236 * reserved word in IPoIB header
238 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
239 source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src );
240 ipoib_hdr->u.reserved = 0;
242 /* Fill in required fields */
243 *ll_dest = ( dest ? &dest->mac : &ipoib->broadcast );
244 *ll_source = ( source ? &source->mac : &ipoib->broadcast );
245 *net_proto = ipoib_hdr->proto;
251 * Transcribe IPoIB address
253 * @v ll_addr Link-layer address
254 * @ret string Link-layer address in human-readable format
256 const char * ipoib_ntoa ( const void *ll_addr ) {
258 const struct ipoib_mac *mac = ll_addr;
260 snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x",
261 htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
262 htonl ( mac->gid.u.dwords[1] ),
263 htonl ( mac->gid.u.dwords[2] ),
264 htonl ( mac->gid.u.dwords[3] ) );
269 * Hash multicast address
271 * @v af Address family
272 * @v net_addr Network-layer address
273 * @v ll_addr Link-layer address to fill in
274 * @ret rc Return status code
276 static int ipoib_mc_hash ( unsigned int af __unused,
277 const void *net_addr __unused,
278 void *ll_addr __unused ) {
283 /** IPoIB protocol */
284 struct ll_protocol ipoib_protocol __ll_protocol = {
286 .ll_proto = htons ( ARPHRD_INFINIBAND ),
287 .ll_addr_len = IPOIB_ALEN,
288 .ll_header_len = IPOIB_HLEN,
292 .mc_hash = ipoib_mc_hash,
296 * Allocate IPoIB device
298 * @v priv_size Size of driver private data
299 * @ret netdev Network device, or NULL
301 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
302 struct net_device *netdev;
304 netdev = alloc_netdev ( priv_size );
306 netdev->ll_protocol = &ipoib_protocol;
307 netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast;
308 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
313 /****************************************************************************
315 * IPoIB network device
317 ****************************************************************************
321 * Transmit packet via IPoIB network device
323 * @v netdev Network device
324 * @v iobuf I/O buffer
325 * @ret rc Return status code
327 static int ipoib_transmit ( struct net_device *netdev,
328 struct io_buffer *iobuf ) {
329 struct ipoib_device *ipoib = netdev->priv;
330 struct ib_device *ibdev = ipoib->ibdev;
331 struct ipoib_hdr *ipoib_hdr;
332 struct ipoib_peer *dest;
333 struct ib_address_vector av;
337 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
338 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
341 ipoib_hdr = iobuf->data;
343 /* Attempting transmission while link is down will put the
344 * queue pair into an error state, so don't try it.
346 if ( ! ib_link_ok ( ibdev ) )
349 /* Identify destination address */
350 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
353 ipoib_hdr->u.reserved = 0;
355 /* Construct address vector */
356 memset ( &av, 0, sizeof ( av ) );
357 av.qpn = ntohl ( dest->mac.qpn );
359 memcpy ( &av.gid, &dest->mac.gid, sizeof ( av.gid ) );
360 if ( ( rc = ib_resolve_path ( ibdev, &av ) ) != 0 ) {
361 /* Path not resolved yet */
365 return ib_post_send ( ibdev, ipoib->qp, &av, iobuf );
369 * Handle IPoIB send completion
371 * @v ibdev Infiniband device
373 * @v iobuf I/O buffer
374 * @v rc Completion status code
376 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
377 struct ib_queue_pair *qp,
378 struct io_buffer *iobuf, int rc ) {
379 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
381 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
385 * Handle IPoIB receive completion
387 * @v ibdev Infiniband device
389 * @v av Address vector, or NULL
390 * @v iobuf I/O buffer
391 * @v rc Completion status code
393 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
394 struct ib_queue_pair *qp,
395 struct ib_address_vector *av,
396 struct io_buffer *iobuf, int rc ) {
397 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
398 struct net_device *netdev = ipoib->netdev;
399 struct ipoib_hdr *ipoib_hdr;
400 struct ipoib_mac ll_src;
401 struct ipoib_peer *src;
404 netdev_rx_err ( netdev, iobuf, rc );
409 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
410 DBGC ( ipoib, "IPoIB %p received packet too short to "
411 "contain IPoIB header\n", ipoib );
412 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
413 netdev_rx_err ( netdev, iobuf, -EIO );
416 ipoib_hdr = iobuf->data;
418 /* Parse source address */
419 if ( av->gid_present ) {
420 ll_src.qpn = htonl ( av->qpn );
421 memcpy ( &ll_src.gid, &av->gid, sizeof ( ll_src.gid ) );
422 src = ipoib_cache_peer ( &ll_src );
423 ipoib_hdr->u.peer.src = src->key;
426 /* Hand off to network layer */
427 netdev_rx ( netdev, iobuf );
430 /** IPoIB completion operations */
431 static struct ib_completion_queue_operations ipoib_cq_op = {
432 .complete_send = ipoib_complete_send,
433 .complete_recv = ipoib_complete_recv,
437 * Poll IPoIB network device
439 * @v netdev Network device
441 static void ipoib_poll ( struct net_device *netdev ) {
442 struct ipoib_device *ipoib = netdev->priv;
443 struct ib_device *ibdev = ipoib->ibdev;
445 ib_poll_eq ( ibdev );
449 * Enable/disable interrupts on IPoIB network device
451 * @v netdev Network device
452 * @v enable Interrupts should be enabled
454 static void ipoib_irq ( struct net_device *netdev __unused,
455 int enable __unused ) {
456 /* No implementation */
460 * Join IPv4 broadcast multicast group
462 * @v ipoib IPoIB device
463 * @ret rc Return status code
465 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
468 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp,
469 &ipoib->broadcast.gid ) ) != 0 ) {
470 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
471 ipoib, strerror ( rc ) );
474 ipoib->broadcast_joined = 1;
480 * Leave IPv4 broadcast multicast group
482 * @v ipoib IPoIB device
484 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
486 if ( ipoib->broadcast_joined ) {
487 ib_mcast_leave ( ipoib->ibdev, ipoib->qp,
488 &ipoib->broadcast.gid );
489 ipoib->broadcast_joined = 0;
494 * Open IPoIB network device
496 * @v netdev Network device
497 * @ret rc Return status code
499 static int ipoib_open ( struct net_device *netdev ) {
500 struct ipoib_device *ipoib = netdev->priv;
501 struct ib_device *ibdev = ipoib->ibdev;
502 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
506 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
507 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
508 ipoib, strerror ( rc ) );
512 /* Allocate completion queue */
513 ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op );
515 DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n",
521 /* Allocate queue pair */
522 ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD,
523 IPOIB_NUM_SEND_WQES, ipoib->cq,
524 IPOIB_NUM_RECV_WQES, ipoib->cq, 0 );
526 DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n",
531 ib_qp_set_ownerdata ( ipoib->qp, ipoib );
533 /* Update MAC address with QPN */
534 mac->qpn = htonl ( ipoib->qp->qpn );
536 /* Fill receive rings */
537 ib_refill_recv ( ibdev, ipoib->qp );
539 /* Fake a link status change to join the broadcast group */
540 ipoib_link_state_changed ( ibdev );
544 ib_destroy_qp ( ibdev, ipoib->qp );
546 ib_destroy_cq ( ibdev, ipoib->cq );
554 * Close IPoIB network device
556 * @v netdev Network device
558 static void ipoib_close ( struct net_device *netdev ) {
559 struct ipoib_device *ipoib = netdev->priv;
560 struct ib_device *ibdev = ipoib->ibdev;
561 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
563 /* Leave broadcast group */
564 ipoib_leave_broadcast_group ( ipoib );
566 /* Remove QPN from MAC address */
569 /* Tear down the queues */
570 ib_destroy_qp ( ibdev, ipoib->qp );
571 ib_destroy_cq ( ibdev, ipoib->cq );
573 /* Close IB device */
577 /** IPoIB network device operations */
578 static struct net_device_operations ipoib_operations = {
580 .close = ipoib_close,
581 .transmit = ipoib_transmit,
587 * Update IPoIB dynamic Infiniband parameters
589 * @v ipoib IPoIB device
591 * The Infiniband port GID and partition key will change at runtime,
592 * when the link is established (or lost). The MAC address is based
593 * on the port GID, and the broadcast GID is based on the partition
594 * key. This function recalculates these IPoIB device parameters.
596 static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
597 struct ib_device *ibdev = ipoib->ibdev;
598 struct net_device *netdev = ipoib->netdev;
599 struct ipoib_mac *mac;
601 /* Calculate GID portion of MAC address based on port GID */
602 mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
603 memcpy ( &mac->gid, &ibdev->gid, sizeof ( mac->gid ) );
605 /* Calculate broadcast GID based on partition key */
606 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
607 sizeof ( ipoib->broadcast ) );
608 ipoib->broadcast.gid.u.words[2] = htons ( ibdev->pkey );
610 /* Set net device link state to reflect Infiniband link state */
611 if ( ib_link_ok ( ibdev ) ) {
612 netdev_link_up ( netdev );
614 netdev_link_down ( netdev );
619 * Handle link status change
621 * @v ibdev Infiniband device
623 void ipoib_link_state_changed ( struct ib_device *ibdev ) {
624 struct net_device *netdev = ib_get_ownerdata ( ibdev );
625 struct ipoib_device *ipoib = netdev->priv;
628 /* Leave existing broadcast group */
629 ipoib_leave_broadcast_group ( ipoib );
631 /* Update MAC address and broadcast GID based on new port GID
634 ipoib_set_ib_params ( ipoib );
636 /* Join new broadcast group */
637 if ( ib_link_ok ( ibdev ) &&
638 ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) {
639 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
640 "%s\n", ipoib, strerror ( rc ) );
648 * @v ibdev Infiniband device
649 * @ret rc Return status code
651 int ipoib_probe ( struct ib_device *ibdev ) {
652 struct net_device *netdev;
653 struct ipoib_device *ipoib;
656 /* Allocate network device */
657 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
660 netdev_init ( netdev, &ipoib_operations );
661 ipoib = netdev->priv;
662 ib_set_ownerdata ( ibdev, netdev );
663 netdev->dev = ibdev->dev;
664 netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast );
665 memset ( ipoib, 0, sizeof ( *ipoib ) );
666 ipoib->netdev = netdev;
667 ipoib->ibdev = ibdev;
669 /* Calculate as much of the broadcast GID and the MAC address
670 * as we can. We won't know either of these in full until we
673 ipoib_set_ib_params ( ipoib );
675 /* Register network device */
676 if ( ( rc = register_netdev ( netdev ) ) != 0 )
677 goto err_register_netdev;
682 netdev_nullify ( netdev );
683 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Unregisters the network device recorded as the Infiniband device's
 * owner data, then nullifies it and drops the reference to it.
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *owner = ib_get_ownerdata ( ibdev );

	unregister_netdev ( owner );
	netdev_nullify ( owner );
	netdev_put ( owner );
}