/*
 * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
19 FILE_LICENCE ( GPL2_OR_LATER );
27 #include <gpxe/if_arp.h>
28 #include <gpxe/iobuf.h>
29 #include <gpxe/netdevice.h>
30 #include <gpxe/infiniband.h>
31 #include <gpxe/ib_qset.h>
32 #include <gpxe/ib_pathrec.h>
33 #include <gpxe/ib_mcast.h>
34 #include <gpxe/ipoib.h>
/** Number of work queue entries on the IPoIB send queue */
#define IPOIB_NUM_SEND_WQES 2

/** Number of work queue entries on the IPoIB receive queue */
#define IPOIB_NUM_RECV_WQES 4

/** Number of entries in the IPoIB completion queue */
#define IPOIB_NUM_CQES 8
50 /** An IPoIB device */
53 struct net_device *netdev;
54 /** Underlying Infiniband device */
55 struct ib_device *ibdev;
57 struct ib_queue_set qset;
59 struct ipoib_mac broadcast;
60 /** Joined to multicast group
62 * This flag indicates whether or not we have initiated the
63 * join to the IPv4 multicast group.
68 /** Broadcast IPoIB address */
69 static struct ipoib_mac ipoib_broadcast = {
70 .qpn = htonl ( IB_QPN_BROADCAST ),
71 .gid.u.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
72 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff },
/****************************************************************************
 *
 * IPoIB peer cache
 *
 ****************************************************************************
 */
85 * The IPoIB link-layer header is only four bytes long and so does not
86 * have sufficient room to store IPoIB MAC address(es). We therefore
87 * maintain a cache of MAC addresses identified by a single-byte key,
88 * and abuse the spare two bytes within the link-layer header to
89 * communicate these MAC addresses between the link-layer code and the
99 /** Number of IPoIB peer cache entries
101 * Must be a power of two.
103 #define IPOIB_NUM_CACHED_PEERS 4
105 /** IPoIB peer address cache */
106 static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS];
108 /** Oldest IPoIB peer cache entry index */
109 static unsigned int ipoib_peer_cache_idx = 1;
112 * Look up cached peer by key
114 * @v key Peer cache key
115 * @ret peer Peer cache entry, or NULL
117 static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) {
118 struct ipoib_peer *peer;
121 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
122 peer = &ipoib_peer_cache[i];
123 if ( peer->key == key )
128 DBG ( "IPoIB warning: peer cache lost track of key %x while "
129 "still in use\n", key );
135 * Store GID and QPN in peer cache
139 * @ret peer Peer cache entry
141 static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) {
142 struct ipoib_peer *peer;
146 /* Look for existing cache entry */
147 for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) {
148 peer = &ipoib_peer_cache[i];
149 if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 )
153 /* No entry found: create a new one */
154 key = ipoib_peer_cache_idx++;
155 peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ];
157 DBG ( "IPoIB peer %x evicted from cache\n", peer->key );
159 memset ( peer, 0, sizeof ( *peer ) );
161 memcpy ( &peer->mac, mac, sizeof ( peer->mac ) );
162 DBG ( "IPoIB peer %x has MAC %s\n",
163 peer->key, ipoib_ntoa ( &peer->mac ) );
/****************************************************************************
 *
 * IPoIB link layer
 *
 ****************************************************************************
 */
175 * Add IPoIB link-layer header
177 * @v netdev Network device
178 * @v iobuf I/O buffer
179 * @v ll_dest Link-layer destination address
180 * @v ll_source Source link-layer address
181 * @v net_proto Network-layer protocol, in network-byte order
182 * @ret rc Return status code
184 static int ipoib_push ( struct net_device *netdev __unused,
185 struct io_buffer *iobuf, const void *ll_dest,
186 const void *ll_source __unused, uint16_t net_proto ) {
187 struct ipoib_hdr *ipoib_hdr =
188 iob_push ( iobuf, sizeof ( *ipoib_hdr ) );
189 const struct ipoib_mac *dest_mac = ll_dest;
190 const struct ipoib_mac *src_mac = ll_source;
191 struct ipoib_peer *dest;
192 struct ipoib_peer *src;
194 /* Add link-layer addresses to cache */
195 dest = ipoib_cache_peer ( dest_mac );
196 src = ipoib_cache_peer ( src_mac );
198 /* Build IPoIB header */
199 ipoib_hdr->proto = net_proto;
200 ipoib_hdr->u.peer.dest = dest->key;
201 ipoib_hdr->u.peer.src = src->key;
207 * Remove IPoIB link-layer header
209 * @v netdev Network device
210 * @v iobuf I/O buffer
211 * @ret ll_dest Link-layer destination address
212 * @ret ll_source Source link-layer address
213 * @ret net_proto Network-layer protocol, in network-byte order
214 * @ret rc Return status code
216 static int ipoib_pull ( struct net_device *netdev,
217 struct io_buffer *iobuf, const void **ll_dest,
218 const void **ll_source, uint16_t *net_proto ) {
219 struct ipoib_device *ipoib = netdev->priv;
220 struct ipoib_hdr *ipoib_hdr = iobuf->data;
221 struct ipoib_peer *dest;
222 struct ipoib_peer *source;
225 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
226 DBG ( "IPoIB packet too short for link-layer header\n" );
227 DBG_HD ( iobuf->data, iob_len ( iobuf ) );
231 /* Strip off IPoIB header */
232 iob_pull ( iobuf, sizeof ( *ipoib_hdr ) );
234 /* Identify source and destination addresses, and clear
235 * reserved word in IPoIB header
237 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
238 source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src );
239 ipoib_hdr->u.reserved = 0;
241 /* Fill in required fields */
242 *ll_dest = ( dest ? &dest->mac : &ipoib->broadcast );
243 *ll_source = ( source ? &source->mac : &ipoib->broadcast );
244 *net_proto = ipoib_hdr->proto;
250 * Transcribe IPoIB address
252 * @v ll_addr Link-layer address
253 * @ret string Link-layer address in human-readable format
255 const char * ipoib_ntoa ( const void *ll_addr ) {
257 const struct ipoib_mac *mac = ll_addr;
259 snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x",
260 htonl ( mac->qpn ), htonl ( mac->gid.u.dwords[0] ),
261 htonl ( mac->gid.u.dwords[1] ),
262 htonl ( mac->gid.u.dwords[2] ),
263 htonl ( mac->gid.u.dwords[3] ) );
268 * Hash multicast address
270 * @v af Address family
271 * @v net_addr Network-layer address
272 * @v ll_addr Link-layer address to fill in
273 * @ret rc Return status code
275 static int ipoib_mc_hash ( unsigned int af __unused,
276 const void *net_addr __unused,
277 void *ll_addr __unused ) {
282 /** IPoIB protocol */
283 struct ll_protocol ipoib_protocol __ll_protocol = {
285 .ll_proto = htons ( ARPHRD_INFINIBAND ),
286 .ll_addr_len = IPOIB_ALEN,
287 .ll_header_len = IPOIB_HLEN,
291 .mc_hash = ipoib_mc_hash,
295 * Allocate IPoIB device
297 * @v priv_size Size of driver private data
298 * @ret netdev Network device, or NULL
300 struct net_device * alloc_ipoibdev ( size_t priv_size ) {
301 struct net_device *netdev;
303 netdev = alloc_netdev ( priv_size );
305 netdev->ll_protocol = &ipoib_protocol;
306 netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast;
307 netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE;
/****************************************************************************
 *
 * IPoIB network device
 *
 ****************************************************************************
 */
320 * Transmit packet via IPoIB network device
322 * @v netdev Network device
323 * @v iobuf I/O buffer
324 * @ret rc Return status code
326 static int ipoib_transmit ( struct net_device *netdev,
327 struct io_buffer *iobuf ) {
328 struct ipoib_device *ipoib = netdev->priv;
329 struct ib_device *ibdev = ipoib->ibdev;
330 struct ipoib_hdr *ipoib_hdr;
331 struct ipoib_peer *dest;
332 struct ib_address_vector av;
336 if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) {
337 DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib );
340 ipoib_hdr = iobuf->data;
342 /* Attempting transmission while link is down will put the
343 * queue pair into an error state, so don't try it.
345 if ( ! ib_link_ok ( ibdev ) )
348 /* Identify destination address */
349 dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest );
352 ipoib_hdr->u.reserved = 0;
354 /* Construct address vector */
355 memset ( &av, 0, sizeof ( av ) );
356 av.qpn = ntohl ( dest->mac.qpn );
358 memcpy ( &av.gid, &dest->mac.gid, sizeof ( av.gid ) );
359 if ( ( rc = ib_resolve_path ( ibdev, &av ) ) != 0 ) {
360 /* Path not resolved yet */
364 return ib_post_send ( ibdev, ipoib->qset.qp, &av, iobuf );
368 * Handle IPoIB send completion
370 * @v ibdev Infiniband device
372 * @v iobuf I/O buffer
373 * @v rc Completion status code
375 static void ipoib_complete_send ( struct ib_device *ibdev __unused,
376 struct ib_queue_pair *qp,
377 struct io_buffer *iobuf, int rc ) {
378 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
380 netdev_tx_complete_err ( ipoib->netdev, iobuf, rc );
384 * Handle IPoIB receive completion
386 * @v ibdev Infiniband device
388 * @v av Address vector, or NULL
389 * @v iobuf I/O buffer
390 * @v rc Completion status code
392 static void ipoib_complete_recv ( struct ib_device *ibdev __unused,
393 struct ib_queue_pair *qp,
394 struct ib_address_vector *av,
395 struct io_buffer *iobuf, int rc ) {
396 struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp );
397 struct net_device *netdev = ipoib->netdev;
398 struct ipoib_hdr *ipoib_hdr;
399 struct ipoib_mac ll_src;
400 struct ipoib_peer *src;
403 netdev_rx_err ( netdev, iobuf, rc );
408 if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) {
409 DBGC ( ipoib, "IPoIB %p received packet too short to "
410 "contain IPoIB header\n", ipoib );
411 DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) );
412 netdev_rx_err ( netdev, iobuf, -EIO );
415 ipoib_hdr = iobuf->data;
417 /* Parse source address */
418 if ( av->gid_present ) {
419 ll_src.qpn = htonl ( av->qpn );
420 memcpy ( &ll_src.gid, &av->gid, sizeof ( ll_src.gid ) );
421 src = ipoib_cache_peer ( &ll_src );
422 ipoib_hdr->u.peer.src = src->key;
425 /* Hand off to network layer */
426 netdev_rx ( netdev, iobuf );
429 /** IPoIB completion operations */
430 static struct ib_completion_queue_operations ipoib_cq_op = {
431 .complete_send = ipoib_complete_send,
432 .complete_recv = ipoib_complete_recv,
436 * Poll IPoIB network device
438 * @v netdev Network device
440 static void ipoib_poll ( struct net_device *netdev ) {
441 struct ipoib_device *ipoib = netdev->priv;
442 struct ib_device *ibdev = ipoib->ibdev;
444 ib_poll_eq ( ibdev );
448 * Enable/disable interrupts on IPoIB network device
450 * @v netdev Network device
451 * @v enable Interrupts should be enabled
453 static void ipoib_irq ( struct net_device *netdev __unused,
454 int enable __unused ) {
455 /* No implementation */
459 * Join IPv4 broadcast multicast group
461 * @v ipoib IPoIB device
462 * @ret rc Return status code
464 static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) {
467 if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qset.qp,
468 &ipoib->broadcast.gid ) ) != 0 ) {
469 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
470 ipoib, strerror ( rc ) );
473 ipoib->broadcast_joined = 1;
479 * Leave IPv4 broadcast multicast group
481 * @v ipoib IPoIB device
483 static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) {
485 if ( ipoib->broadcast_joined ) {
486 ib_mcast_leave ( ipoib->ibdev, ipoib->qset.qp,
487 &ipoib->broadcast.gid );
488 ipoib->broadcast_joined = 0;
493 * Open IPoIB network device
495 * @v netdev Network device
496 * @ret rc Return status code
498 static int ipoib_open ( struct net_device *netdev ) {
499 struct ipoib_device *ipoib = netdev->priv;
500 struct ib_device *ibdev = ipoib->ibdev;
501 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
505 if ( ( rc = ib_open ( ibdev ) ) != 0 ) {
506 DBGC ( ipoib, "IPoIB %p could not open device: %s\n",
507 ipoib, strerror ( rc ) );
511 /* Allocate queue set */
512 if ( ( rc = ib_create_qset ( ibdev, &ipoib->qset, IPOIB_NUM_CQES,
513 &ipoib_cq_op, IPOIB_NUM_SEND_WQES,
514 IPOIB_NUM_RECV_WQES, 0 ) ) != 0 ) {
515 DBGC ( ipoib, "IPoIB %p could not allocate queue set: %s\n",
516 ipoib, strerror ( rc ) );
517 goto err_create_qset;
519 ib_qp_set_ownerdata ( ipoib->qset.qp, ipoib );
521 /* Update MAC address with QPN */
522 mac->qpn = htonl ( ipoib->qset.qp->qpn );
524 /* Fill receive rings */
525 ib_refill_recv ( ibdev, ipoib->qset.qp );
527 /* Join broadcast group */
528 if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
529 DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n",
530 ipoib, strerror ( rc ) );
531 goto err_join_broadcast;
537 ib_destroy_qset ( ibdev, &ipoib->qset );
545 * Close IPoIB network device
547 * @v netdev Network device
549 static void ipoib_close ( struct net_device *netdev ) {
550 struct ipoib_device *ipoib = netdev->priv;
551 struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
553 /* Leave broadcast group */
554 ipoib_leave_broadcast_group ( ipoib );
556 /* Remove QPN from MAC address */
559 /* Tear down the queues */
560 ib_destroy_qset ( ipoib->ibdev, &ipoib->qset );
562 /* Close IB device */
563 ib_close ( ipoib->ibdev );
566 /** IPoIB network device operations */
567 static struct net_device_operations ipoib_operations = {
569 .close = ipoib_close,
570 .transmit = ipoib_transmit,
576 * Update IPoIB dynamic Infiniband parameters
578 * @v ipoib IPoIB device
580 * The Infiniband port GID and partition key will change at runtime,
581 * when the link is established (or lost). The MAC address is based
582 * on the port GID, and the broadcast GID is based on the partition
583 * key. This function recalculates these IPoIB device parameters.
585 static void ipoib_set_ib_params ( struct ipoib_device *ipoib ) {
586 struct ib_device *ibdev = ipoib->ibdev;
587 struct net_device *netdev = ipoib->netdev;
588 struct ipoib_mac *mac;
590 /* Calculate GID portion of MAC address based on port GID */
591 mac = ( ( struct ipoib_mac * ) netdev->ll_addr );
592 memcpy ( &mac->gid, &ibdev->gid, sizeof ( mac->gid ) );
594 /* Calculate broadcast GID based on partition key */
595 memcpy ( &ipoib->broadcast, &ipoib_broadcast,
596 sizeof ( ipoib->broadcast ) );
597 ipoib->broadcast.gid.u.words[2] = htons ( ibdev->pkey );
599 /* Set net device link state to reflect Infiniband link state */
600 if ( ib_link_ok ( ibdev ) ) {
601 netdev_link_up ( netdev );
603 netdev_link_down ( netdev );
608 * Handle link status change
610 * @v ibdev Infiniband device
612 void ipoib_link_state_changed ( struct ib_device *ibdev ) {
613 struct net_device *netdev = ib_get_ownerdata ( ibdev );
614 struct ipoib_device *ipoib = netdev->priv;
617 /* Leave existing broadcast group */
618 ipoib_leave_broadcast_group ( ipoib );
620 /* Update MAC address and broadcast GID based on new port GID
623 ipoib_set_ib_params ( ipoib );
625 /* Join new broadcast group */
626 if ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) {
627 DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: "
628 "%s\n", ipoib, strerror ( rc ) );
636 * @v ibdev Infiniband device
637 * @ret rc Return status code
639 int ipoib_probe ( struct ib_device *ibdev ) {
640 struct net_device *netdev;
641 struct ipoib_device *ipoib;
644 /* Allocate network device */
645 netdev = alloc_ipoibdev ( sizeof ( *ipoib ) );
648 netdev_init ( netdev, &ipoib_operations );
649 ipoib = netdev->priv;
650 ib_set_ownerdata ( ibdev, netdev );
651 netdev->dev = ibdev->dev;
652 netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast );
653 memset ( ipoib, 0, sizeof ( *ipoib ) );
654 ipoib->netdev = netdev;
655 ipoib->ibdev = ibdev;
657 /* Calculate as much of the broadcast GID and the MAC address
658 * as we can. We won't know either of these in full until we
661 ipoib_set_ib_params ( ipoib );
663 /* Register network device */
664 if ( ( rc = register_netdev ( netdev ) ) != 0 )
665 goto err_register_netdev;
670 netdev_nullify ( netdev );
671 netdev_put ( netdev );
/**
 * Remove IPoIB device
 *
 * @v ibdev		Infiniband device
 *
 * Unregisters, nullifies and releases the network device recorded as
 * the Infiniband device's owner data.
 */
void ipoib_remove ( struct ib_device *ibdev ) {
	struct net_device *ipoib_netdev = ib_get_ownerdata ( ibdev );

	unregister_netdev ( ipoib_netdev );
	netdev_nullify ( ipoib_netdev );
	netdev_put ( ipoib_netdev );
}