[IBAL] Fix handling of stale connections.
author ftillier <ftillier@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Fri, 7 Apr 2006 04:22:58 +0000 (04:22 +0000)
committer ftillier <ftillier@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Fri, 7 Apr 2006 04:22:58 +0000 (04:22 +0000)
git-svn-id: svn://openib.tc.cornell.edu/gen1/trunk@290 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86

core/al/kernel/al_cm_cep.c

index 1667b4f..7e8e3d0 100644 (file)
@@ -456,6 +456,13 @@ static inline void
 __insert_timewait(\r
        IN                              kcep_t* const                           p_cep );\r
 \r
+/*\r
+ * Forward declaration so that __process_stale can call __cep_get_mad before\r
+ * its definition (the definition is not part of this patch view).\r
+ *\r
+ * As used by __process_stale: called with a CEP and a CM attribute ID, it\r
+ * returns a status and, when that status is IB_SUCCESS, hands back a MAD\r
+ * element through pp_mad whose buffer is then filled in by the caller.\r
+ * NOTE(review): pp_port_cep presumably receives the port agent the MAD is\r
+ * associated with - confirm against the definition.\r
+ */\r
+static ib_api_status_t\r
+__cep_get_mad(\r
+       IN                              kcep_t* const                           p_cep,\r
+       IN                              net16_t                                         attr_id,\r
+               OUT                     cep_agent_t** const                     pp_port_cep,\r
+               OUT                     ib_mad_element_t** const        pp_mad );\r
+\r
 static ib_api_status_t\r
 __cep_send_mad(\r
        IN                              cep_agent_t* const                      p_port_cep,\r
@@ -863,14 +870,155 @@ __repeat_mad(
 }\r
 \r
 \r
+/*\r
+ * Applies a CM REJ, carried in p_mad, to p_cep based on the CEP's state.\r
+ *\r
+ * Connection-establishment states: an outstanding send is cancelled for the\r
+ * states that fall through the cancel step, a MAD held in p_cep->p_mad is\r
+ * released when the state has CEP_STATE_PREP set, the CEP is removed with\r
+ * __remove_cep and its state becomes CEP_STATE_IDLE.\r
+ *\r
+ * Established, LAP and APR states: a MAD held in p_cep->p_mad is released\r
+ * when the state has CEP_STATE_PREP set, the state becomes\r
+ * CEP_STATE_TIMEWAIT and the CEP is passed to __insert_timewait.\r
+ *\r
+ * In both of those cases p_mad is passed to __cep_queue_mad and that call's\r
+ * status is returned.\r
+ *\r
+ * Any other state, or a REJ with reason IB_REJ_INVALID_SID received in\r
+ * CEP_STATE_REQ_SENT: p_mad is released with ib_put_mad and IB_NO_MATCH is\r
+ * returned; the CEP is left untouched.\r
+ *\r
+ * This function takes and releases no lock itself.\r
+ * NOTE(review): the callers visible in this patch invoke it before calling\r
+ * KeReleaseInStackQueuedSpinLockFromDpcLevel, so it presumably must be\r
+ * called with the CEP lock held - confirm.\r
+ */\r
+static ib_api_status_t\r
+__process_rej(\r
+       IN                              kcep_t* const                           p_cep,\r
+       IN                              ib_mad_element_t* const         p_mad )\r
+{\r
+       ib_api_status_t         status;\r
+       mad_cm_rej_t            *p_rej;\r
+\r
+       AL_ENTER( AL_DBG_CM );\r
+\r
+       ASSERT( p_cep );\r
+       ASSERT( p_mad );\r
+       ASSERT( p_mad->p_mad_buf );\r
+\r
+       p_rej = (mad_cm_rej_t*)p_mad->p_mad_buf;\r
+\r
+       switch( p_cep->state )\r
+       {\r
+       case CEP_STATE_REQ_SENT:\r
+               /*\r
+                * Ignore rejects with the status set to IB_REJ_INVALID_SID.  We will\r
+                * continue to retry (up to max_cm_retries) to connect to the remote\r
+                * side.  This is required to support peer-to-peer connections and\r
+                * clients that try to connect before the server comes up.\r
+                */\r
+               if( p_rej->reason == IB_REJ_INVALID_SID )\r
+               {\r
+                       AL_TRACE( AL_DBG_CM,\r
+                               ("Request rejected (invalid SID) - retrying.\n") );\r
+                       /* err1 sits inside the default case below: drop the REJ. */\r
+                       goto err1;\r
+               }\r
+\r
+               /* Fall through */\r
+       case CEP_STATE_REP_SENT:\r
+       case CEP_STATE_REQ_MRA_RCVD:\r
+       case CEP_STATE_REP_MRA_RCVD:\r
+               /* Cancel any outstanding MAD. */\r
+               if( p_cep->p_send_mad )\r
+               {\r
+                       ib_cancel_mad( p_cep->h_mad_svc, p_cep->p_send_mad );\r
+                       p_cep->p_send_mad = NULL;\r
+               }\r
+\r
+               /* Fall through */\r
+       case CEP_STATE_REQ_RCVD:\r
+       case CEP_STATE_REP_RCVD:\r
+       case CEP_STATE_REQ_MRA_SENT:\r
+       case CEP_STATE_REP_MRA_SENT:\r
+       case CEP_STATE_PRE_REP:\r
+       case CEP_STATE_PRE_REP_MRA_SENT:\r
+               /* With CEP_STATE_PREP set the CEP must hold a MAD; release it. */\r
+               if( p_cep->state & CEP_STATE_PREP )\r
+               {\r
+                       CL_ASSERT( p_cep->p_mad );\r
+                       ib_put_mad( p_cep->p_mad );\r
+                       p_cep->p_mad = NULL;\r
+               }\r
+               /* Abort connection establishment. No transition to timewait. */\r
+               __remove_cep( p_cep );\r
+               p_cep->state = CEP_STATE_IDLE;\r
+               break;\r
+\r
+       case CEP_STATE_ESTABLISHED:\r
+       case CEP_STATE_LAP_RCVD:\r
+       case CEP_STATE_LAP_SENT:\r
+       case CEP_STATE_LAP_MRA_RCVD:\r
+       case CEP_STATE_LAP_MRA_SENT:\r
+       case CEP_STATE_PRE_APR:\r
+       case CEP_STATE_PRE_APR_MRA_SENT:\r
+               /* With CEP_STATE_PREP set the CEP must hold a MAD; release it. */\r
+               if( p_cep->state & CEP_STATE_PREP )\r
+               {\r
+                       CL_ASSERT( p_cep->p_mad );\r
+                       ib_put_mad( p_cep->p_mad );\r
+                       p_cep->p_mad = NULL;\r
+               }\r
+               /* The state is set to timewait before the CEP is inserted. */\r
+               p_cep->state = CEP_STATE_TIMEWAIT;\r
+               __insert_timewait( p_cep );\r
+               break;\r
+\r
+       default:\r
+               /* Ignore the REJ. */\r
+               AL_TRACE( AL_DBG_CM, ("REJ received in invalid state.\n") );\r
+               /* Shared exit for ignored REJs: release the MAD, report no match. */\r
+err1:\r
+               ib_put_mad( p_mad );\r
+               AL_EXIT( AL_DBG_CM );\r
+               return IB_NO_MATCH;\r
+       }\r
+\r
+       /* The REJ changed the CEP's state: pass the MAD to __cep_queue_mad. */\r
+       status = __cep_queue_mad( p_cep, p_mad );\r
+\r
+       AL_EXIT( AL_DBG_CM );\r
+       return status;\r
+}\r
+\r
+\r
+/*\r
+ * Fails a local CEP that has been identified as a stale connection.\r
+ *\r
+ * Obtains a REJ MAD with __cep_get_mad, fills it in with reason\r
+ * IB_REJ_STALE_CONN and no ARI or private data, and runs it through\r
+ * __process_rej so the CEP goes through the same state handling as for a\r
+ * REJ received in a MAD.\r
+ *\r
+ * Returns the status from __cep_get_mad if that call fails, otherwise the\r
+ * status returned by __process_rej.\r
+ *\r
+ * NOTE(review): takes no lock itself; the callers visible in this patch\r
+ * invoke it before releasing the spin lock - confirm the locking contract.\r
+ */\r
+static ib_api_status_t\r
+__process_stale(\r
+       IN                              kcep_t* const                           p_cep )\r
+{\r
+       ib_api_status_t         status;\r
+       cep_agent_t                     *p_port_cep;\r
+       ib_mad_element_t        *p_mad;\r
+       mad_cm_rej_t            *p_rej;\r
+\r
+       /* p_port_cep is only an output of this call; it is not used below. */\r
+       status = __cep_get_mad( p_cep, CM_REJ_ATTR_ID, &p_port_cep, &p_mad );\r
+       if( status != IB_SUCCESS )\r
+               return status;\r
+\r
+       p_rej = ib_get_mad_buf( p_mad );\r
+\r
+       /* The generated REJ carries no ARI and no private data. */\r
+       conn_rej_set_ari( NULL, 0, p_rej );\r
+       conn_rej_set_pdata( NULL, 0, p_rej );\r
+\r
+       /*\r
+        * The comm IDs are swapped relative to the CEP's own fields.\r
+        * NOTE(review): presumably so the REJ reads as if the remote side had\r
+        * sent it - confirm.\r
+        */\r
+       p_rej->local_comm_id = p_cep->remote_comm_id;\r
+       p_rej->remote_comm_id = p_cep->local_comm_id;\r
+       p_rej->reason = IB_REJ_STALE_CONN;\r
+\r
+       /*\r
+        * Select the "message rejected" code from the CEP's state.\r
+        * NOTE(review): presumably 0 = REQ, 1 = REP, 2 = no message/other, per\r
+        * the CM REJ message definition - confirm against the IB specification.\r
+        */\r
+       switch( p_cep->state )\r
+       {\r
+       case CEP_STATE_REQ_RCVD:\r
+       case CEP_STATE_REQ_MRA_SENT:\r
+       case CEP_STATE_PRE_REP:\r
+       case CEP_STATE_PRE_REP_MRA_SENT:\r
+               conn_rej_set_msg_rejected( 0, p_rej );\r
+               break;\r
+\r
+       case CEP_STATE_REQ_SENT:\r
+       case CEP_STATE_REP_RCVD:\r
+       case CEP_STATE_REP_MRA_SENT:\r
+               conn_rej_set_msg_rejected( 1, p_rej );\r
+               break;\r
+\r
+       default:\r
+               conn_rej_set_msg_rejected( 2, p_rej );\r
+               break;\r
+       }\r
+       conn_rej_clr_rsvd_fields( p_rej );\r
+\r
+       /* __process_rej either queues p_mad or releases it with ib_put_mad. */\r
+       return __process_rej( p_cep, p_mad );\r
+}\r
+\r
+\r
 static void\r
-__process_req(\r
+__req_handler(\r
        IN                              cep_agent_t* const                      p_port_cep,\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
-       ib_api_status_t         status;\r
+       ib_api_status_t         status = IB_SUCCESS;\r
        mad_cm_req_t            *p_req;\r
-       kcep_t                          *p_cep, *p_new_cep, *p_stale_cep;\r
+       kcep_t                          *p_cep, *p_new_cep, *p_stale_cep = NULL;\r
        KLOCK_QUEUE_HANDLE      hdl;\r
        ib_rej_status_t         reason;\r
 \r
@@ -958,8 +1106,9 @@ __process_req(
                if( p_stale_cep != p_new_cep )\r
                {\r
                        /* Duplicate - must be a stale connection. */\r
-                       /* TODO: Fail the CEP in p_stale_cep */\r
                        reason = IB_REJ_STALE_CONN;\r
+                       /* Fail the local stale CEP. */\r
+                       status = __process_stale( p_stale_cep );\r
                        goto unbind;\r
                }\r
 \r
@@ -1040,6 +1189,10 @@ reject:
        __reject_req( p_port_cep, p_mad, reason );\r
 \r
        KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
+\r
+       if( reason == IB_REJ_STALE_CONN && status == IB_SUCCESS )\r
+               __process_cep( p_stale_cep );\r
+\r
        AL_EXIT( AL_DBG_CM );\r
 }\r
 \r
@@ -1087,7 +1240,7 @@ __save_wire_rep(
 \r
 \r
 static void\r
-__process_mra(\r
+__mra_handler(\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
        ib_api_status_t         status;\r
@@ -1119,13 +1272,13 @@ __process_mra(
                        goto err;\r
                }\r
        }\r
+\r
        /*\r
         * Note that we don't update the CEP's remote comm ID - it messes up REP\r
         * processing since a non-zero RCID implies the connection is in the RCID\r
         * map.  Adding it here requires checking there and conditionally adding\r
         * it.  Ignoring it is a valid thing to do.\r
         */\r
-\r
        if( !(p_cep->state & CEP_STATE_SENT) ||\r
                (1 << conn_mra_get_msg_mraed( p_mra ) !=\r
                (p_cep->state & CEP_MSG_MASK)) )\r
@@ -1152,7 +1305,6 @@ __process_mra(
        p_cep->state |= CEP_STATE_MRA;\r
 \r
        status = __cep_queue_mad( p_cep, p_mad );\r
-       CL_ASSERT( status != IB_INVALID_STATE );\r
 \r
        KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
 \r
@@ -1170,7 +1322,7 @@ err:
 \r
 \r
 static void\r
-__process_rej(\r
+__rej_handler(\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
        ib_api_status_t         status;\r
@@ -1210,79 +1362,14 @@ __process_rej(
        if( p_cep->remote_comm_id &&\r
                p_cep->remote_comm_id != p_rej->local_comm_id )\r
        {\r
-               goto err2;\r
-       }\r
-\r
-       switch( p_cep->state )\r
-       {\r
-       case CEP_STATE_REQ_SENT:\r
-               /*\r
-                * Ignore rejects with the status set to IB_REJ_INVALID_SID.  We will\r
-                * continue to retry (up to max_cm_retries) to connect to the remote\r
-                * side.  This is required to support peer-to-peer connections and\r
-                * clients that try to connect before the server comes up.\r
-                */\r
-               if( p_rej->reason == IB_REJ_INVALID_SID )\r
-               {\r
-                       AL_TRACE( AL_DBG_CM,\r
-                               ("Request rejected (invalid SID) - retrying.\n") );\r
-                       goto err2;\r
-               }\r
-\r
-               /* Fall through */\r
-       case CEP_STATE_REP_SENT:\r
-       case CEP_STATE_REQ_MRA_RCVD:\r
-       case CEP_STATE_REP_MRA_RCVD:\r
-               /* Cancel any outstanding MAD. */\r
-               if( p_cep->p_send_mad )\r
-               {\r
-                       ib_cancel_mad( p_cep->h_mad_svc, p_cep->p_send_mad );\r
-                       p_cep->p_send_mad = NULL;\r
-               }\r
-\r
-               /* Fall through */\r
-       case CEP_STATE_REQ_RCVD:\r
-       case CEP_STATE_REP_RCVD:\r
-       case CEP_STATE_REQ_MRA_SENT:\r
-       case CEP_STATE_REP_MRA_SENT:\r
-       case CEP_STATE_PRE_REP:\r
-       case CEP_STATE_PRE_REP_MRA_SENT:\r
-               if( p_cep->state & CEP_STATE_PREP )\r
-               {\r
-                       CL_ASSERT( p_cep->p_mad );\r
-                       ib_put_mad( p_cep->p_mad );\r
-                       p_cep->p_mad = NULL;\r
-               }\r
-               /* Abort connection establishment. No transition to timewait. */\r
-               __remove_cep( p_cep );\r
-               p_cep->state = CEP_STATE_IDLE;\r
-               break;\r
-\r
-       case CEP_STATE_ESTABLISHED:\r
-       case CEP_STATE_LAP_RCVD:\r
-       case CEP_STATE_LAP_SENT:\r
-       case CEP_STATE_LAP_MRA_RCVD:\r
-       case CEP_STATE_LAP_MRA_SENT:\r
-       case CEP_STATE_PRE_APR:\r
-       case CEP_STATE_PRE_APR_MRA_SENT:\r
-               if( p_cep->state & CEP_STATE_PREP )\r
-               {\r
-                       CL_ASSERT( p_cep->p_mad );\r
-                       ib_put_mad( p_cep->p_mad );\r
-                       p_cep->p_mad = NULL;\r
-               }\r
-               p_cep->state = CEP_STATE_TIMEWAIT;\r
-               __insert_timewait( p_cep );\r
-               break;\r
-\r
-       default:\r
-               /* Ignore the REJ. */\r
-               AL_TRACE( AL_DBG_CM, ("REJ received in invalid state.\n") );\r
-               goto err2;\r
+       err2:\r
+               KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
+       err1:\r
+               ib_put_mad( p_mad );\r
+               AL_EXIT( AL_DBG_CM );\r
        }\r
 \r
-       status = __cep_queue_mad( p_cep, p_mad );\r
-       CL_ASSERT( status != IB_INVALID_STATE );\r
+       status = __process_rej( p_cep, p_mad );\r
 \r
        KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
 \r
@@ -1290,18 +1377,11 @@ __process_rej(
                __process_cep( p_cep );\r
 \r
        AL_EXIT( AL_DBG_CM );\r
-       return;\r
-\r
-err2:\r
-       KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
-err1:\r
-       ib_put_mad( p_mad );\r
-       AL_EXIT( AL_DBG_CM );\r
 }\r
 \r
 \r
 static void\r
-__process_rep(\r
+__rep_handler(\r
        IN                              cep_agent_t* const                      p_port_cep,\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
@@ -1342,25 +1422,25 @@ __process_rep(
                if( __insert_cep( p_cep ) != p_cep )\r
                {\r
                        /* Roll back the state change. */\r
-                       p_cep->state = old_state;\r
                        __reject_mad( p_port_cep, p_cep, p_mad, IB_REJ_STALE_CONN );\r
-                       /* TODO: Handle stale connection. */\r
-                       break;\r
+                       p_cep->state = old_state;\r
+                       status = __process_stale( p_cep );\r
                }\r
-\r
-               /*\r
-                * Cancel any outstanding send.  Note that we do this only after\r
-                * inserting the CEP - if we failed, then we the send will timeout\r
-                * and we'll finish our way through the state machine.\r
-                */\r
-               if( p_cep->p_send_mad )\r
+               else\r
                {\r
-                       ib_cancel_mad( p_cep->h_mad_svc, p_cep->p_send_mad );\r
-                       p_cep->p_send_mad = NULL;\r
-               }\r
+                       /*\r
+                        * Cancel any outstanding send.  Note that we do this only after\r
+                        * inserting the CEP - if we failed, then the send will timeout\r
+                        * and we'll finish our way through the state machine.\r
+                        */\r
+                       if( p_cep->p_send_mad )\r
+                       {\r
+                               ib_cancel_mad( p_cep->h_mad_svc, p_cep->p_send_mad );\r
+                               p_cep->p_send_mad = NULL;\r
+                       }\r
 \r
-               status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
+                       status = __cep_queue_mad( p_cep, p_mad );\r
+               }\r
 \r
                KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
 \r
@@ -1393,7 +1473,7 @@ __process_rep(
 \r
 \r
 static void\r
-__process_rtu(\r
+__rtu_handler(\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
        ib_api_status_t         status;\r
@@ -1433,7 +1513,6 @@ __process_rtu(
                p_cep->state = CEP_STATE_ESTABLISHED;\r
 \r
                status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
 \r
                /* Update timewait time. */\r
                __calc_timewait( p_cep );\r
@@ -1459,7 +1538,7 @@ done:
 \r
 \r
 static void\r
-__process_dreq(\r
+__dreq_handler(\r
        IN                              cep_agent_t* const                      p_port_cep,\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
@@ -1513,7 +1592,6 @@ __process_dreq(
                p_cep->state = CEP_STATE_DREQ_RCVD;\r
 \r
                status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
 \r
                /* Store the TID for use in the reply DREP. */\r
                p_cep->tid = p_dreq->hdr.trans_id;\r
@@ -1544,7 +1622,7 @@ __process_dreq(
 \r
 \r
 static void\r
-__process_drep(\r
+__drep_handler(\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
        ib_api_status_t         status;\r
@@ -1593,7 +1671,6 @@ __process_drep(
                p_cep->state = CEP_STATE_TIMEWAIT;\r
 \r
                status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
        }\r
        else\r
        {\r
@@ -1684,7 +1761,7 @@ __format_lap_av(
 \r
 \r
 static void\r
-__process_lap(\r
+__lap_handler(\r
        IN                              cep_agent_t* const                      p_port_cep,\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
@@ -1748,7 +1825,6 @@ __process_lap(
                p_cep->state = CEP_STATE_LAP_RCVD;\r
 \r
                status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
 \r
                KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
 \r
@@ -1774,7 +1850,7 @@ __process_lap(
 \r
 \r
 static void\r
-__process_apr(\r
+__apr_handler(\r
        IN                              ib_mad_element_t* const         p_mad )\r
 {\r
        ib_api_status_t         status;\r
@@ -1819,7 +1895,6 @@ __process_apr(
                p_cep->state = CEP_STATE_ESTABLISHED;\r
 \r
                status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
 \r
                KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
 \r
@@ -1869,39 +1944,39 @@ __cep_mad_recv_cb(
        switch( p_hdr->attr_id )\r
        {\r
        case CM_REQ_ATTR_ID:\r
-               __process_req( p_port_cep, p_mad );\r
+               __req_handler( p_port_cep, p_mad );\r
                break;\r
 \r
        case CM_MRA_ATTR_ID:\r
-               __process_mra( p_mad );\r
+               __mra_handler( p_mad );\r
                break;\r
 \r
        case CM_REJ_ATTR_ID:\r
-               __process_rej( p_mad );\r
+               __rej_handler( p_mad );\r
                break;\r
 \r
        case CM_REP_ATTR_ID:\r
-               __process_rep( p_port_cep, p_mad );\r
+               __rep_handler( p_port_cep, p_mad );\r
                break;\r
 \r
        case CM_RTU_ATTR_ID:\r
-               __process_rtu( p_mad );\r
+               __rtu_handler( p_mad );\r
                break;\r
 \r
        case CM_DREQ_ATTR_ID:\r
-               __process_dreq( p_port_cep, p_mad );\r
+               __dreq_handler( p_port_cep, p_mad );\r
                break;\r
 \r
        case CM_DREP_ATTR_ID:\r
-               __process_drep( p_mad );\r
+               __drep_handler( p_mad );\r
                break;\r
 \r
        case CM_LAP_ATTR_ID:\r
-               __process_lap( p_port_cep, p_mad );\r
+               __lap_handler( p_port_cep, p_mad );\r
                break;\r
 \r
        case CM_APR_ATTR_ID:\r
-               __process_apr( p_mad );\r
+               __apr_handler( p_mad );\r
                break;\r
 \r
        case CM_SIDR_REQ_ATTR_ID:\r
@@ -2642,7 +2717,11 @@ __insert_by_id(
                else if( p_new_cep->remote_ca_guid > p_cep->remote_ca_guid )\r
                        p_item = cl_rbmap_right( p_item ), left = FALSE;\r
                else\r
+               {\r
+                       AL_TRACE( AL_DBG_CM | AL_DBG_WARN,\r
+                               ("WARNING: Duplicate remote CID and CA GUID.\n") );\r
                        goto done;\r
+               }\r
        }\r
 \r
        cl_rbmap_insert(\r
@@ -2681,7 +2760,11 @@ __insert_by_qpn(
                else if( p_new_cep->remote_ca_guid > p_cep->remote_ca_guid )\r
                        p_item = cl_rbmap_right( p_item ), left = FALSE;\r
                else\r
+               {\r
+                       AL_TRACE( AL_DBG_CM | AL_DBG_WARN,\r
+                               ("WARNING: Duplicate remote QPN and CA GUID.\n") );\r
                        goto done;\r
+               }\r
        }\r
 \r
        cl_rbmap_insert(\r