ib/cm: fix handling of failed send completions
author E~1\svn\LOCALS~1\Temp/report.7.tmp <E~1\svn\LOCALS~1\Temp/report.7.tmp@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Mon, 4 Jan 2010 19:01:09 +0000 (19:01 +0000)
committer E~1\svn\LOCALS~1\Temp/report.7.tmp <E~1\svn\LOCALS~1\Temp/report.7.tmp@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Mon, 4 Jan 2010 19:01:09 +0000 (19:01 +0000)
__cep_mad_send_cb() assumes that the mad being processed is
associated with the current state of the CEP.  This may not be
the case.

For example, for a short lived connection, it was observed that
a REP mad completed with status canceled.  This is normal.  However,
the user already attempted to disconnect the connection by sending
a DREQ.  This left the cep in the DREQ_SENT state by the time that
the REP mad completed.  Since the REP failed, but the state was
DREQ_SENT, the code assumed that the DREQ had failed and transitioned
the cep into TIMEWAIT.  The result is that the DREQ is never
matched with a DREP or canceled, but holds a reference on the CEP.

Until the DREQ times out (time depends on connection, but easily
up to a minute), attempts to destroy the CEP are blocked.

Fix this by simply discarding any completed sends that were not
sent from the current state of the cep when the completion handler
is invoked.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
git-svn-id: svn://openib.tc.cornell.edu/gen1/trunk@2650 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86

core/al/kernel/al_cm.c
core/al/kernel/al_cm_cep.c

index 48b0cb5..955985a 100644 (file)
@@ -37,7 +37,7 @@
 typedef struct _iba_cm_id_priv\r
 {\r
        iba_cm_id       id;\r
-       KEVENT          destroy_event;  \r
+       KEVENT          destroy_event;\r
 \r
 }      iba_cm_id_priv;\r
 \r
index 49fa417..4987207 100644 (file)
@@ -27,7 +27,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
  * SOFTWARE.\r
  *\r
- * $Id$\r
+ * $Id: al_cm_cep.c 2540 2009-11-03 17:23:09Z shefty $\r
  */\r
 \r
 \r
@@ -2213,10 +2213,7 @@ __cep_mad_send_cb(
 \r
        p_cep = (kcep_t*)p_mad->context1;\r
 \r
-       /*\r
-        * The connection context is not set when performing immediate responses,\r
-        * such as repeating MADS.\r
-        */\r
+       /* The cep context is only set for MADs that are retried. */\r
        if( !p_cep )\r
        {\r
                ib_put_mad( p_mad );\r
@@ -2224,94 +2221,71 @@ __cep_mad_send_cb(
                return;\r
        }\r
 \r
+       CL_ASSERT( p_mad->status != IB_WCS_SUCCESS );\r
        p_mad->context1 = NULL;\r
 \r
        KeAcquireInStackQueuedSpinLockAtDpcLevel( &gp_cep_mgr->lock, &hdl );\r
-       /* Clear the sent MAD pointer so that we don't try cancelling again. */\r
-       if( p_cep->p_send_mad == p_mad )\r
-               p_cep->p_send_mad = NULL;\r
-\r
-       switch( p_mad->status )\r
+       if( p_cep->p_send_mad != p_mad )\r
        {\r
-       case IB_WCS_SUCCESS:\r
                KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
                ib_put_mad( p_mad );\r
-               break;\r
-\r
-       case IB_WCS_CANCELED:\r
-               if( p_cep->state != CEP_STATE_REQ_SENT &&\r
-                       p_cep->state != CEP_STATE_REQ_MRA_RCVD &&\r
-                       p_cep->state != CEP_STATE_REP_SENT &&\r
-                       p_cep->state != CEP_STATE_REP_MRA_RCVD &&\r
-                       p_cep->state != CEP_STATE_LAP_SENT &&\r
-                       p_cep->state != CEP_STATE_LAP_MRA_RCVD &&\r
-                       p_cep->state != CEP_STATE_DREQ_SENT &&\r
-                       p_cep->state != CEP_STATE_SREQ_SENT )\r
-               {\r
-                       KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
-                       ib_put_mad( p_mad );\r
-                       break;\r
-               }\r
-               /* Treat as a timeout so we don't stall the state machine. */\r
-               p_mad->status = IB_WCS_TIMEOUT_RETRY_ERR;\r
-\r
-               /* Fall through. */\r
-       case IB_WCS_TIMEOUT_RETRY_ERR:\r
-       default:\r
-               /* Timeout.  Reject the connection. */\r
-               switch( p_cep->state )\r
-               {\r
-               case CEP_STATE_REQ_SENT:\r
-               case CEP_STATE_REQ_MRA_RCVD:\r
-               case CEP_STATE_REP_SENT:\r
-               case CEP_STATE_REP_MRA_RCVD:\r
-                       /* Send the REJ. */\r
-                       __reject_timeout( p_port_cep, p_cep, p_mad );\r
-                       __remove_cep( p_cep );\r
-                       p_cep->state = CEP_STATE_IDLE;\r
-                       break;\r
-\r
-               case CEP_STATE_DREQ_DESTROY:\r
-                       p_cep->state = CEP_STATE_DESTROY;\r
-                       __insert_timewait( p_cep );\r
-                       /* Fall through. */\r
+               goto done;\r
+       }\r
 \r
-               case CEP_STATE_DESTROY:\r
-                       KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
-                       ib_put_mad( p_mad );\r
-                       goto done;\r
+       /* Clear the sent MAD pointer so that we don't try cancelling again. */\r
+       p_cep->p_send_mad = NULL;\r
 \r
-               case CEP_STATE_DREQ_SENT:\r
-                       /*\r
-                        * Make up a DREP mad so we can respond if we receive\r
-                        * a DREQ while in timewait.\r
-                        */\r
-                       __format_mad_hdr( &p_cep->mads.drep.hdr, p_cep, CM_DREP_ATTR_ID );\r
-                       __format_drep( p_cep, NULL, 0, &p_cep->mads.drep );\r
-                       p_cep->state = CEP_STATE_TIMEWAIT;\r
-                       __insert_timewait( p_cep );\r
-                       break;\r
+       switch( p_cep->state )\r
+       {\r
+       case CEP_STATE_REQ_SENT:\r
+       case CEP_STATE_REQ_MRA_RCVD:\r
+       case CEP_STATE_REP_SENT:\r
+       case CEP_STATE_REP_MRA_RCVD:\r
+               /* Send the REJ. */\r
+               __reject_timeout( p_port_cep, p_cep, p_mad );\r
+               __remove_cep( p_cep );\r
+               p_cep->state = CEP_STATE_IDLE;\r
+               break;\r
 \r
-               case CEP_STATE_LAP_SENT:\r
-                       /*\r
-                        * Before CEP was sent, we have been in CEP_STATE_ESTABLISHED as we\r
-                        * failed to send, we return to that state.\r
-                        */\r
-                       p_cep->state = CEP_STATE_ESTABLISHED;\r
-                       break;\r
-               default:\r
-                       break;\r
-               }\r
+       case CEP_STATE_DREQ_DESTROY:\r
+               p_cep->state = CEP_STATE_DESTROY;\r
+               __insert_timewait( p_cep );\r
+               /* Fall through. */\r
 \r
-               status = __cep_queue_mad( p_cep, p_mad );\r
-               CL_ASSERT( status != IB_INVALID_STATE );\r
+       case CEP_STATE_DESTROY:\r
                KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
+               ib_put_mad( p_mad );\r
+               goto done;\r
 \r
-               if( status == IB_SUCCESS )\r
-                       __process_cep( p_cep );\r
+       case CEP_STATE_DREQ_SENT:\r
+               /*\r
+                * Make up a DREP mad so we can respond if we receive\r
+                * a DREQ while in timewait.\r
+                */\r
+               __format_mad_hdr( &p_cep->mads.drep.hdr, p_cep, CM_DREP_ATTR_ID );\r
+               __format_drep( p_cep, NULL, 0, &p_cep->mads.drep );\r
+               p_cep->state = CEP_STATE_TIMEWAIT;\r
+               __insert_timewait( p_cep );\r
+               break;\r
+\r
+       case CEP_STATE_LAP_SENT:\r
+               /*\r
+                * Before CEP was sent, we have been in CEP_STATE_ESTABLISHED as we\r
+                * failed to send, we return to that state.\r
+                */\r
+               p_cep->state = CEP_STATE_ESTABLISHED;\r
+               break;\r
+       default:\r
                break;\r
        }\r
 \r
+       status = __cep_queue_mad( p_cep, p_mad );\r
+       CL_ASSERT( status != IB_INVALID_STATE );\r
+       KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl );\r
+\r
+       if( status == IB_SUCCESS )\r
+               __process_cep( p_cep );\r
+\r
 done:\r
        pfn_destroy_cb = p_cep->pfn_destroy_cb;\r
        cep_context = p_cep->context;\r
@@ -3938,12 +3912,8 @@ __cleanup_cep(
        CL_ASSERT( KeGetCurrentIrql() == DISPATCH_LEVEL );\r
 \r
        /* If we've already come through here, we're done. */\r
-       if( p_cep->state == CEP_STATE_DESTROY ||\r
-               p_cep->state == CEP_STATE_DREQ_DESTROY )\r
-       {\r
-               AL_EXIT( AL_DBG_CM );\r
-               return -1;\r
-       }\r
+       CL_ASSERT( p_cep->state != CEP_STATE_DESTROY &&\r
+               p_cep->state != CEP_STATE_DREQ_DESTROY );\r
 \r
        /* Cleanup the pending MAD list. */\r
        while( p_cep->p_mad_head )\r