[IPoIB] Report receives with NDIS_STATUS_RESOURCES when below a RQ
author ftillier <ftillier@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Wed, 7 Jun 2006 23:49:48 +0000 (23:49 +0000)
committer ftillier <ftillier@ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86>
Wed, 7 Jun 2006 23:49:48 +0000 (23:49 +0000)
low watermark.  This prevents hangs during certain workloads where
the RQ becomes exhausted.

git-svn-id: svn://openib.tc.cornell.edu/gen1/trunk@378 ad392aa1-c5ef-ae45-8dd8-e69d62a5ef86

ulp/ipoib/kernel/ipoib_adapter.h
ulp/ipoib/kernel/ipoib_driver.c
ulp/ipoib/kernel/ipoib_port.c
ulp/ipoib/kernel/ipoib_port.h
ulp/ipoib/kernel/netipoib.inf

index b3abf90..43601a0 100644 (file)
@@ -63,6 +63,7 @@
 typedef struct _ipoib_params\r
 {\r
        int32_t         rq_depth;\r
+       int32_t         rq_low_watermark;\r
        int32_t         sq_depth;\r
        boolean_t       send_chksum_offload;\r
        boolean_t       recv_chksum_offload;\r
@@ -80,6 +81,10 @@ typedef struct _ipoib_params
 *      rq_depth\r
 *              Number of receive WQEs to allocate.\r
 *\r
+*      rq_low_watermark\r
+*              Receives are indicated with NDIS_STATUS_RESOURCES when the number of\r
+*              receives posted to the RQ falls below this value.\r
+*\r
 *      sq_depth\r
 *              Number of send WQEs to allocate.\r
 *\r
index f23ccd3..3e95964 100644 (file)
@@ -429,6 +429,20 @@ ipoib_get_adapter_params(
        }\r
        p_adapter->params.rq_depth = p_param->ParameterData.IntegerData;\r
 \r
+       /* Optional: Receive queue low watermark. */\r
+       RtlInitUnicodeString( &keyword, L"RqLowWatermark" );\r
+       NdisReadConfiguration(\r
+               &status, &p_param, h_config, &keyword, NdisParameterInteger );\r
+       if( status != NDIS_STATUS_SUCCESS || !p_param->ParameterData.IntegerData )\r
+       {\r
+               p_adapter->params.rq_low_watermark = p_adapter->params.rq_depth >> 2;\r
+       }\r
+       else\r
+       {\r
+               p_adapter->params.rq_low_watermark =\r
+                       p_adapter->params.rq_depth / p_param->ParameterData.IntegerData;\r
+       }\r
+\r
        /* Required: Send queue depth. */\r
        RtlInitUnicodeString( &keyword, L"SqDepth" );\r
        NdisReadConfiguration(\r
index 2f900b4..37b62df 100644 (file)
@@ -218,7 +218,7 @@ __recv_get_endpts(
                OUT                     ipoib_endpt_t** const           pp_src,\r
                OUT                     ipoib_endpt_t** const           pp_dst );\r
 \r
-static uint32_t\r
+static int32_t\r
 __recv_mgr_filter(\r
        IN                              ipoib_port_t* const                     p_port,\r
        IN                              ib_wc_t* const                          p_done_wc_list,\r
@@ -258,7 +258,9 @@ __recv_mgr_prepare_pkt(
 static uint32_t\r
 __recv_mgr_build_pkt_array(\r
        IN                              ipoib_port_t* const                     p_port,\r
-               OUT                     cl_qlist_t* const                       p_done_list );\r
+       IN                              int32_t                                         shortage,\r
+               OUT                     cl_qlist_t* const                       p_done_list,\r
+               OUT                     int32_t* const                          p_discarded );\r
 \r
 /******************************************************************************\r
 *\r
@@ -1235,7 +1237,6 @@ __buf_mgr_get_ndis_pkt(
 \r
        NdisChainBufferAtFront( p_packet, p_buffer );\r
        NDIS_SET_PACKET_HEADER_SIZE( p_packet, sizeof(eth_hdr_t) );\r
-       NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );\r
 \r
        IPOIB_EXIT(  IPOIB_DBG_RECV );\r
        return p_packet;\r
@@ -1299,8 +1300,13 @@ __recv_mgr_destroy(
 }\r
 \r
 \r
-/* Posts receive buffers to the receive queue. */\r
-static ib_api_status_t\r
+/*\r
+ * Posts receive buffers to the receive queue and returns the number\r
+ * of receives needed to bring the RQ to its low water mark.  Note\r
+ * that the value is signed, and can go negative.  All tests must\r
+ * be for > 0.\r
+ */\r
+static int32_t\r
 __recv_mgr_repost(\r
        IN                              ipoib_port_t* const                     p_port )\r
 {\r
@@ -1319,7 +1325,7 @@ __recv_mgr_repost(
                cl_obj_unlock( &p_port->obj );\r
                IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,\r
                        ("Port in invalid state.  Not reposting.\n") );\r
-               return IB_SUCCESS;\r
+               return 0;\r
        }\r
        cl_obj_ref( &p_port->obj );\r
        cl_obj_unlock( &p_port->obj );\r
@@ -1349,45 +1355,37 @@ __recv_mgr_repost(
 \r
                p_head = p_next;\r
 \r
-               cl_atomic_inc( &p_port->recv_mgr.depth );\r
+               p_port->recv_mgr.depth++;\r
        }\r
 \r
-       if( !p_head )\r
+       if( p_head )\r
        {\r
-               cl_obj_deref( &p_port->obj );\r
-               IPOIB_EXIT( IPOIB_DBG_RECV );\r
-               return IB_SUCCESS;\r
-       }\r
-\r
-       cl_perf_start( PostRecv );\r
-       status = p_port->p_adapter->p_ifc->post_recv(\r
-               p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );\r
-       cl_perf_stop( &p_port->p_adapter->perf, PostRecv );\r
+               cl_perf_start( PostRecv );\r
+               status = p_port->p_adapter->p_ifc->post_recv(\r
+                       p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );\r
+               cl_perf_stop( &p_port->p_adapter->perf, PostRecv );\r
 \r
-       /*\r
-        * If we failed, fix up the work completion list and return those\r
-        * buffers to the pool\r
-        */\r
-       if( status != IB_SUCCESS )\r
-       {\r
-               IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,\r
-                       ("ip_post_recv returned %s\n", \r
-                       p_port->p_adapter->p_ifc->get_err_str( status )) );\r
-               /* return the descriptors to the pool */\r
-               while( p_failed )\r
+               if( status != IB_SUCCESS )\r
                {\r
-                       p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );\r
-                       p_failed = p_failed->p_next;\r
+                       IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,\r
+                               ("ip_post_recv returned %s\n", \r
+                               p_port->p_adapter->p_ifc->get_err_str( status )) );\r
+                       /* return the descriptors to the pool */\r
+                       while( p_failed )\r
+                       {\r
+                               p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );\r
+                               p_failed = p_failed->p_next;\r
 \r
-                       __buf_mgr_put_recv( p_port, p_head, NULL );\r
-                       cl_atomic_dec( &p_port->recv_mgr.depth );\r
+                               __buf_mgr_put_recv( p_port, p_head, NULL );\r
+                               p_port->recv_mgr.depth--;\r
+                       }\r
                }\r
        }\r
 \r
        cl_obj_deref( &p_port->obj );\r
 \r
        IPOIB_EXIT( IPOIB_DBG_RECV );\r
-       return status;\r
+       return p_port->p_adapter->params.rq_low_watermark - p_port->recv_mgr.depth;\r
 }\r
 \r
 \r
@@ -1400,6 +1398,7 @@ ipoib_return_packet(
        ipoib_port_t            *p_port;\r
        ipoib_recv_desc_t       *p_desc;\r
        ib_api_status_t         status = IB_NOT_DONE;\r
+       int32_t                         shortage;\r
        PERF_DECLARE( ReturnPacket );\r
        PERF_DECLARE( ReturnPutRecv );\r
        PERF_DECLARE( ReturnRepostRecv );\r
@@ -1413,30 +1412,25 @@ ipoib_return_packet(
 \r
        cl_perf_start( ReturnPacket );\r
 \r
-       /* Get the manager and descriptor from the packet. */\r
+       /* Get the port and descriptor from the packet. */\r
        p_port = IPOIB_PORT_FROM_PACKET( p_packet );\r
        p_desc = IPOIB_RECV_FROM_PACKET( p_packet );\r
 \r
        cl_spinlock_acquire( &p_port->recv_lock );\r
+\r
        cl_perf_start( ReturnPutRecv );\r
        __buf_mgr_put_recv( p_port, p_desc, p_packet );\r
        cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );\r
 \r
        /* Repost buffers. */\r
        cl_perf_start( ReturnRepostRecv );\r
-       __recv_mgr_repost( p_port );\r
+       shortage = __recv_mgr_repost( p_port );\r
        cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );\r
 \r
-       /* Complete any additional receives waiting for a packet. */\r
-       p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );\r
-       do\r
+       for( p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );\r
+               p_item != cl_qlist_end( &p_port->recv_mgr.done_list );\r
+               p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list ) )\r
        {\r
-               if( p_item == cl_qlist_end( &p_port->recv_mgr.done_list ) )\r
-               {\r
-                       cl_spinlock_release( &p_port->recv_lock );\r
-                       break;\r
-               }\r
-\r
                p_desc = (ipoib_recv_desc_t*)p_item;\r
 \r
                cl_perf_start( ReturnPreparePkt );\r
@@ -1444,11 +1438,29 @@ ipoib_return_packet(
                cl_perf_stop( &p_port->p_adapter->perf, ReturnPreparePkt );\r
                if( status == IB_SUCCESS )\r
                {\r
+                       if( shortage > 0 )\r
+                               NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_RESOURCES );\r
+                       else\r
+                               NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );\r
+\r
                        cl_spinlock_release( &p_port->recv_lock );\r
                        cl_perf_start( ReturnNdisIndicate );\r
                        NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,\r
                                &p_packet, 1 );\r
                        cl_perf_stop( &p_port->p_adapter->perf, ReturnNdisIndicate );\r
+                       cl_spinlock_acquire( &p_port->recv_lock );\r
+\r
+                       if( shortage > 0 )\r
+                       {\r
+                               cl_perf_start( ReturnPutRecv );\r
+                               __buf_mgr_put_recv( p_port, p_desc, p_packet );\r
+                               cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );\r
+\r
+                               /* Repost buffers. */\r
+                               cl_perf_start( ReturnRepostRecv );\r
+                               shortage = __recv_mgr_repost( p_port );\r
+                               cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );\r
+                       }\r
                }\r
                else if( status != IB_NOT_DONE )\r
                {\r
@@ -1457,14 +1469,10 @@ ipoib_return_packet(
                                p_port->p_adapter->p_ifc->get_err_str( status )) );\r
                        /* Return the item to the head of the list. */\r
                        cl_qlist_insert_head( &p_port->recv_mgr.done_list, p_item );\r
-                       cl_spinlock_release( &p_port->recv_lock );\r
-               }\r
-               else\r
-               {\r
-                       p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );\r
+                       break;\r
                }\r
-\r
-       } while( status == IB_NOT_DONE );\r
+       }\r
+       cl_spinlock_release( &p_port->recv_lock );\r
        cl_perf_stop( &p_port->p_adapter->perf, ReturnPacket );\r
 \r
        IPOIB_EXIT( IPOIB_DBG_RECV );\r
@@ -1479,7 +1487,7 @@ __recv_cb(
        ipoib_port_t            *p_port;\r
        ib_api_status_t         status;\r
        ib_wc_t                         wc[MAX_RECV_WC], *p_free, *p_wc;\r
-       uint32_t                        pkt_cnt, recv_cnt = 0;\r
+       int32_t                         pkt_cnt, recv_cnt = 0, shortage, discarded;\r
        cl_qlist_t                      done_list, bad_list;\r
        size_t                          i;\r
        PERF_DECLARE( RecvCompBundle );\r
@@ -1540,35 +1548,78 @@ __recv_cb(
        /* We're done looking at the endpoint map, release the reference. */\r
        cl_atomic_dec( &p_port->endpt_rdr );\r
 \r
+       cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );\r
+\r
+       cl_spinlock_acquire( &p_port->recv_lock );\r
+\r
        /* Update our posted depth. */\r
-       cl_atomic_sub( &p_port->recv_mgr.depth, recv_cnt );\r
+       p_port->recv_mgr.depth -= recv_cnt;\r
 \r
-       cl_perf_start( BuildPktArray );\r
-       /* Notify NDIS of any and all possible receive buffers. */\r
-       pkt_cnt = __recv_mgr_build_pkt_array( p_port, &done_list );\r
-       cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );\r
+       /* Return any discarded receives to the pool */\r
+       cl_perf_start( PutRecvList );\r
+       __buf_mgr_put_recv_list( p_port, &bad_list );\r
+       cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );\r
 \r
-       /* Only indicate receives if we actually had any. */\r
-       if( pkt_cnt )\r
+       do\r
        {\r
+               /* Repost ASAP so we don't starve the RQ. */\r
+               cl_perf_start( RepostRecv );\r
+               shortage = __recv_mgr_repost( p_port );\r
+               cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );\r
+\r
+               cl_perf_start( BuildPktArray );\r
+               /* Notify NDIS of any and all possible receive buffers. */\r
+               pkt_cnt = __recv_mgr_build_pkt_array(\r
+                       p_port, shortage, &done_list, &discarded );\r
+               cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );\r
+\r
+               /* Repost again if receives were discarded while the RQ is still short. */\r
+               if( discarded && shortage > 0 )\r
+               {\r
+                       /* We may have thrown away packets, and have a shortage */\r
+                       cl_perf_start( RepostRecv );\r
+                       __recv_mgr_repost( p_port );\r
+                       cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );\r
+               }\r
+\r
+               if( !pkt_cnt )\r
+                       break;\r
+\r
+               cl_spinlock_release( &p_port->recv_lock );\r
+\r
                cl_perf_start( RecvNdisIndicate );\r
                NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,\r
                        p_port->recv_mgr.recv_pkt_array, pkt_cnt );\r
                cl_perf_stop( &p_port->p_adapter->perf, RecvNdisIndicate );\r
-       }\r
 \r
-       cl_spinlock_acquire( &p_port->recv_lock );\r
+               /*\r
+                * Cap the number of receives to put back to what we just indicated\r
+                * with NDIS_STATUS_RESOURCES.\r
+                */\r
+               if( shortage > 0 )\r
+               {\r
+                       if( pkt_cnt < shortage )\r
+                               shortage = pkt_cnt;\r
 \r
-       /* Return any discarded receives to the pool */\r
-       cl_perf_start( PutRecvList );\r
-       __buf_mgr_put_recv_list( p_port, &bad_list );\r
-       cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );\r
+                       /* Return all but the last packet to the pool. */\r
+                       cl_spinlock_acquire( &p_port->recv_lock );\r
+                       while( shortage-- > 1 )\r
+                       {\r
+                               __buf_mgr_put_recv( p_port,\r
+                                       IPOIB_RECV_FROM_PACKET( p_port->recv_mgr.recv_pkt_array[shortage] ),\r
+                                       p_port->recv_mgr.recv_pkt_array[shortage] );\r
+                       }\r
+                       cl_spinlock_release( &p_port->recv_lock );\r
 \r
-       /* Repost receives. */\r
-       cl_perf_start( RepostRecv );\r
-       __recv_mgr_repost( p_port );\r
-       cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );\r
+                       /*\r
+                        * Return the last packet as if NDIS returned it, so that we repost\r
+                        * and report any other pending receives.\r
+                        */\r
+                       ipoib_return_packet( NULL, p_port->recv_mgr.recv_pkt_array[0] );\r
+               }\r
+               cl_spinlock_acquire( &p_port->recv_lock );\r
 \r
+       } while( pkt_cnt );\r
        cl_spinlock_release( &p_port->recv_lock );\r
 \r
        /*\r
@@ -1585,7 +1636,6 @@ __recv_cb(
        cl_obj_deref( &p_port->obj );\r
 \r
        cl_perf_stop( &p_port->p_adapter->perf, RecvCb );\r
-       cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );\r
 \r
        IPOIB_EXIT( IPOIB_DBG_RECV );\r
 }\r
@@ -1718,7 +1768,7 @@ __recv_get_endpts(
 }\r
 \r
 \r
-static uint32_t\r
+static int32_t\r
 __recv_mgr_filter(\r
        IN                              ipoib_port_t* const                     p_port,\r
        IN                              ib_wc_t* const                          p_done_wc_list,\r
@@ -1731,7 +1781,8 @@ __recv_mgr_filter(
        eth_pkt_t                               *p_eth;\r
        ipoib_endpt_t                   *p_src, *p_dst;\r
        ib_api_status_t                 status;\r
-       uint32_t                                len, recv_cnt = 0;\r
+       uint32_t                                len;\r
+       int32_t                                 recv_cnt = 0;\r
        PERF_DECLARE( GetRecvEndpts );\r
        PERF_DECLARE( RecvGen );\r
        PERF_DECLARE( RecvTcp );\r
@@ -2393,7 +2444,9 @@ __recv_mgr_prepare_pkt(
 static uint32_t\r
 __recv_mgr_build_pkt_array(\r
        IN                              ipoib_port_t* const                     p_port,\r
-               OUT                     cl_qlist_t* const                       p_done_list )\r
+       IN                              int32_t                                         shortage,\r
+               OUT                     cl_qlist_t* const                       p_done_list,\r
+               OUT                     int32_t* const                          p_discarded )\r
 {\r
        cl_list_item_t                  *p_item;\r
        ipoib_recv_desc_t               *p_desc;\r
@@ -2403,7 +2456,7 @@ __recv_mgr_build_pkt_array(
 \r
        IPOIB_ENTER( IPOIB_DBG_RECV );\r
 \r
-       cl_spinlock_acquire( &p_port->recv_lock );\r
+       *p_discarded = 0;\r
 \r
        /* Move any existing receives to the head to preserve ordering. */\r
        cl_qlist_insert_list_head( p_done_list, &p_port->recv_mgr.done_list );\r
@@ -2419,9 +2472,23 @@ __recv_mgr_build_pkt_array(
                if( status == IB_SUCCESS )\r
                {\r
                        CL_ASSERT( p_port->recv_mgr.recv_pkt_array[i] );\r
+                       if( shortage-- > 0 )\r
+                       {\r
+                               NDIS_SET_PACKET_STATUS(\r
+                                       p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_RESOURCES );\r
+                       }\r
+                       else\r
+                       {\r
+                               NDIS_SET_PACKET_STATUS(\r
+                                       p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_SUCCESS );\r
+                       }\r
                        i++;\r
                }\r
-               else if( status != IB_NOT_DONE )\r
+               else if( status == IB_NOT_DONE )\r
+               {\r
+                       (*p_discarded)++;\r
+               }\r
+               else\r
                {\r
                        IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,\r
                                ("__recv_mgr_prepare_pkt returned %s\n",\r
@@ -2435,8 +2502,6 @@ __recv_mgr_build_pkt_array(
                p_item = cl_qlist_remove_head( p_done_list );\r
        }\r
 \r
-       cl_spinlock_release( &p_port->recv_lock );\r
-\r
        IPOIB_EXIT( IPOIB_DBG_RECV );\r
        return i;\r
 }\r
index 9f16152..b693ccc 100644 (file)
@@ -406,7 +406,7 @@ typedef struct _ipoib_send_desc
 \r
 typedef struct _ipoib_recv_mgr\r
 {\r
-       atomic32_t              depth;\r
+       int32_t                 depth;\r
 \r
        NDIS_PACKET             **recv_pkt_array;\r
 \r
index 47f1602..eab489b 100644 (file)
@@ -68,6 +68,14 @@ HKR, Ndi\Params\RqDepth,             Min,            0, "128"
 HKR, Ndi\Params\RqDepth,               Max,            0, "1024"\r
 HKR, Ndi\Params\RqDepth,               Step,           0, "128"\r
 \r
+HKR, Ndi\Params\RqLowWatermark,        ParamDesc,      0, "Receive Queue Low Watermark"\r
+HKR, Ndi\Params\RqLowWatermark,        Type,           0, "dword"\r
+HKR, Ndi\Params\RqLowWatermark,        Default,        0, "4"\r
+HKR, Ndi\Params\RqLowWatermark,        Optional,       0, "1"\r
+HKR, Ndi\Params\RqLowWatermark,        Min,            0, "2"\r
+HKR, Ndi\Params\RqLowWatermark,        Max,            0, "8"\r
+HKR, Ndi\Params\RqLowWatermark,        Step,           0, "1"\r
+\r
 HKR, Ndi\Params\SqDepth,               ParamDesc,      0, "Send Queue Depth"\r
 HKR, Ndi\Params\SqDepth,               Type,           0, "dword"\r
 HKR, Ndi\Params\SqDepth,               Default,        0, "512"\r