[Openib-windows] Receive Queue
Fabian Tillier
ftillier at silverstorm.com
Mon May 22 10:38:41 PDT 2006
Hi Guy,
On 4/27/06, Fabian Tillier <ftillier at silverstorm.com> wrote:
> Hi Guy,
>
> On 4/25/06, Guy Corem <guyc at voltaire.com> wrote:
> >
> > Hi Fab,
> >
> > Thanks for your illuminating reply. Attached is my fix, per your
> > suggestion.
>
> This is a great start. There are some issues, though; I will put
> something together and send it out for your review.
Sorry it's taken this long to get something to you. Every time I
thought I was close, I found a potential problem that had to be
closed.
Here are the changes that were made:
- If the receive queue falls below a low water mark, the driver will
report the receives with STATUS_RESOURCES.
- For any packets reported with STATUS_RESOURCES, the driver now calls
ipoib_return_packet to repost, as well as to flush any other pending
packets. The existing code only completed a single packet during
ipoib_return_packet, rather than reporting as many as possible.
- ipoib_return_packet now also reports receives with STATUS_RESOURCES,
and loops internally to report as many queued receives as possible.
- I added a tunable to the INF that lets users configure the low
watermark as a divisor of the receive queue depth. The default is 4
(1/4th of the RQ depth), the max is 8 (1/8th), and the min is 2 (1/2).
A rough sketch of how the value is used follows this list.
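Roughly, the new logic boils down to the following. This is only an
illustrative sketch with made-up names, not the driver code itself; the
real work is in __recv_mgr_repost and its callers in the patch below.

#include <stdint.h>

/*
 * Illustrative sketch only.  The INF value acts as a divisor of the RQ
 * depth, e.g. a divisor of 4 on an RQ depth of 512 gives a low
 * watermark of 128 posted receives.
 */
static int32_t
sketch_rq_shortage(
    int32_t rq_depth,           /* RqDepth from the INF */
    int32_t watermark_divisor,  /* RqLowWatermark: min 2, max 8, default 4 */
    int32_t posted_depth )      /* receives currently posted to the RQ */
{
    int32_t low_watermark = rq_depth / watermark_divisor;

    /*
     * Positive means the RQ is that many receives below the watermark:
     * packets are indicated with NDIS_STATUS_RESOURCES and their
     * buffers recycled as soon as NdisMIndicateReceivePacket returns.
     * Zero or negative means the RQ is healthy: packets are indicated
     * with NDIS_STATUS_SUCCESS and stay with NDIS until it returns
     * them via ipoib_return_packet.
     */
    return low_watermark - posted_depth;
}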
Please give it a try and let me know if it works for you or not. I'll
wait to apply until I hear back.
Thanks,
- Fab
Index: ulp/ipoib/kernel/netipoib.inf
===================================================================
--- ulp/ipoib/kernel/netipoib.inf (revision 358)
+++ ulp/ipoib/kernel/netipoib.inf (working copy)
@@ -68,6 +68,14 @@
HKR, Ndi\Params\RqDepth, Max, 0, "1024"
HKR, Ndi\Params\RqDepth, Step, 0, "128"
+HKR, Ndi\Params\RqLowWatermark, ParamDesc, 0, "Receive Queue Low Watermark"
+HKR, Ndi\Params\RqLowWatermark, Type, 0, "dword"
+HKR, Ndi\Params\RqLowWatermark, Default, 0, "4"
+HKR, Ndi\Params\RqLowWatermark, Optional, 0, "1"
+HKR, Ndi\Params\RqLowWatermark, Min, 0, "2"
+HKR, Ndi\Params\RqLowWatermark, Max, 0, "8"
+HKR, Ndi\Params\RqLowWatermark, Step, 0, "1"
+
HKR, Ndi\Params\SqDepth, ParamDesc, 0, "Send Queue Depth"
HKR, Ndi\Params\SqDepth, Type, 0, "dword"
HKR, Ndi\Params\SqDepth, Default, 0, "128"
Index: ulp/ipoib/kernel/ipoib_port.c
===================================================================
--- ulp/ipoib/kernel/ipoib_port.c (revision 358)
+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
@@ -218,7 +218,7 @@
OUT ipoib_endpt_t** const pp_src,
OUT ipoib_endpt_t** const pp_dst );
-static uint32_t
+static int32_t
__recv_mgr_filter(
IN ipoib_port_t* const p_port,
IN ib_wc_t* const p_done_wc_list,
@@ -258,7 +258,9 @@
static uint32_t
__recv_mgr_build_pkt_array(
IN ipoib_port_t* const p_port,
- OUT cl_qlist_t* const p_done_list );
+ IN int32_t shortage,
+ OUT cl_qlist_t* const p_done_list,
+ OUT int32_t* const p_discarded );
/******************************************************************************
*
@@ -1235,7 +1237,6 @@
NdisChainBufferAtFront( p_packet, p_buffer );
NDIS_SET_PACKET_HEADER_SIZE( p_packet, sizeof(eth_hdr_t) );
- NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );
IPOIB_EXIT( IPOIB_DBG_RECV );
return p_packet;
@@ -1299,8 +1300,13 @@
}
-/* Posts receive buffers to the receive queue. */
-static ib_api_status_t
+/*
+ * Posts receive buffers to the receive queue and returns the number
+ * of receives needed to bring the RQ to its low water mark. Note
+ * that the value is signed, and can go negative. All tests must
+ * be for > 0.
+ */
+static int32_t
__recv_mgr_repost(
IN ipoib_port_t* const p_port )
{
@@ -1319,7 +1325,7 @@
cl_obj_unlock( &p_port->obj );
IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
("Port in invalid state. Not reposting.\n") );
- return IB_SUCCESS;
+ return 0;
}
cl_obj_ref( &p_port->obj );
cl_obj_unlock( &p_port->obj );
@@ -1349,45 +1355,37 @@
p_head = p_next;
- cl_atomic_inc( &p_port->recv_mgr.depth );
+ p_port->recv_mgr.depth++;
}
- if( !p_head )
+ if( p_head )
{
- cl_obj_deref( &p_port->obj );
- IPOIB_EXIT( IPOIB_DBG_RECV );
- return IB_SUCCESS;
- }
+ cl_perf_start( PostRecv );
+ status = p_port->p_adapter->p_ifc->post_recv(
+ p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );
+ cl_perf_stop( &p_port->p_adapter->perf, PostRecv );
- cl_perf_start( PostRecv );
- status = p_port->p_adapter->p_ifc->post_recv(
- p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );
- cl_perf_stop( &p_port->p_adapter->perf, PostRecv );
-
- /*
- * If we failed, fix up the work completion list and return those
- * buffers to the pool
- */
- if( status != IB_SUCCESS )
- {
- IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
- ("ip_post_recv returned %s\n",
- p_port->p_adapter->p_ifc->get_err_str( status )) );
- /* return the descriptors to the pool */
- while( p_failed )
+ if( status != IB_SUCCESS )
{
- p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );
- p_failed = p_failed->p_next;
+ IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+ ("ip_post_recv returned %s\n",
+ p_port->p_adapter->p_ifc->get_err_str( status )) );
+ /* return the descriptors to the pool */
+ while( p_failed )
+ {
+ p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );
+ p_failed = p_failed->p_next;
- __buf_mgr_put_recv( p_port, p_head, NULL );
- cl_atomic_dec( &p_port->recv_mgr.depth );
+ __buf_mgr_put_recv( p_port, p_head, NULL );
+ p_port->recv_mgr.depth--;
+ }
}
}
cl_obj_deref( &p_port->obj );
IPOIB_EXIT( IPOIB_DBG_RECV );
- return status;
+ return p_port->p_adapter->params.rq_low_watermark - p_port->recv_mgr.depth;
}
@@ -1400,6 +1398,7 @@
ipoib_port_t *p_port;
ipoib_recv_desc_t *p_desc;
ib_api_status_t status = IB_NOT_DONE;
+ int32_t shortage;
PERF_DECLARE( ReturnPacket );
PERF_DECLARE( ReturnPutRecv );
PERF_DECLARE( ReturnRepostRecv );
@@ -1413,30 +1412,25 @@
cl_perf_start( ReturnPacket );
- /* Get the manager and descriptor from the packet. */
+ /* Get the port and descriptor from the packet. */
p_port = IPOIB_PORT_FROM_PACKET( p_packet );
p_desc = IPOIB_RECV_FROM_PACKET( p_packet );
cl_spinlock_acquire( &p_port->recv_lock );
+
cl_perf_start( ReturnPutRecv );
__buf_mgr_put_recv( p_port, p_desc, p_packet );
cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );
/* Repost buffers. */
cl_perf_start( ReturnRepostRecv );
- __recv_mgr_repost( p_port );
+ shortage = __recv_mgr_repost( p_port );
cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );
- /* Complete any additional receives waiting for a packet. */
- p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
- do
+ for( p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
+ p_item != cl_qlist_end( &p_port->recv_mgr.done_list );
+ p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list ) )
{
- if( p_item == cl_qlist_end( &p_port->recv_mgr.done_list ) )
- {
- cl_spinlock_release( &p_port->recv_lock );
- break;
- }
-
p_desc = (ipoib_recv_desc_t*)p_item;
cl_perf_start( ReturnPreparePkt );
@@ -1444,11 +1438,29 @@
cl_perf_stop( &p_port->p_adapter->perf, ReturnPreparePkt );
if( status == IB_SUCCESS )
{
+ if( shortage > 0 )
+ NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_RESOURCES );
+ else
+ NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );
+
cl_spinlock_release( &p_port->recv_lock );
cl_perf_start( ReturnNdisIndicate );
NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,
&p_packet, 1 );
cl_perf_stop( &p_port->p_adapter->perf, ReturnNdisIndicate );
+ cl_spinlock_acquire( &p_port->recv_lock );
+
+ if( shortage > 0 )
+ {
+ cl_perf_start( ReturnPutRecv );
+ __buf_mgr_put_recv( p_port, p_desc, p_packet );
+ cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );
+
+ /* Repost buffers. */
+ cl_perf_start( ReturnRepostRecv );
+ shortage = __recv_mgr_repost( p_port );
+ cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );
+ }
}
else if( status != IB_NOT_DONE )
{
@@ -1457,14 +1469,10 @@
p_port->p_adapter->p_ifc->get_err_str( status )) );
/* Return the item to the head of the list. */
cl_qlist_insert_head( &p_port->recv_mgr.done_list, p_item );
- cl_spinlock_release( &p_port->recv_lock );
+ break;
}
- else
- {
- p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
- }
-
- } while( status == IB_NOT_DONE );
+ }
+ cl_spinlock_release( &p_port->recv_lock );
cl_perf_stop( &p_port->p_adapter->perf, ReturnPacket );
IPOIB_EXIT( IPOIB_DBG_RECV );
@@ -1479,7 +1487,7 @@
ipoib_port_t *p_port;
ib_api_status_t status;
ib_wc_t wc[MAX_RECV_WC], *p_free, *p_wc;
- uint32_t pkt_cnt, recv_cnt = 0;
+ int32_t pkt_cnt, recv_cnt = 0, shortage, discarded;
cl_qlist_t done_list, bad_list;
size_t i;
PERF_DECLARE( RecvCompBundle );
@@ -1540,35 +1548,78 @@
/* We're done looking at the endpoint map, release the reference. */
cl_atomic_dec( &p_port->endpt_rdr );
+ cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );
+
+ cl_spinlock_acquire( &p_port->recv_lock );
+
/* Update our posted depth. */
- cl_atomic_sub( &p_port->recv_mgr.depth, recv_cnt );
+ p_port->recv_mgr.depth -= recv_cnt;
- cl_perf_start( BuildPktArray );
- /* Notify NDIS of any and all possible receive buffers. */
- pkt_cnt = __recv_mgr_build_pkt_array( p_port, &done_list );
- cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );
+ /* Return any discarded receives to the pool */
+ cl_perf_start( PutRecvList );
+ __buf_mgr_put_recv_list( p_port, &bad_list );
+ cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );
- /* Only indicate receives if we actually had any. */
- if( pkt_cnt )
+ do
{
+ /* Repost ASAP so we don't starve the RQ. */
+ cl_perf_start( RepostRecv );
+ shortage = __recv_mgr_repost( p_port );
+ cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+
+ cl_perf_start( BuildPktArray );
+ /* Notify NDIS of any and all possible receive buffers. */
+ pkt_cnt = __recv_mgr_build_pkt_array(
+ p_port, shortage, &done_list, &discarded );
+ cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );
+
+ /* Only indicate receives if we actually had any. */
+ if( discarded && shortage > 0 )
+ {
+ /* We may have thrown away packets, and have a shortage */
+ cl_perf_start( RepostRecv );
+ __recv_mgr_repost( p_port );
+ cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+ }
+
+ if( !pkt_cnt )
+ break;
+
+ cl_spinlock_release( &p_port->recv_lock );
+
cl_perf_start( RecvNdisIndicate );
NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,
p_port->recv_mgr.recv_pkt_array, pkt_cnt );
cl_perf_stop( &p_port->p_adapter->perf, RecvNdisIndicate );
- }
- cl_spinlock_acquire( &p_port->recv_lock );
+ /*
+ * Cap the number of receives to put back to what we just indicated
+ * with NDIS_STATUS_RESOURCES.
+ */
+ if( shortage > 0 )
+ {
+ if( pkt_cnt < shortage )
+ shortage = pkt_cnt;
- /* Return any discarded receives to the pool */
- cl_perf_start( PutRecvList );
- __buf_mgr_put_recv_list( p_port, &bad_list );
- cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );
+ /* Return all but the last packet to the pool. */
+ cl_spinlock_acquire( &p_port->recv_lock );
+ while( shortage-- > 1 )
+ {
+ __buf_mgr_put_recv( p_port,
+ IPOIB_RECV_FROM_PACKET( p_port->recv_mgr.recv_pkt_array[shortage] ),
+ p_port->recv_mgr.recv_pkt_array[shortage] );
+ }
+ cl_spinlock_release( &p_port->recv_lock );
- /* Repost receives. */
- cl_perf_start( RepostRecv );
- __recv_mgr_repost( p_port );
- cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+ /*
+ * Return the last packet as if NDIS returned it, so that we repost
+ * and report any other pending receives.
+ */
+ ipoib_return_packet( NULL, p_port->recv_mgr.recv_pkt_array[0] );
+ }
+ cl_spinlock_acquire( &p_port->recv_lock );
+ } while( pkt_cnt );
cl_spinlock_release( &p_port->recv_lock );
/*
@@ -1585,7 +1636,6 @@
cl_obj_deref( &p_port->obj );
cl_perf_stop( &p_port->p_adapter->perf, RecvCb );
- cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );
IPOIB_EXIT( IPOIB_DBG_RECV );
}
@@ -1718,7 +1768,7 @@
}
-static uint32_t
+static int32_t
__recv_mgr_filter(
IN ipoib_port_t* const p_port,
IN ib_wc_t* const p_done_wc_list,
@@ -1731,7 +1781,8 @@
eth_pkt_t *p_eth;
ipoib_endpt_t *p_src, *p_dst;
ib_api_status_t status;
- uint32_t len, recv_cnt = 0;
+ uint32_t len;
+ int32_t recv_cnt = 0;
PERF_DECLARE( GetRecvEndpts );
PERF_DECLARE( RecvGen );
PERF_DECLARE( RecvTcp );
@@ -2393,7 +2444,9 @@
static uint32_t
__recv_mgr_build_pkt_array(
IN ipoib_port_t* const p_port,
- OUT cl_qlist_t* const p_done_list )
+ IN int32_t shortage,
+ OUT cl_qlist_t* const p_done_list,
+ OUT int32_t* const p_discarded )
{
cl_list_item_t *p_item;
ipoib_recv_desc_t *p_desc;
@@ -2403,7 +2456,7 @@
IPOIB_ENTER( IPOIB_DBG_RECV );
- cl_spinlock_acquire( &p_port->recv_lock );
+ *p_discarded = 0;
/* Move any existing receives to the head to preserve ordering. */
cl_qlist_insert_list_head( p_done_list, &p_port->recv_mgr.done_list );
@@ -2419,10 +2472,24 @@
if( status == IB_SUCCESS )
{
CL_ASSERT( p_port->recv_mgr.recv_pkt_array[i] );
+ if( shortage-- > 0 )
+ {
+ NDIS_SET_PACKET_STATUS(
+ p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_RESOURCES );
+ }
+ else
+ {
+ NDIS_SET_PACKET_STATUS(
+ p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_SUCCESS );
+ }
i++;
}
- else if( status != IB_NOT_DONE )
+ else if( status == IB_NOT_DONE )
{
+ (*p_discarded)++;
+ }
+ else
+ {
IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
("__recv_mgr_prepare_pkt returned %s\n",
p_port->p_adapter->p_ifc->get_err_str( status )) );
@@ -2435,8 +2502,6 @@
p_item = cl_qlist_remove_head( p_done_list );
}
- cl_spinlock_release( &p_port->recv_lock );
-
IPOIB_EXIT( IPOIB_DBG_RECV );
return i;
}
Index: ulp/ipoib/kernel/ipoib_adapter.h
===================================================================
--- ulp/ipoib/kernel/ipoib_adapter.h (revision 358)
+++ ulp/ipoib/kernel/ipoib_adapter.h (working copy)
@@ -63,6 +63,7 @@
typedef struct _ipoib_params
{
int32_t rq_depth;
+ int32_t rq_low_watermark;
int32_t sq_depth;
boolean_t send_chksum_offload;
boolean_t wsdp_enabled;
@@ -79,6 +80,10 @@
* rq_depth
* Number of receive WQEs to allocate.
*
+* rq_low_watermark
+* Receives are indicated with NDIS_STATUS_RESOURCES when the number of
+* receives posted to the RQ falls below this value.
+*
* sq_depth
* Number of send WQEs to allocate.
*
Index: ulp/ipoib/kernel/ipoib_port.h
===================================================================
--- ulp/ipoib/kernel/ipoib_port.h (revision 358)
+++ ulp/ipoib/kernel/ipoib_port.h (working copy)
@@ -406,7 +406,7 @@
typedef struct _ipoib_recv_mgr
{
- atomic32_t depth;
+ int32_t depth;
NDIS_PACKET **recv_pkt_array;
Index: ulp/ipoib/kernel/ipoib_driver.c
===================================================================
--- ulp/ipoib/kernel/ipoib_driver.c (revision 358)
+++ ulp/ipoib/kernel/ipoib_driver.c (working copy)
@@ -429,6 +429,20 @@
}
p_adapter->params.rq_depth = p_param->ParameterData.IntegerData;
+ /* Optional: Receive queue low watermark. */
+ RtlInitUnicodeString( &keyword, L"RqLowWatermark" );
+ NdisReadConfiguration(
+ &status, &p_param, h_config, &keyword, NdisParameterInteger );
+ if( status != NDIS_STATUS_SUCCESS || !p_param->ParameterData.IntegerData )
+ {
+ p_adapter->params.rq_low_watermark = p_adapter->params.rq_depth >> 2;
+ }
+ else
+ {
+ p_adapter->params.rq_low_watermark =
+ p_adapter->params.rq_depth / p_param->ParameterData.IntegerData;
+ }
+
/* Required: Send queue depth. */
RtlInitUnicodeString( &keyword, L"SqDepth" );
NdisReadConfiguration(
Attachment: ipoib_recv_hang.patch
<http://lists.openfabrics.org/pipermail/ofw/attachments/20060522/7f0e83c2/attachment.ksh>