[Openib-windows] Receive Queue

Fabian Tillier ftillier at silverstorm.com
Mon May 22 10:38:41 PDT 2006


Hi Guy,

On 4/27/06, Fabian Tillier <ftillier at silverstorm.com> wrote:
> Hi Guy,
>
> On 4/25/06, Guy Corem <guyc at voltaire.com> wrote:
> >
> > Hi Fab,
> >
> > Thanks for your illuminating reply. Attached is my fix, per your
> > suggestion.
>
> This is a great start.  There are some issues, though, but I will put
> something together and send it out for your review.

Sorry it's taken this long to get something to you.  Every time I
thought I was close, I found a potential problem that had to be
addressed.

Here are the changes that were made:
- If the receive queue falls below a low water mark, the driver will
report the receives with STATUS_RESOURCES.
- For any packets reported with STATUS_RESOURCES, the driver now calls
ipoib_return_packet to repost, as well as to flush any other pending
packets.  The existing code only completed a single packet during
ipoib_return_packet, rather than reporting as many as possible.
- ipoib_return_packet now also reports receives with STATUS_RESOURCES,
and loops internally to report as many queued receives as possible.
- I added a tunable to the INF that allows users to configure where
the low watermark is, as a ratio of the receive queue depth.  The
default is 4 (giving 1/4th of the queue depth), with a max of 8 (1/8th)
and a min of 2 (1/2).

Please give it a try and let me know if it works for you or not.  I'll
wait to apply until I hear back.

Thanks,

- Fab

Index: ulp/ipoib/kernel/netipoib.inf
===================================================================
--- ulp/ipoib/kernel/netipoib.inf	(revision 358)
+++ ulp/ipoib/kernel/netipoib.inf	(working copy)
@@ -68,6 +68,14 @@
 HKR, Ndi\Params\RqDepth,		Max,		0, "1024"
 HKR, Ndi\Params\RqDepth,		Step,		0, "128"

+HKR, Ndi\Params\RqLowWatermark,	ParamDesc,	0, "Receive Queue Low Watermark"
+HKR, Ndi\Params\RqLowWatermark,	Type,		0, "dword"
+HKR, Ndi\Params\RqLowWatermark,	Default,	0, "4"
+HKR, Ndi\Params\RqLowWatermark,	Optional,	0, "1"
+HKR, Ndi\Params\RqLowWatermark,	Min,		0, "2"
+HKR, Ndi\Params\RqLowWatermark,	Max,		0, "8"
+HKR, Ndi\Params\RqLowWatermark,	Step,		0, "1"
+
 HKR, Ndi\Params\SqDepth,		ParamDesc,	0, "Send Queue Depth"
 HKR, Ndi\Params\SqDepth,		Type,		0, "dword"
 HKR, Ndi\Params\SqDepth,		Default,	0, "128"
Index: ulp/ipoib/kernel/ipoib_port.c
===================================================================
--- ulp/ipoib/kernel/ipoib_port.c	(revision 358)
+++ ulp/ipoib/kernel/ipoib_port.c	(working copy)
@@ -218,7 +218,7 @@
 		OUT			ipoib_endpt_t** const		pp_src,
 		OUT			ipoib_endpt_t** const		pp_dst );

-static uint32_t
+static int32_t
 __recv_mgr_filter(
 	IN				ipoib_port_t* const			p_port,
 	IN				ib_wc_t* const				p_done_wc_list,
@@ -258,7 +258,9 @@
 static uint32_t
 __recv_mgr_build_pkt_array(
 	IN				ipoib_port_t* const			p_port,
-		OUT			cl_qlist_t* const			p_done_list );
+	IN				int32_t						shortage,
+		OUT			cl_qlist_t* const			p_done_list,
+		OUT			int32_t* const				p_discarded );

 /******************************************************************************
 *
@@ -1235,7 +1237,6 @@

 	NdisChainBufferAtFront( p_packet, p_buffer );
 	NDIS_SET_PACKET_HEADER_SIZE( p_packet, sizeof(eth_hdr_t) );
-	NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );

 	IPOIB_EXIT(  IPOIB_DBG_RECV );
 	return p_packet;
@@ -1299,8 +1300,13 @@
 }


-/* Posts receive buffers to the receive queue. */
-static ib_api_status_t
+/*
+ * Posts receive buffers to the receive queue and returns the number
+ * of receives needed to bring the RQ to its low water mark.  Note
+ * that the value is signed, and can go negative.  All tests must
+ * be for > 0.
+ */
+static int32_t
 __recv_mgr_repost(
 	IN				ipoib_port_t* const			p_port )
 {
@@ -1319,7 +1325,7 @@
 		cl_obj_unlock( &p_port->obj );
 		IPOIB_PRINT_EXIT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
 			("Port in invalid state.  Not reposting.\n") );
-		return IB_SUCCESS;
+		return 0;
 	}
 	cl_obj_ref( &p_port->obj );
 	cl_obj_unlock( &p_port->obj );
@@ -1349,45 +1355,37 @@

 		p_head = p_next;

-		cl_atomic_inc( &p_port->recv_mgr.depth );
+		p_port->recv_mgr.depth++;
 	}

-	if( !p_head )
+	if( p_head )
 	{
-		cl_obj_deref( &p_port->obj );
-		IPOIB_EXIT( IPOIB_DBG_RECV );
-		return IB_SUCCESS;
-	}
+		cl_perf_start( PostRecv );
+		status = p_port->p_adapter->p_ifc->post_recv(
+			p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );
+		cl_perf_stop( &p_port->p_adapter->perf, PostRecv );

-	cl_perf_start( PostRecv );
-	status = p_port->p_adapter->p_ifc->post_recv(
-		p_port->ib_mgr.h_qp, &p_head->wr, &p_failed );
-	cl_perf_stop( &p_port->p_adapter->perf, PostRecv );
-
-	/*
-	 * If we failed, fix up the work completion list and return those
-	 * buffers to the pool
-	 */
-	if( status != IB_SUCCESS )
-	{
-		IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
-			("ip_post_recv returned %s\n",
-			p_port->p_adapter->p_ifc->get_err_str( status )) );
-		/* return the descriptors to the pool */
-		while( p_failed )
+		if( status != IB_SUCCESS )
 		{
-			p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );
-			p_failed = p_failed->p_next;
+			IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+				("ip_post_recv returned %s\n",
+				p_port->p_adapter->p_ifc->get_err_str( status )) );
+			/* return the descriptors to the pool */
+			while( p_failed )
+			{
+				p_head = PARENT_STRUCT( p_failed, ipoib_recv_desc_t, wr );
+				p_failed = p_failed->p_next;

-			__buf_mgr_put_recv( p_port, p_head, NULL );
-			cl_atomic_dec( &p_port->recv_mgr.depth );
+				__buf_mgr_put_recv( p_port, p_head, NULL );
+				p_port->recv_mgr.depth--;
+			}
 		}
 	}

 	cl_obj_deref( &p_port->obj );

 	IPOIB_EXIT( IPOIB_DBG_RECV );
-	return status;
+	return p_port->p_adapter->params.rq_low_watermark - p_port->recv_mgr.depth;
 }


@@ -1400,6 +1398,7 @@
 	ipoib_port_t		*p_port;
 	ipoib_recv_desc_t	*p_desc;
 	ib_api_status_t		status = IB_NOT_DONE;
+	int32_t				shortage;
 	PERF_DECLARE( ReturnPacket );
 	PERF_DECLARE( ReturnPutRecv );
 	PERF_DECLARE( ReturnRepostRecv );
@@ -1413,30 +1412,25 @@

 	cl_perf_start( ReturnPacket );

-	/* Get the manager and descriptor from the packet. */
+	/* Get the port and descriptor from the packet. */
 	p_port = IPOIB_PORT_FROM_PACKET( p_packet );
 	p_desc = IPOIB_RECV_FROM_PACKET( p_packet );

 	cl_spinlock_acquire( &p_port->recv_lock );
+
 	cl_perf_start( ReturnPutRecv );
 	__buf_mgr_put_recv( p_port, p_desc, p_packet );
 	cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );

 	/* Repost buffers. */
 	cl_perf_start( ReturnRepostRecv );
-	__recv_mgr_repost( p_port );
+	shortage = __recv_mgr_repost( p_port );
 	cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );

-	/* Complete any additional receives waiting for a packet. */
-	p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
-	do
+	for( p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
+		p_item != cl_qlist_end( &p_port->recv_mgr.done_list );
+		p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list ) )
 	{
-		if( p_item == cl_qlist_end( &p_port->recv_mgr.done_list ) )
-		{
-			cl_spinlock_release( &p_port->recv_lock );
-			break;
-		}
-
 		p_desc = (ipoib_recv_desc_t*)p_item;

 		cl_perf_start( ReturnPreparePkt );
@@ -1444,11 +1438,29 @@
 		cl_perf_stop( &p_port->p_adapter->perf, ReturnPreparePkt );
 		if( status == IB_SUCCESS )
 		{
+			if( shortage > 0 )
+				NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_RESOURCES );
+			else
+				NDIS_SET_PACKET_STATUS( p_packet, NDIS_STATUS_SUCCESS );
+
 			cl_spinlock_release( &p_port->recv_lock );
 			cl_perf_start( ReturnNdisIndicate );
 			NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,
 				&p_packet, 1 );
 			cl_perf_stop( &p_port->p_adapter->perf, ReturnNdisIndicate );
+			cl_spinlock_acquire( &p_port->recv_lock );
+
+			if( shortage > 0 )
+			{
+				cl_perf_start( ReturnPutRecv );
+				__buf_mgr_put_recv( p_port, p_desc, p_packet );
+				cl_perf_stop( &p_port->p_adapter->perf, ReturnPutRecv );
+
+				/* Repost buffers. */
+				cl_perf_start( ReturnRepostRecv );
+				shortage = __recv_mgr_repost( p_port );
+				cl_perf_stop( &p_port->p_adapter->perf, ReturnRepostRecv );
+			}
 		}
 		else if( status != IB_NOT_DONE )
 		{
@@ -1457,14 +1469,10 @@
 				p_port->p_adapter->p_ifc->get_err_str( status )) );
 			/* Return the item to the head of the list. */
 			cl_qlist_insert_head( &p_port->recv_mgr.done_list, p_item );
-			cl_spinlock_release( &p_port->recv_lock );
+			break;
 		}
-		else
-		{
-			p_item = cl_qlist_remove_head( &p_port->recv_mgr.done_list );
-		}
-
-	} while( status == IB_NOT_DONE );
+	}
+	cl_spinlock_release( &p_port->recv_lock );
 	cl_perf_stop( &p_port->p_adapter->perf, ReturnPacket );

 	IPOIB_EXIT( IPOIB_DBG_RECV );
@@ -1479,7 +1487,7 @@
 	ipoib_port_t		*p_port;
 	ib_api_status_t		status;
 	ib_wc_t				wc[MAX_RECV_WC], *p_free, *p_wc;
-	uint32_t			pkt_cnt, recv_cnt = 0;
+	int32_t				pkt_cnt, recv_cnt = 0, shortage, discarded;
 	cl_qlist_t			done_list, bad_list;
 	size_t				i;
 	PERF_DECLARE( RecvCompBundle );
@@ -1540,35 +1548,78 @@
 	/* We're done looking at the endpoint map, release the reference. */
 	cl_atomic_dec( &p_port->endpt_rdr );

+	cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );
+
+	cl_spinlock_acquire( &p_port->recv_lock );
+
 	/* Update our posted depth. */
-	cl_atomic_sub( &p_port->recv_mgr.depth, recv_cnt );
+	p_port->recv_mgr.depth -= recv_cnt;

-	cl_perf_start( BuildPktArray );
-	/* Notify NDIS of any and all possible receive buffers. */
-	pkt_cnt = __recv_mgr_build_pkt_array( p_port, &done_list );
-	cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );
+	/* Return any discarded receives to the pool */
+	cl_perf_start( PutRecvList );
+	__buf_mgr_put_recv_list( p_port, &bad_list );
+	cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );

-	/* Only indicate receives if we actually had any. */
-	if( pkt_cnt )
+	do
 	{
+		/* Repost ASAP so we don't starve the RQ. */
+		cl_perf_start( RepostRecv );
+		shortage = __recv_mgr_repost( p_port );
+		cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+
+		cl_perf_start( BuildPktArray );
+		/* Notify NDIS of any and all possible receive buffers. */
+		pkt_cnt = __recv_mgr_build_pkt_array(
+			p_port, shortage, &done_list, &discarded );
+		cl_perf_stop( &p_port->p_adapter->perf, BuildPktArray );
+
+		/* Only indicate receives if we actually had any. */
+		if( discarded && shortage > 0 )
+		{
+			/* We may have thrown away packets, and have a shortage */
+			cl_perf_start( RepostRecv );
+			__recv_mgr_repost( p_port );
+			cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+		}
+
+		if( !pkt_cnt )
+			break;
+
+		cl_spinlock_release( &p_port->recv_lock );
+
 		cl_perf_start( RecvNdisIndicate );
 		NdisMIndicateReceivePacket( p_port->p_adapter->h_adapter,
 			p_port->recv_mgr.recv_pkt_array, pkt_cnt );
 		cl_perf_stop( &p_port->p_adapter->perf, RecvNdisIndicate );
-	}

-	cl_spinlock_acquire( &p_port->recv_lock );
+		/*
+		 * Cap the number of receives to put back to what we just indicated
+		 * with NDIS_STATUS_RESOURCES.
+		 */
+		if( shortage > 0 )
+		{
+			if( pkt_cnt < shortage )
+				shortage = pkt_cnt;

-	/* Return any discarded receives to the pool */
-	cl_perf_start( PutRecvList );
-	__buf_mgr_put_recv_list( p_port, &bad_list );
-	cl_perf_stop( &p_port->p_adapter->perf, PutRecvList );
+			/* Return all but the last packet to the pool. */
+			cl_spinlock_acquire( &p_port->recv_lock );
+			while( shortage-- > 1 )
+			{
+				__buf_mgr_put_recv( p_port,
+					IPOIB_RECV_FROM_PACKET( p_port->recv_mgr.recv_pkt_array[shortage] ),
+					p_port->recv_mgr.recv_pkt_array[shortage] );
+			}
+			cl_spinlock_release( &p_port->recv_lock );

-	/* Repost receives. */
-	cl_perf_start( RepostRecv );
-	__recv_mgr_repost( p_port );
-	cl_perf_stop( &p_port->p_adapter->perf, RepostRecv );
+			/*
+			 * Return the last packet as if NDIS returned it, so that we repost
+			 * and report any other pending receives.
+			 */
+			ipoib_return_packet( NULL, p_port->recv_mgr.recv_pkt_array[0] );
+		}
+		cl_spinlock_acquire( &p_port->recv_lock );

+	} while( pkt_cnt );
 	cl_spinlock_release( &p_port->recv_lock );

 	/*
@@ -1585,7 +1636,6 @@
 	cl_obj_deref( &p_port->obj );

 	cl_perf_stop( &p_port->p_adapter->perf, RecvCb );
-	cl_perf_log( &p_port->p_adapter->perf, RecvCompBundle, recv_cnt );

 	IPOIB_EXIT( IPOIB_DBG_RECV );
 }
@@ -1718,7 +1768,7 @@
 }


-static uint32_t
+static int32_t
 __recv_mgr_filter(
 	IN				ipoib_port_t* const			p_port,
 	IN				ib_wc_t* const				p_done_wc_list,
@@ -1731,7 +1781,8 @@
 	eth_pkt_t				*p_eth;
 	ipoib_endpt_t			*p_src, *p_dst;
 	ib_api_status_t			status;
-	uint32_t				len, recv_cnt = 0;
+	uint32_t				len;
+	int32_t					recv_cnt = 0;
 	PERF_DECLARE( GetRecvEndpts );
 	PERF_DECLARE( RecvGen );
 	PERF_DECLARE( RecvTcp );
@@ -2393,7 +2444,9 @@
 static uint32_t
 __recv_mgr_build_pkt_array(
 	IN				ipoib_port_t* const			p_port,
-		OUT			cl_qlist_t* const			p_done_list )
+	IN				int32_t						shortage,
+		OUT			cl_qlist_t* const			p_done_list,
+		OUT			int32_t* const				p_discarded )
 {
 	cl_list_item_t			*p_item;
 	ipoib_recv_desc_t		*p_desc;
@@ -2403,7 +2456,7 @@

 	IPOIB_ENTER( IPOIB_DBG_RECV );

-	cl_spinlock_acquire( &p_port->recv_lock );
+	*p_discarded = 0;

 	/* Move any existing receives to the head to preserve ordering. */
 	cl_qlist_insert_list_head( p_done_list, &p_port->recv_mgr.done_list );
@@ -2419,10 +2472,24 @@
 		if( status == IB_SUCCESS )
 		{
 			CL_ASSERT( p_port->recv_mgr.recv_pkt_array[i] );
+			if( shortage-- > 0 )
+			{
+				NDIS_SET_PACKET_STATUS(
+					p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_RESOURCES );
+			}
+			else
+			{
+				NDIS_SET_PACKET_STATUS(
+					p_port->recv_mgr.recv_pkt_array[i], NDIS_STATUS_SUCCESS );
+			}
 			i++;
 		}
-		else if( status != IB_NOT_DONE )
+		else if( status == IB_NOT_DONE )
 		{
+			(*p_discarded)++;
+		}
+		else
+		{
 			IPOIB_PRINT(TRACE_LEVEL_INFORMATION, IPOIB_DBG_RECV,
 				("__recv_mgr_prepare_pkt returned %s\n",
 				p_port->p_adapter->p_ifc->get_err_str( status )) );
@@ -2435,8 +2502,6 @@
 		p_item = cl_qlist_remove_head( p_done_list );
 	}

-	cl_spinlock_release( &p_port->recv_lock );
-
 	IPOIB_EXIT( IPOIB_DBG_RECV );
 	return i;
 }
Index: ulp/ipoib/kernel/ipoib_adapter.h
===================================================================
--- ulp/ipoib/kernel/ipoib_adapter.h	(revision 358)
+++ ulp/ipoib/kernel/ipoib_adapter.h	(working copy)
@@ -63,6 +63,7 @@
 typedef struct _ipoib_params
 {
 	int32_t		rq_depth;
+	int32_t		rq_low_watermark;
 	int32_t		sq_depth;
 	boolean_t	send_chksum_offload;
 	boolean_t	wsdp_enabled;
@@ -79,6 +80,10 @@
 *	rq_depth
 *		Number of receive WQEs to allocate.
 *
+*	rq_low_watermark
+*		Receives are indicated with NDIS_STATUS_RESOURCES when the number of
+*		receives posted to the RQ falls below this value.
+*
 *	sq_depth
 *		Number of send WQEs to allocate.
 *
Index: ulp/ipoib/kernel/ipoib_port.h
===================================================================
--- ulp/ipoib/kernel/ipoib_port.h	(revision 358)
+++ ulp/ipoib/kernel/ipoib_port.h	(working copy)
@@ -406,7 +406,7 @@

 typedef struct _ipoib_recv_mgr
 {
-	atomic32_t		depth;
+	int32_t			depth;

 	NDIS_PACKET		**recv_pkt_array;

Index: ulp/ipoib/kernel/ipoib_driver.c
===================================================================
--- ulp/ipoib/kernel/ipoib_driver.c	(revision 358)
+++ ulp/ipoib/kernel/ipoib_driver.c	(working copy)
@@ -429,6 +429,20 @@
 	}
 	p_adapter->params.rq_depth = p_param->ParameterData.IntegerData;

+	/* Optional: Receive queue low watermark. */
+	RtlInitUnicodeString( &keyword, L"RqLowWatermark" );
+	NdisReadConfiguration(
+		&status, &p_param, h_config, &keyword, NdisParameterInteger );
+	if( status != NDIS_STATUS_SUCCESS || !p_param->ParameterData.IntegerData )
+	{
+		p_adapter->params.rq_low_watermark = p_adapter->params.rq_depth >> 2;
+	}
+	else
+	{
+		p_adapter->params.rq_low_watermark =
+			p_adapter->params.rq_depth / p_param->ParameterData.IntegerData;
+	}
+
 	/* Required: Send queue depth. */
 	RtlInitUnicodeString( &keyword, L"SqDepth" );
 	NdisReadConfiguration(
-------------- next part --------------
An embedded and charset-unspecified text was scrubbed...
Name: ipoib_recv_hang.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20060522/7f0e83c2/attachment.ksh>


More information about the ofw mailing list