[ofw] RE: Is there a race in __endpt_mgr_add_local?

Tzachi Dar tzachid at mellanox.co.il
Sun Oct 26 10:01:58 PDT 2008


Actually things are somewhat more complicated, than what I thought.

There are 3 lists in our code. This are guarded by two locks
The reader lock and the object_lock.

Object_lock is being used to syncronize write to these lists.

So, after some more code reviewing, it seems that I have found the
following errors:

1) on ipoib_port_deref we have been using the object after it was
already released. (debug only)

2) On many places we have been using cl_obj_destroy( &p_port->obj ); 
Instead of ipoib_port_deref( p_port, ref_port_destroying);
To my understanding this is an error as we might be deleting something
that someone else is using (referance is not zero).

3) The function ipoib_port_flush_endpts was never used. (I want to
remove it).

4) on the function __bcast_cb call take the lock before calling
__endpt_mgr_add_loca.

5) In the function __mcast_cb we have been inserting to the list without
the readers lock.

6) In the function port_do_mcast_garbage the lists are being changed
wihtout the readers lock.

Attached is the first draft of the patche in order to have community
comments:

I'm still running test to see if things are improved.

And last thing stan, I don't think that it is related to your bug. (I'm
also working on looking at this bug as well).

Thanks
Tzachi

Index: ipoib_debug.h
===================================================================
--- ipoib_debug.h	(revision 3369)
+++ ipoib_debug.h	(working copy)
@@ -292,7 +292,8 @@
 	ref_mcast_av_failed = 400 + ref_join_mcast,
 	ref_mcast_join_failed = 500 + ref_join_mcast,
 
-	ref_port_info_cb = 100 + ref_port_up
+	ref_port_info_cb = 100 + ref_port_up,
+	ref_port_destroying = 200 + ref_port_up
 
 };
 
Index: ipoib_endpoint.h
===================================================================
--- ipoib_endpoint.h	(revision 3369)
+++ ipoib_endpoint.h	(working copy)
@@ -60,7 +60,7 @@
 	net16_t					dlid;
 	net32_t					qpn;
 	ib_av_handle_t			h_av;
-	boolean_t				expired;
+//	boolean_t				expired; // remove
???????
 	ib_al_ifc_t				*p_ifc;
 	boolean_t    			is_in_use;
 	boolean_t				is_mcast_listener;
@@ -140,7 +140,7 @@
 	 * or trying to send data to that endpoint.  Clear the expired
flag
 	 * to prevent the AV from being flushed.
 	 */
-	p_endpt->expired = FALSE;
+//	p_endpt->expired = FALSE;
 }
 
 
Index: ipoib_port.c
===================================================================
--- ipoib_port.c	(revision 3369)
+++ ipoib_port.c	(working copy)
@@ -495,8 +495,6 @@
 
 inline void ipoib_port_deref(ipoib_port_t * p_port, int type)
 {
-	cl_obj_deref( &p_port->obj );
-
 #if DBG
 	cl_atomic_dec( &p_port->ref[type % ref_mask] );
 	IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_OBJ,
@@ -504,6 +502,8 @@
 #else
 	UNREFERENCED_PARAMETER(type);
 #endif
+	cl_obj_deref( &p_port->obj );
+
 }
 
 /* function returns pointer to payload that is going after IP header.
@@ -575,7 +575,7 @@
 	CL_ASSERT( p_port->p_adapter );
 	CL_ASSERT( !p_port->p_adapter->p_port );
 
-	cl_obj_destroy( &p_port->obj );
+	ipoib_port_deref( p_port, ref_port_destroying);
 
 	IPOIB_EXIT( IPOIB_DBG_INIT );
 }
@@ -3954,15 +3954,15 @@
 	
 	p_desc->wr.ds_array = p_desc->local_ds;
 
-	p_desc->wr.dgrm.ud.remote_qp = p_desc->p_endpt->qpn;
+	p_desc->wr.dgrm.ud.remote_qp = p_desc->p_endpt1->qpn;
 	p_desc->wr.dgrm.ud.remote_qkey = p_port->ib_mgr.bcast_rec.qkey;
-	p_desc->wr.dgrm.ud.h_av = p_desc->p_endpt->h_av;
+	p_desc->wr.dgrm.ud.h_av = p_desc->p_endpt1->h_av;
 	p_desc->wr.dgrm.ud.pkey_index = p_port->pkey_index;
 	p_desc->wr.dgrm.ud.rsvd = NULL;
 
 	/* Store context in our reserved area of the packet. */
 	IPOIB_PORT_FROM_PACKET( p_desc->p_pkt ) = p_port;
-	IPOIB_ENDPT_FROM_PACKET( p_desc->p_pkt ) = p_desc->p_endpt;
+	IPOIB_ENDPT_FROM_PACKET( p_desc->p_pkt ) = p_desc->p_endpt1;
 	IPOIB_SEND_FROM_PACKET( p_desc->p_pkt ) = p_desc->p_buf;
 
 	IPOIB_EXIT( IPOIB_DBG_SEND );
@@ -3983,8 +3983,8 @@
 		p_desc->p_pkt, status );
 	ipoib_inc_send_stat( p_port->p_adapter, IP_STAT_ERROR, 0 );
 	/* Deref the endpoint. */
-	if( p_desc->p_endpt )
-		ipoib_endpt_deref( p_desc->p_endpt );
+	if( p_desc->p_endpt1 )
+		ipoib_endpt_deref( p_desc->p_endpt1 );
 
 	if( p_desc->p_buf )
 	{
@@ -4044,7 +4044,7 @@
 	for( i = 0; i < num_packets; i++ )
 	{
 		desc.p_pkt = p_packet_array[i];
-		desc.p_endpt = NULL;
+		desc.p_endpt1 = NULL;
 		desc.p_buf = NULL;
 
 		/* Get the ethernet header so we can find the endpoint.
*/
@@ -4107,7 +4107,7 @@
 			p_eth_hdr->dst.addr[3] = ((unsigned
char*)&p_ip_hdr->dst_ip)[1];
 		}
 h_end:
-		status = __send_mgr_queue( p_port, p_eth_hdr,
&desc.p_endpt );
+		status = __send_mgr_queue( p_port, p_eth_hdr,
&desc.p_endpt1 );
 		cl_perf_stop( &p_port->p_adapter->perf, SendMgrQueue );
 		if( status == NDIS_STATUS_PENDING )
 		{
@@ -4216,7 +4216,7 @@
 
 		desc.p_pkt = IPOIB_PACKET_FROM_LIST_ITEM(
 			cl_qlist_remove_head(
&p_port->send_mgr.pending_list ) );
-		desc.p_endpt = NULL;
+		desc.p_endpt1 = NULL;
 		desc.p_buf = NULL;
 
 		/* Get the ethernet header so we can find the endpoint.
*/
@@ -4231,10 +4231,11 @@
 		}
 
 		cl_perf_start( GetEndpt );
-		status = __endpt_mgr_ref( p_port, p_eth_hdr->dst,
&desc.p_endpt );
+		status = __endpt_mgr_ref( p_port, p_eth_hdr->dst,
&desc.p_endpt1 );
 		cl_perf_stop( &p_port->p_adapter->perf, GetEndpt );
 		if( status == NDIS_STATUS_PENDING )
 		{
+			CL_ASSERT(desc.p_endpt == NULL);
 			cl_qlist_insert_head(
&p_port->send_mgr.pending_list,
 				IPOIB_LIST_ITEM_FROM_PACKET( desc.p_pkt
) );
 			break;
@@ -4242,6 +4243,7 @@
 		else if( status != NDIS_STATUS_SUCCESS )
 		{
 			ASSERT( status ==
NDIS_STATUS_NO_ROUTE_TO_DESTINATION );
+			CL_ASSERT(desc.p_endpt == NULL);
 
 			if( ETH_IS_MULTICAST( p_eth_hdr->dst.addr ) )
 			{
@@ -4618,8 +4620,11 @@
 	/* Destroy all multicast endpoints now that we have released the
lock. */
 	while( cl_qlist_count( &mc_list ) )
 	{
-		cl_obj_destroy( &PARENT_STRUCT( cl_qlist_remove_head(
&mc_list ),
-			ipoib_endpt_t, mac_item.pool_item.list_item
)->obj );
+        cl_list_item_t	*p_item;
+        p_item = cl_qlist_remove_head( &mc_list );
+        p_endpt = PARENT_STRUCT(p_item, ipoib_endpt_t,
mac_item.pool_item.list_item);
+        ipoib_endpt_deref( p_endpt );
+
 	}
 
 	IPOIB_EXIT( IPOIB_DBG_ENDPT );
@@ -4639,8 +4644,11 @@
 {
 	IPOIB_ENTER( IPOIB_DBG_ENDPT );
 
+	/* This function must be called from the recieve path */
+	CL_ASSERT(p_port->endpt_rdr > 0);
+
 	cl_obj_lock( &p_port->obj );
-	/* Wait for all readers to complete. */
+	/* Wait for all readers to complete. */    
 	while( p_port->endpt_rdr > 1 )
 		;
 
@@ -4660,7 +4668,7 @@
 
 	cl_obj_unlock( &p_port->obj );
 
-	cl_obj_destroy( &p_endpt->obj );
+	ipoib_endpt_deref( p_endpt );
 
 	IPOIB_EXIT( IPOIB_DBG_ENDPT );
 }
@@ -4974,7 +4982,7 @@
 
 	if( cl_status != CL_SUCCESS )
 	{
-		cl_obj_destroy( &p_endpt->obj );
+		ipoib_endpt_deref(p_endpt);
 		return IB_INVALID_STATE;
 	}
 
@@ -5094,7 +5102,7 @@
 		}
 
 		cl_obj_unlock( &p_port->obj );
-		cl_obj_destroy( &p_endpt->obj );
+		ipoib_endpt_deref( p_endpt );
 #if DBG
 		cl_atomic_dec( &p_port->ref[ref_endpt_track] );
 		IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
@@ -5110,7 +5118,7 @@
 	IPOIB_EXIT( IPOIB_DBG_ENDPT );
 }
 
-
+#if 0
 void
 ipoib_port_flush_endpts(
 	IN				ipoib_port_t* const
p_port )
@@ -5153,8 +5161,8 @@
 
 	IPOIB_EXIT( IPOIB_DBG_ENDPT );
 }
+#endif
 
-
 /*
  * The sequence for port up is as follows:
  *	1. The port goes active.  This allows the adapter to send SA
queries
@@ -5741,9 +5749,16 @@
 		IPOIB_EXIT( IPOIB_DBG_INIT );
 		return;
 	}
-	cl_obj_unlock( &p_port->obj );
 	p_port->bc_join_retry_cnt = 0;
 
+	while( p_port->endpt_rdr )
+	{
+		cl_obj_unlock( &p_port->obj );
+		cl_obj_lock( &p_port->obj );
+	}
+            ;
+//	cl_obj_unlock( &p_port->obj );
+
 	tmp = cl_atomic_inc(&p_port->m_Counter);
 	if (tmp !=1) {
 		CL_ASSERT(FALSE);
@@ -5757,7 +5772,7 @@
 		cl_memclr(&port_info, sizeof(port_info));
 		port_info.base_lid = p_port->base_lid;
 		status = __endpt_mgr_add_local( p_port, &port_info );
-
+		cl_obj_unlock( &p_port->obj );
 		if( status != IB_SUCCESS )
 		{
 			IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
@@ -5768,6 +5783,10 @@
 		}
 	}
     cl_atomic_dec(&p_port->m_Counter);
+
+	cl_obj_unlock( &p_port->obj );
+
+
 	status = __endpt_mgr_add_bcast( p_port, p_mcast_rec );
 	if( status != IB_SUCCESS )
 	{
@@ -6044,10 +6063,10 @@
 			("Invalid state - Aborting.\n") );
 		return;
 	}
-	cl_obj_unlock( &p_port->obj );
 
 	if( p_mcast_rec->status != IB_SUCCESS )
 	{
+		cl_obj_unlock( &p_port->obj );
 		IPOIB_PRINT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
 			("Multicast join request failed with status
%s.\n",
 			p_port->p_adapter->p_ifc->get_err_str(
p_mcast_rec->status )) );
@@ -6058,7 +6077,6 @@
 		return;
 	}
 
-	cl_obj_lock( &p_port->obj );
 	p_item = cl_fmap_get(
 		&p_port->endpt_mgr.gid_endpts,
&p_mcast_rec->p_member_rec->mgid );
 	if( p_item == cl_fmap_end( &p_port->endpt_mgr.gid_endpts ) )
@@ -6104,6 +6122,9 @@
 	 */
 	if( p_endpt->dlid )
 	{
+		while( p_port->endpt_rdr )
+			;
+    
 		p_qitem = cl_qmap_insert(
 			&p_port->endpt_mgr.lid_endpts, p_endpt->dlid,
&p_endpt->lid_item );
 		CL_ASSERT( p_qitem == &p_endpt->lid_item );
@@ -6333,6 +6354,9 @@
 	cl_qlist_init( &destroy_mc_list );
 
 	cl_obj_lock( &p_port->obj );
+	/* Wait for all readers to finish */
+	while( p_port->endpt_rdr )
+        ;    
 	cnt = 0;
 	p_item = cl_qmap_head( &p_port->endpt_mgr.mac_endpts );
 	while( (p_item != cl_qmap_end( &p_port->endpt_mgr.mac_endpts ))
&& (cnt < GC_MAX_LEAVE_NUM))
@@ -6381,9 +6405,7 @@
 				 p_endpt->mac.addr[3],
 				 p_endpt->mac.addr[4],
 				 p_endpt->mac.addr[5]) );
-
-		cl_obj_destroy( &PARENT_STRUCT( cl_qlist_remove_head(
&destroy_mc_list ),
-			ipoib_endpt_t, mac_item.pool_item.list_item
)->obj );
+		ipoib_endpt_deref( p_endpt );
 	}
 }
 
Index: ipoib_port.h
===================================================================
--- ipoib_port.h	(revision 3369)
+++ ipoib_port.h	(working copy)
@@ -378,7 +378,7 @@
 typedef struct _ipoib_send_desc
 {
 	NDIS_PACKET			*p_pkt;
-	ipoib_endpt_t		*p_endpt;
+	ipoib_endpt_t		*p_endpt1;
 	send_buf_t			*p_buf;
 	ib_send_wr_t		wr;
 	ipoib_hdr_t			pkt_hdr;
@@ -596,10 +596,14 @@
 	IN				ipoib_port_t* const
p_port,
 	IN		const	mac_addr_t
mac );
 
+
+#if 0
 void
 ipoib_port_flush_endpts(
 	IN				ipoib_port_t* const
p_port );
+#endif
 
+
 void
 ipoib_port_send(
 	IN				ipoib_port_t* const
p_port, 

> -----Original Message-----
> From: Alex Estrin [mailto:alex.estrin at qlogic.com] 
> Sent: Friday, October 24, 2008 4:22 PM
> To: Fab Tillier; Tzachi Dar; ofw at lists.openfabrics.org
> Subject: RE: [ofw] RE: Is there a race in __endpt_mgr_add_local?
> 
> Yes, it should be guarded with port lock.
> 
> Thanks,
> Alex. 
> 
> > -----Original Message-----
> > From: ofw-bounces at lists.openfabrics.org 
> > [mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Fab Tillier
> > Sent: Thursday, October 23, 2008 6:57 PM
> > To: Tzachi Dar; ofw at lists.openfabrics.org
> > Subject: [ofw] RE: Is there a race in __endpt_mgr_add_local?
> > 
> > It probably needs a lock.  I don't have access to my dev 
> box right now 
> > so can't look for sure.
> > 
> > -Fab
> > 
> > >From: ofw-bounces at lists.openfabrics.org 
> > >[mailto:ofw-bounces at lists.openfabrics.org] On Behalf Of Tzachi Dar
> > >Sent: Thursday, October 23, 2008 3:51 PM
> > >To: Fab Tillier; ofw at lists.openfabrics.org
> > >Subject: [ofw] Is there a race in __endpt_mgr_add_local?
> > >
> > >Hi Fab,
> > >
> > >The function __endpt_mgr_add_local is being called from
> > __bcast_cb without the port lock being hold.
> > >
> > >It later calls __endpt_mgr_insert (again without the lock).
> > >
> > >I believe that this is a bug, is there anyone who disagrees?
> > >
> > >Thanks
> > >Tzachi
> > >
> > _______________________________________________
> > ofw mailing list
> > ofw at lists.openfabrics.org
> > http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
> > 
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: ipoib_lists.patch
Type: application/octet-stream
Size: 9408 bytes
Desc: ipoib_lists.patch
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20081026/f7df9cb4/attachment.obj>


More information about the ofw mailing list