[ofw][patch] mcast garbage collector

Slava Strebkov slavas at voltaire.com
Sun Jun 29 06:52:17 PDT 2008


Hi,
Please review the mcast garbage collector, which removes old endpoints of
an IPoIB port. The rescan time can be configured from the device property
page. The garbage collector is on by default and may be disabled from the
device property page. It uses an implementation of the IGMP v2 protocol.



Index: ulp/ipoib/kernel/ipoib_adapter.h
===================================================================
--- ulp/ipoib/kernel/ipoib_adapter.h	(revision 1302)
+++ ulp/ipoib/kernel/ipoib_adapter.h	(working copy)
@@ -74,7 +74,8 @@
 	uint32_t	payload_mtu;
 	uint32_t	xfer_block_size;
 	mac_addr_t	conf_mac;
-
+    boolean_t	mc_garbage_collector;
+	uint32_t	mc_leave_rescan;
 }	ipoib_params_t;
 /*
 * FIELDS
Index: ulp/ipoib/kernel/ipoib_driver.c
===================================================================
--- ulp/ipoib/kernel/ipoib_driver.c	(revision 1302)
+++ ulp/ipoib/kernel/ipoib_driver.c	(working copy)
@@ -526,6 +526,29 @@
 	}
 	p_adapter->params.recv_pool_ratio =
p_param->ParameterData.IntegerData;
 
+    /* Required: MC garbage collector. */
+    RtlInitUnicodeString( &keyword, L"MCGarbageCollector" );
+    NdisReadConfiguration(
+        &status, &p_param, h_config, &keyword, NdisParameterInteger );
+    if( status != NDIS_STATUS_SUCCESS )
+    {
+        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+            ("MC garbage collector parameter is missing.\n") );
+        return status;
+    }
+    p_adapter->params.mc_garbage_collector =
(p_param->ParameterData.IntegerData != 0);
+	/* Optional: MC leave rescan (sec) for the MC garbage collector thread. */
+	RtlInitUnicodeString( &keyword, L"MCLeaveRescan" );
+	NdisReadConfiguration(
+		&status, &p_param, h_config, &keyword,
NdisParameterInteger );
+	if( status != NDIS_STATUS_SUCCESS )
+	{
+		p_adapter->params.mc_leave_rescan = 260;
+	}
+	else
+	{
+		p_adapter->params.mc_leave_rescan =
p_param->ParameterData.IntegerData;
+	}
 	/* required: MTU size. */
 	RtlInitUnicodeString( &keyword, L"PayloadMtu" );
 	NdisReadConfiguration(
Index: ulp/ipoib/kernel/ipoib_endpoint.h
===================================================================
--- ulp/ipoib/kernel/ipoib_endpoint.h	(revision 1302)
+++ ulp/ipoib/kernel/ipoib_endpoint.h	(working copy)
@@ -61,7 +61,9 @@
 TO_LONG_PTR(	ib_av_handle_t ,			h_av) ; 
 	boolean_t				expired;
 	ib_al_ifc_t				*p_ifc;
-
+	boolean_t    			is_in_use;
+    boolean_t				is_gc_in_use;
+	boolean_t				is_mcast_listener;
 }	ipoib_endpt_t;
 /*
 * FIELDS
Index: ulp/ipoib/kernel/ipoib_port.c
===================================================================
--- ulp/ipoib/kernel/ipoib_port.c	(revision 1302)
+++ ulp/ipoib/kernel/ipoib_port.c	(working copy)
@@ -66,7 +66,7 @@
 ipoib_port_t	*gp_ipoib_port;
 #endif
 
-
+static void __port_do_mcast_garbage(ipoib_port_t *p_port);
 
/***********************************************************************
*******
 *
 * Declarations
@@ -94,6 +94,8 @@
 __port_free(
 	IN				cl_obj_t* const
p_obj );
 
+static void CL_API __port_mcast_garbage_collector
+	(IN				void*
context );
 
 
/***********************************************************************
*******
 *
@@ -290,6 +292,14 @@
 	IN	OUT			ipoib_send_desc_t* const
p_desc );
 
 static NDIS_STATUS
+__send_mgr_filter_igmp_v2(
+	IN				ipoib_port_t* const
p_port,
+    IN		const	ip_hdr_t* const
p_ip_hdr,
+	IN				size_t
iph_options_size,
+	IN				NDIS_BUFFER*
p_buf,
+	IN				size_t
buf_len );
+
+static NDIS_STATUS
 __send_mgr_filter_udp(
 	IN				ipoib_port_t* const
p_port,
 	IN		const	ip_hdr_t* const
p_ip_hdr,
@@ -489,6 +499,13 @@
 #endif
 }
 
+/*
+ * Returns a pointer to the first byte that follows the IP header.
+ * The header length is read from the low nibble of ver_hl and is counted
+ * in 4-byte units, so any IP options are skipped as well.
+ * Assumes that the payload and the IP header are in the same buffer.
+ */
+static void* GetIpPayloadPtr(const	ip_hdr_t* const	p_ip_hdr)
+{
+	uint8_t		*p_hdr_start = (uint8_t*)p_ip_hdr;
+	int			hdr_len_bytes = 4 * (p_ip_hdr->ver_hl & 0xf);
+
+	return (void*)(p_hdr_start + hdr_len_bytes);
+}
 
 
/***********************************************************************
*******
 *
@@ -579,6 +596,11 @@
 	KeInitializeEvent( &p_port->sa_event, NotificationEvent, TRUE );
 	KeInitializeEvent( &p_port->leave_mcast_event,
NotificationEvent, TRUE );
 	
+	p_port->mcast_event_init = FALSE;
+	cl_event_construct(&p_port->mcast_event);
+
+	p_port->mcast_thread_init = FALSE;
+	cl_thread_construct(&p_port->mcast_thread);
 	IPOIB_EXIT( IPOIB_DBG_INIT );
 }
 
@@ -653,6 +675,12 @@
 		return status;
 	}
 
+    if (p_port->p_adapter->params.mc_garbage_collector) 
+    {
+        /* Initialize multicast garbage collector event */
+		KeInitializeEvent( &p_port->mcast_event,
NotificationEvent, FALSE );
+        p_port->mcast_event_init = TRUE;
+	}
 	/* We only ever destroy from the PnP callback thread. */
 	cl_status = cl_obj_init( &p_port->obj, CL_DESTROY_SYNC,
 		__port_destroying, __port_cleanup, __port_free );
@@ -746,7 +774,19 @@
 	CL_ASSERT( p_obj );
 
 	p_port = PARENT_STRUCT( p_obj, ipoib_port_t, obj );
+    if (p_port->p_adapter->params.mc_garbage_collector) 
+    {
+        /* Destroy multicast garbage collector thread */
 
+        if(p_port->mcast_thread_init)
+        {
+            CL_ASSERT(p_port->mcast_event_init);
+			KeSetEvent( &p_port->mcast_event, 0, FALSE );

+            cl_thread_destroy(&p_port->mcast_thread);
+            p_port->mcast_thread_init = FALSE;
+        }
+    }
+
 	__endpt_mgr_destroy( p_port );
 	__recv_mgr_destroy( p_port );
 	__send_mgr_destroy( p_port );
@@ -2131,6 +2171,11 @@
 	p_eth->hdr.src = p_src->mac;
 	p_eth->hdr.dst = p_dst->mac;
 
+    if ( p_dst->is_gc_in_use) 
+	{
+		CL_ASSERT(p_dst->h_mcast != NULL);
+        p_dst->is_in_use = TRUE;
+	}
 	IPOIB_EXIT( IPOIB_DBG_RECV );
 	return IB_SUCCESS;
 }
@@ -3129,7 +3174,132 @@
 	return status;
 }
 
+/*
+ * Inspects an outgoing IGMPv2 message and updates the listener state of the
+ * matching multicast endpoint, so that the multicast garbage collector knows
+ * whether the group still has a local listener.
+ *
+ * PARAMETERS
+ *	p_port
+ *		Port the packet is being sent on.
+ *	p_ip_hdr
+ *		IP header of the packet.
+ *	iph_options_size
+ *		Number of IP option bytes that still have to be skipped in the
+ *		buffers chained after p_buf. Only used when buf_len is zero.
+ *	p_buf
+ *		NDIS buffer the walk starts from.
+ *	buf_len
+ *		Zero when the IGMP header must be fetched from a following buffer;
+ *		non-zero when it is in the same buffer as the IP header.
+ *		NOTE(review): presumably the number of bytes left in p_buf after
+ *		the IP header - confirm against the caller.
+ *
+ * RETURN VALUES
+ *	NDIS_STATUS_FAILURE if a buffer could not be obtained or mapped.
+ *	NDIS_STATUS_BUFFER_TOO_SHORT if fewer than sizeof(igmp_v2_hdr_t) bytes
+ *	are available.
+ *	NDIS_STATUS_SUCCESS otherwise, including for unknown IGMP types.
+ */
+static NDIS_STATUS
+__send_mgr_filter_igmp_v2(
+	IN				ipoib_port_t* const			p_port,
+	IN		const	ip_hdr_t* const				p_ip_hdr,
+	IN				size_t						iph_options_size,
+	IN				NDIS_BUFFER*				p_buf,
+	IN				size_t						buf_len )
+{
+	igmp_v2_hdr_t		*p_igmp_v2_hdr = NULL;
+	NDIS_STATUS			endpt_status;
+	ipoib_endpt_t* 		p_endpt = NULL;
+	mac_addr_t			fake_mcast_mac;
+	unsigned char		*p_group;
+	boolean_t			hdr_found = FALSE;

+	IPOIB_ENTER( IPOIB_DBG_SEND );
+
+	if( !buf_len )
+	{
+		/* To get to the IGMP header, skip the IP options (if any). */
+		while( iph_options_size )
+		{
+			NdisGetNextBuffer( p_buf, &p_buf );
+			if( !p_buf )
+			{
+				IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+					("Failed to get IGMPv2 header buffer.\n") );
+				return NDIS_STATUS_FAILURE;
+			}
+			NdisQueryBufferSafe( p_buf, &p_igmp_v2_hdr, &buf_len,
+				NormalPagePriority );
+			if( !p_igmp_v2_hdr )
+			{
+				IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+					("Failed to query IGMPv2 header buffer.\n") );
+				return NDIS_STATUS_FAILURE;
+			}
+
+			if( buf_len > iph_options_size )
+			{
+				/*
+				 * The options end inside this buffer, so the IGMP header
+				 * starts right behind them. Subtracting buf_len here would
+				 * wrap the unsigned counter and make the loop run off the
+				 * end of the buffer chain.
+				 */
+				p_igmp_v2_hdr = (igmp_v2_hdr_t*)
+					((uint8_t*)p_igmp_v2_hdr + iph_options_size);
+				buf_len -= iph_options_size;
+				hdr_found = TRUE;
+				break;
+			}
+
+			iph_options_size -= buf_len;
+		}
+
+		if( !hdr_found )
+		{
+			/* The IGMP header starts in the buffer after the options. */
+			NdisGetNextBuffer( p_buf, &p_buf );
+			if( !p_buf )
+			{
+				IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+					("Failed to get IGMPv2 header buffer.\n") );
+				return NDIS_STATUS_FAILURE;
+			}
+			NdisQueryBufferSafe( p_buf, &p_igmp_v2_hdr, &buf_len,
+				NormalPagePriority );
+			if( !p_igmp_v2_hdr )
+			{
+				IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+					("Failed to query IGMPv2 header buffer.\n") );
+				return NDIS_STATUS_FAILURE;
+			}
+		}
+	}
+	else
+	{
+		/* Assuming the IP header, its options and the payload share a buffer. */
+		p_igmp_v2_hdr = GetIpPayloadPtr(p_ip_hdr);
+	}
+
+	/* Make sure a complete IGMPv2 header is available. */
+	if( buf_len < sizeof(igmp_v2_hdr_t) )
+	{
+		IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+			("Buffer not large enough for IGMPv2 packet.\n") );
+		return NDIS_STATUS_BUFFER_TOO_SHORT;
+	}
+
+	/* Build the fake MAC of the endpoint from the IGMP group address. */
+	p_group = (unsigned char*)&p_igmp_v2_hdr->group_address;
+	fake_mcast_mac.addr[0] = 1;
+	fake_mcast_mac.addr[1] = p_group[0] & 0x0f;
+	fake_mcast_mac.addr[2] = 0x5E;
+	fake_mcast_mac.addr[3] = p_group[1];
+	fake_mcast_mac.addr[4] = p_group[2];
+	fake_mcast_mac.addr[5] = p_group[3];
+
+	switch( p_igmp_v2_hdr->type )
+	{
+	case IGMP_V2_MEMBERSHIP_REPORT:
+		/*
+		 * Somebody opened a listener on this group. Mark the endpoint as a
+		 * multicast listener so the garbage collector will not delete it.
+		 */
+		IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
+			("Catched IGMP_V2_MEMBERSHIP_REPORT message\n") );
+		endpt_status = __endpt_mgr_ref( p_port, fake_mcast_mac, &p_endpt );
+		if( p_endpt )
+		{
+			cl_obj_lock( &p_port->obj );
+			p_endpt->is_mcast_listener = TRUE;
+			cl_obj_unlock( &p_port->obj );
+			ipoib_endpt_deref( p_endpt );
+		}
+		break;
+
+	case IGMP_V2_LEAVE_GROUP:
+		/*
+		 * Somebody closed the listener on this group. Clear the listener
+		 * and in-use marks so the garbage collector may delete the
+		 * endpoint, then run a collection pass right away.
+		 */
+		IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
+			     ("Catched IGMP_V2_LEAVE_GROUP message\n") );
+		endpt_status = __endpt_mgr_ref( p_port, fake_mcast_mac, &p_endpt );
+		if( p_endpt )
+		{
+			cl_obj_lock( &p_port->obj );
+			p_endpt->is_mcast_listener = FALSE;
+			p_endpt->is_in_use = FALSE;
+			cl_obj_unlock( &p_port->obj );
+			ipoib_endpt_deref( p_endpt );
+		}
+
+		__port_do_mcast_garbage(p_port);
+
+		break;
+
+	default:
+		IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_MCAST,
+			     ("Send Unknown IGMP message: 0x%x \n", p_igmp_v2_hdr->type ) );
+		break;
+	}
+
+	IPOIB_EXIT( IPOIB_DBG_SEND );
+	return NDIS_STATUS_SUCCESS;
+}
+
 static NDIS_STATUS
 __send_mgr_filter_udp(
 	IN				ipoib_port_t* const
p_port,
@@ -3577,7 +3747,18 @@
 			return NDIS_STATUS_PENDING;
 		}
 	}
+	else if ( p_port->p_adapter->params.mc_garbage_collector &&
+		      status == NDIS_STATUS_SUCCESS && 
+			  ETH_IS_MULTICAST( p_eth_hdr->dst.addr ) &&  
+			  !ETH_IS_BROADCAST( p_eth_hdr->dst.addr ) )
+	{
+		CL_ASSERT( (*pp_endpt) );
+		CL_ASSERT((*pp_endpt)->h_mcast != NULL);
+		CL_ASSERT((*pp_endpt)->is_gc_in_use);
 
+		(*pp_endpt)->is_in_use = TRUE;
+	}
+
 	IPOIB_EXIT( IPOIB_DBG_SEND );
 	return status;
 }
@@ -4706,6 +4887,8 @@
 	ib_query_req_t			query;
 	ib_user_query_t			info;
 	ib_portinfo_record_t	port_rec;
+    cl_status_t             cl_status;
+    BOOLEAN                 success = TRUE;
 
 	IPOIB_ENTER( IPOIB_DBG_INIT );
 
@@ -4744,15 +4927,41 @@
 		p_port->p_adapter->h_al, &query, &p_port->ib_mgr.h_query
);
 	if( status != IB_SUCCESS )
 	{
-		KeSetEvent( &p_port->sa_event, EVENT_INCREMENT, FALSE );
-		ipoib_set_inactive( p_port->p_adapter );
-		ipoib_port_deref( p_port, ref_port_up );
+            success = FALSE;
 		IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
 			("ib_query returned %s\n", 
 			p_port->p_adapter->p_ifc->get_err_str( status ))
);
-		return;
-	}
+        goto port_up_end;
+        }
 
+	/* garbage collector is needed when link is active */
+        if (p_port->p_adapter->params.mc_garbage_collector) 
+        {
+            CL_ASSERT(p_port->mcast_event_init);
+		KeClearEvent( &p_port->mcast_event );
+
+            cl_status = cl_thread_init(
+                &p_port->mcast_thread, 
+                __port_mcast_garbage_collector, 
+                p_port, 
+                "mcast_garbage");
+            if( cl_status != CL_SUCCESS )
+            {
+                success = FALSE;
+                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
+                    ("cl_thread_init returned %#x\n", cl_status) );
+            goto port_up_end;
+            }
+            p_port->mcast_thread_init = TRUE;
+    }
+port_up_end:
+        if (!success) 
+        {
+            KeSetEvent( &p_port->sa_event, EVENT_INCREMENT, FALSE );
+            ipoib_set_inactive( p_port->p_adapter );
+            ipoib_port_deref( p_port, ref_port_up );
+        }
+
 	IPOIB_EXIT( IPOIB_DBG_INIT );
 }
 
@@ -5229,6 +5438,16 @@
 		return;
 	}
 
+    /* Destroy multicast garbage collector thread */
+	/* garbage collector is not needed when link is down */
+    if(p_port->p_adapter->params.mc_garbage_collector &&
p_port->mcast_thread_init)
+    {
+        CL_ASSERT(p_port->mcast_event_init);
+		KeSetEvent( &p_port->mcast_event, 0, FALSE );
+
+        cl_thread_destroy(&p_port->mcast_thread);
+        p_port->mcast_thread_init = FALSE;
+    }
 	KeResetEvent(&p_port->leave_mcast_event);
 
 	/* Reset all endpoints so we don't flush our ARP cache. */
@@ -5656,6 +5875,17 @@
 			&p_port->endpt_mgr.lid_endpts, p_endpt->dlid,
&p_endpt->lid_item );
 		CL_ASSERT( p_qitem == &p_endpt->lid_item );
 	}
+    /* Add the endpoint to the multicast endpoints list */
+	if ( p_port->p_adapter->params.mc_garbage_collector )
+	{
+		/* set flag that garbage collector is enabled */
+		p_endpt->is_gc_in_use = TRUE;
+		p_endpt->is_in_use = TRUE;
+	}
+	else
+	{
+		p_endpt->is_gc_in_use = FALSE;
+	}
 	cl_obj_unlock( &p_port->obj );
 	
 	/* Try to send all pending sends. */
@@ -5712,6 +5942,93 @@
 	IPOIB_EXIT( IPOIB_DBG_MCAST );
 }
 
+/*
+ * Runs one pass of the multicast garbage collector over the port's
+ * MAC endpoint map.
+ *
+ * An endpoint is collected when it has a multicast handle, is tracked by the
+ * garbage collector, is not marked as a multicast listener, is not the
+ * default multicast group and was not marked in use since the previous pass.
+ * Every endpoint that is not collected gets its in-use mark cleared, so it
+ * must be marked again before the next pass to survive it.
+ *
+ * At most GC_MAX_LEAVE_NUM endpoints are collected per pass. They are
+ * unlinked from the maps under the port lock and destroyed after the lock
+ * has been released.
+ */
+static void __port_do_mcast_garbage(ipoib_port_t *p_port)
+{
+	/* MAC 01:00:5e:00:00:01 is never collected. */
+	const mac_addr_t DEFAULT_MCAST_GROUP = {0x01, 0x00, 0x5e, 0x00, 0x00, 0x01};

+	cl_map_item_t	*p_item;
+	ipoib_endpt_t	*p_endpt;
+	cl_qlist_t		destroy_mc_list;
+	uint8_t			cnt;
+	/* Explicit type: implicit int is not valid C since C99. */
+	static const uint8_t	GC_MAX_LEAVE_NUM = 80;

+	cl_qlist_init( &destroy_mc_list );

+	cl_obj_lock( &p_port->obj );
+	cnt = 0;
+	p_item = cl_qmap_head( &p_port->endpt_mgr.mac_endpts );
+	while( (p_item != cl_qmap_end( &p_port->endpt_mgr.mac_endpts )) &&
+		(cnt < GC_MAX_LEAVE_NUM) )
+	{
+		p_endpt = PARENT_STRUCT( p_item, ipoib_endpt_t, mac_item );
+		/* Advance first: the current item may be removed from the map. */
+		p_item = cl_qmap_next( p_item );
+
+		/* Collect idle multicast endpoints that have no local listener. */
+		if( p_endpt->h_mcast &&
+			p_endpt->is_gc_in_use &&
+			(!p_endpt->is_mcast_listener) &&
+			( cl_memcmp( &p_endpt->mac, &DEFAULT_MCAST_GROUP,
+				sizeof(mac_addr_t) ) &&
+			 (!p_endpt->is_in_use) ))
+		{
+			cl_qmap_remove_item( &p_port->endpt_mgr.mac_endpts,
+				&p_endpt->mac_item );
+			cl_fmap_remove_item( &p_port->endpt_mgr.gid_endpts,
+				&p_endpt->gid_item );
+
+			if( p_endpt->dlid )
+			{
+				cl_qmap_remove_item( &p_port->endpt_mgr.lid_endpts,
+					&p_endpt->lid_item );
+				p_endpt->dlid = 0;
+			}
+
+			/* The map item is free now; reuse it to queue the endpoint. */
+			cl_qlist_insert_tail(
+				&destroy_mc_list, &p_endpt->mac_item.pool_item.list_item );
+			cnt++;
+		}
+		else
+		{
+			/* Survivor: it has to be marked in use again before next pass. */
+			p_endpt->is_in_use = FALSE;
+		}
+	}
+	cl_obj_unlock( &p_port->obj );
+
+	/* Destroy all multicast endpoints now that we have released the lock. */
+	while( cl_qlist_count( &destroy_mc_list ) )
+	{
+		p_endpt = PARENT_STRUCT( cl_qlist_remove_head( &destroy_mc_list ),
+			ipoib_endpt_t, mac_item.pool_item.list_item );
+		IPOIB_PRINT( TRACE_LEVEL_INFORMATION, IPOIB_DBG_ENDPT,
+			("mcast garbage collector: destroying endpoint %02x:%02x:%02x:%02x:%02x:%02x \n",
+				 p_endpt->mac.addr[0],
+				 p_endpt->mac.addr[1],
+				 p_endpt->mac.addr[2],
+				 p_endpt->mac.addr[3],
+				 p_endpt->mac.addr[4],
+				 p_endpt->mac.addr[5]) );
+
+		cl_obj_destroy( &p_endpt->obj );
+	}
+}
+
+/*
+ * Thread routine of the multicast garbage collector.
+ *
+ * Sleeps on the port's mcast_event with a relative timeout of
+ * mc_leave_rescan seconds. Each wait that ends with anything other than
+ * STATUS_SUCCESS triggers one garbage collection pass; the routine returns
+ * once the wait reports STATUS_SUCCESS, i.e. the event was signalled.
+ *
+ * context is the ipoib_port_t the thread was started for.
+ */
+static void CL_API __port_mcast_garbage_collector
+	(IN				void*				context )
+{
+	ipoib_port_t	*p_port = context;
+	LARGE_INTEGER	rescan_period;
+	cl_status_t		wait_status;
+
+	IPOIB_ENTER( IPOIB_DBG_ENDPT );
+
+	CL_ASSERT( p_port->p_adapter->params.mc_garbage_collector );
+
+	/* Negative value = relative timeout, seconds scaled to 100 ns units. */
+	rescan_period.QuadPart =
+		-(int64_t)(((uint64_t)p_port->p_adapter->params.mc_leave_rescan *
+		1000000) * 10);
+
+	/*
+	 * NOTE(review): the result of this first wait is not examined, so the
+	 * first collection pass runs only after two rescan periods - confirm
+	 * that this extra delay is intended.
+	 */
+	wait_status = KeWaitForSingleObject( &p_port->mcast_event, Executive,
+		KernelMode, FALSE, &rescan_period );
+
+	for( ;; )
+	{
+		wait_status = KeWaitForSingleObject( &p_port->mcast_event, Executive,
+			KernelMode, FALSE, &rescan_period );
+		if( wait_status == STATUS_SUCCESS )
+		{
+			/* Event signalled: stop collecting. */
+			break;
+		}
+
+		__port_do_mcast_garbage(p_port);
+	}
+
+	IPOIB_EXIT( IPOIB_DBG_ENDPT );
+}
Index: ulp/ipoib/kernel/ipoib_port.h
===================================================================
--- ulp/ipoib/kernel/ipoib_port.h	(revision 1302)
+++ ulp/ipoib/kernel/ipoib_port.h	(working copy)
@@ -506,6 +506,11 @@
 	atomic32_t				endpt_rdr;
 
 	atomic32_t				hdr_idx;
+	boolean_t				mcast_event_init;	
+	KEVENT				    mcast_event;
/* Multicast garbage collector thread terminate event */
+
+	boolean_t				mcast_thread_init;	
+	cl_thread_t				mcast_thread;
/* Multicast garbage collector thread */
 	uint16_t				pkey_index;
 	ipoib_hdr_t				hdr[1];	/* Must be last!
*/
 
Index: ulp/ipoib/kernel/netipoib.inf
===================================================================
--- ulp/ipoib/kernel/netipoib.inf	(revision 1302)
+++ ulp/ipoib/kernel/netipoib.inf	(working copy)
@@ -126,6 +126,19 @@
 HKR, Ndi\Params\PayloadMtu,		Min,		0, "60"
 HKR, Ndi\Params\PayloadMtu,		Max,		0, "2044"
 
+HKR, Ndi\Params\MCGarbageCollector,	ParamDesc,	0, "MC garbage
collector"
+HKR, Ndi\Params\MCGarbageCollector,	Type,		0, "enum"
+HKR, Ndi\Params\MCGarbageCollector,	Default,	0, "1"
+HKR, Ndi\Params\MCGarbageCollector,	Optional,	0, "0"
+HKR, Ndi\Params\MCGarbageCollector\enum,"0",		0, "Disabled"
+HKR, Ndi\Params\MCGarbageCollector\enum,"1",		0, "Enabled"
+
+HKR, Ndi\Params\MCLeaveRescan,		ParamDesc,	0, "MC leave
rescan (sec)"
+HKR, Ndi\Params\MCLeaveRescan,		Type,		0, "dword"
+HKR, Ndi\Params\MCLeaveRescan,		Default,	0, "260"
+HKR, Ndi\Params\MCLeaveRescan,		Optional,	0, "0"
+HKR, Ndi\Params\MCLeaveRescan,		Min,		0, "1"
+HKR, Ndi\Params\MCLeaveRescan,		Max,		0, "3600"
 [IpoibService]
 DisplayName     = %IpoibServiceDispName%
 ServiceType     = 1 ;%SERVICE_KERNEL_DRIVER%


-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcast_garbage_collector.diff
Type: application/octet-stream
Size: 17780 bytes
Desc: mcast_garbage_collector.diff
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080629/69fec8b6/attachment.obj>


More information about the ofw mailing list