[ofw][patch] mcast garbage collector
Fab Tillier
ftillier at windows.microsoft.com
Thu Jun 12 10:20:39 PDT 2008
Hi Slava,
>Hi,
>Please review mcast garbage collector, which removes old endpoints of
>ipoib port.
>Rescan time and aging time are parameters and can be configured from
>device property page.
>SA retry count and timeout are also configurable from registry.
It seems that this patch does two things - it adds the mcast garbage collector, as well as adds filtering of IGMP packets. These two things should be split up.
For the garbage collection thread, can you tell me why I would or wouldn't want to use it? Is it the kind of thing that should probably become a 'on by default' feature? Or are there only specific cases where it's useful/needed?
>Index: core/al/al_mcast.c
>===================================================================
>--- core/al/al_mcast.c (revision 1261)
>+++ core/al/al_mcast.c (working copy)
>@@ -271,8 +274,14 @@
> sa_mad_data.p_attr = &h_mcast->member_rec;
>
> ref_al_obj( &h_mcast->obj );
>- status = al_send_sa_req(
The above line was fine. Probably leave it, since you end up duplicating it.
>- &h_mcast->sa_dereg_req, h_mcast->port_guid, 500,
>0, &sa_mad_data, 0 );
>+ status =
>+#if defined( CL_KERNEL )
>+ al_send_sa_req(
>+ &h_mcast->sa_dereg_req, h_mcast->port_guid,
>g_mc_destr_retr_timeout, g_mc_dest_retr_count, &sa_mad_data, 0 );
>+#else
>+ al_send_sa_req(
>+ &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0,
>&sa_mad_data, 0 );
The indentation of this part is whacky. If you fix the indentation here you'll reduce the diffs.
>+#endif
> if( status != IB_SUCCESS )
> deref_al_obj( &h_mcast->obj );
>
>Index: inc/complib/cl_types.h
>===================================================================
>--- inc/complib/cl_types.h (revision 1261)
>+++ inc/complib/cl_types.h (working copy)
>@@ -46,7 +46,7 @@
>
> #include <complib/cl_types_osd.h>
>
>-
>+typedef uint8_t net8_t;
No no no no no no no. I've given this feedback before (I think on a previous iteration of this patch, even). There is no such thing as a network order byte. It is meaningless. Use uint8_t.
> typedef uint16_t net16_t;
> typedef uint32_t net32_t;
> typedef uint64_t net64_t;
>Index: inc/kernel/ip_packet.h
>===================================================================
>--- inc/kernel/ip_packet.h (revision 1261)
>+++ inc/kernel/ip_packet.h (working copy)
This file has nothing to do with garbage collection. Should be in a different patch.
>Index: ulp/ipoib/kernel/ipoib_driver.c
>===================================================================
>--- ulp/ipoib/kernel/ipoib_driver.c (revision 1261)
>+++ ulp/ipoib/kernel/ipoib_driver.c (working copy)
>@@ -526,6 +526,38 @@
> }
> p_adapter->params.recv_pool_ratio =
>p_param->ParameterData.IntegerData;
>
>+ /* Required: MC garbage collector. */
>+ RtlInitUnicodeString( &keyword, L"MCGarbageCollector" );
>+ NdisReadConfiguration(
>+ &status, &p_param, h_config, &keyword, NdisParameterInteger );
>+ if( status != NDIS_STATUS_SUCCESS )
>+ {
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("MC garbage collector parameter is missing.\n") );
>+ return status;
>+ }
>+ p_adapter->params.mc_garbage_collector =
>(p_param->ParameterData.IntegerData != 0);
Note that sometimes you use spaces for tabs, and sometimes you preserve the tabs... Not a biggie as long as you keep the tab width at 4, which you seem to be doing.
>+ /* Optional: MC leave rescan (sec) for the MC garable
>collector thread. */
>+ RtlInitUnicodeString( &keyword, L"MCLeaveRescan" );
>+ NdisReadConfiguration(
>+ &status, &p_param, h_config, &keyword,
>NdisParameterInteger );
>+ if( status != NDIS_STATUS_SUCCESS )
>+ {
>+ p_adapter->params.mc_leave_rescan = 130;
>+ }
>+ else
If you use curly brackets for the if, use them for the else. Otherwise it makes it harder to read.
>+ p_adapter->params.mc_leave_rescan =
>p_param->ParameterData.IntegerData;
>+
>+ /* Optional: MC aging time (sec) */
>+ RtlInitUnicodeString( &keyword, L"MCAgingTime" );
>+ NdisReadConfiguration(
>+ &status, &p_param, h_config, &keyword,
>NdisParameterInteger );
>+ if( status != NDIS_STATUS_SUCCESS )
>+ {
>+ p_adapter->params.mc_aging_time = 260;
>+ }
>+ else
Same here.
>+ p_adapter->params.mc_aging_time =
>p_param->ParameterData.IntegerData;
> /* required: MTU size. */
> RtlInitUnicodeString( &keyword, L"PayloadMtu" );
> NdisReadConfiguration(
>Index: ulp/ipoib/kernel/ipoib_port.c
>===================================================================
>--- ulp/ipoib/kernel/ipoib_port.c (revision 1261)
>+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
>@@ -290,6 +292,14 @@
> IN OUT
>ipoib_send_desc_t* const p_desc );
>
> static NDIS_STATUS
>+__send_mgr_filter_igmp_v2(
>+ IN ipoib_port_t*
>const p_port,
>+ IN const ip_hdr_t* const
>p_ip_hdr,
>+ IN size_t
>iph_options_size,
>+ IN NDIS_BUFFER*
>p_buf,
>+ IN size_t
>buf_len );
This function and its related implementation belongs in a different patch.
>+
>+static NDIS_STATUS
> __send_mgr_filter_udp(
> IN ipoib_port_t*
>const p_port,
> IN const ip_hdr_t* const
>p_ip_hdr,
>@@ -579,6 +589,11 @@
> KeInitializeEvent( &p_port->sa_event, NotificationEvent,
>TRUE );
> KeInitializeEvent( &p_port->leave_mcast_event,
>NotificationEvent, TRUE );
>
>+ p_port->mcast_event_init = FALSE;
You don't need this - if you called cl_event_construct it is always safe to call cl_event_destroy. In fact, it would almost be better to just use KeInitializeEvent here instead of complib.
>+ cl_event_construct(&p_port->mcast_event);
>+
>+ p_port->mcast_thread_init = FALSE;
>+ cl_thread_construct(&p_port->mcast_thread);
> IPOIB_EXIT( IPOIB_DBG_INIT );
> }
>
>@@ -653,6 +668,18 @@
> return status;
> }
>
>+ if (p_port->p_adapter->params.mc_garbage_collector)
>+ {
>+ /* Initialize multicast garbage collector event */
>+ cl_status = cl_event_init(&p_port->mcast_event, TRUE);
>+ if( cl_status != CL_SUCCESS )
>+ {
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("cl_event_init returned %#x\n", cl_status) );
>+ return IB_ERROR;
>+ }
Use of KeInitializeEvent would eliminate this status check. Event initialization cannot fail.
>+ p_port->mcast_event_init = TRUE;
>+ }
> /* We only ever destroy from the PnP callback thread. */
> cl_status = cl_obj_init( &p_port->obj, CL_DESTROY_SYNC,
> __port_destroying, __port_cleanup, __port_free
>);
>@@ -2131,6 +2177,30 @@
> p_eth->hdr.src = p_src->mac;
> p_eth->hdr.dst = p_dst->mac;
>
>+ /* Check if multicast packet and update endpoint timestamp if
>needed */
>+
>+ if ( ETH_IS_MULTICAST(p_eth->hdr.dst.addr) &&
>+ p_eth->hdr.type == ETH_PROT_TYPE_IP &&
>+ !ETH_IS_BROADCAST(p_eth->hdr.dst.addr) )
Why not just do the check on p_dst->is_mcast_endpoint instead of the above?
>+ {
>+ /*
>+
>p_port->p_adapter->params.mc_garbage_collector doesn't
>+ exist in this context , so we use
>p_dst->is_mcast_endpoint
>+ as indicator for mc_garbage collector
>activity ( enable/disable )
>+ */
>+ if ( p_dst->is_mcast_endpoint &&
>+ ++(p_dst->mcast_count) >
>IPOIB_MCAST_TIMESTAMP_THRESHOLD)
>+ {
>+ CL_ASSERT(p_dst->h_mcast != NULL);
>+ CL_ASSERT(p_dst->is_mcast_endpoint);
>+
>+ p_dst->mcast_count = 0;
>+ p_dst->mcast_send_timestamp = cl_get_time_stamp_sec();
>+ }
>+
>+ p_eth->hdr.dst.addr[1] = 0;
>+ p_eth->hdr.dst.addr[3] = p_eth->hdr.dst.addr[3] &
>0x7f;
A comment explaining what you're doing here would help. Why are you setting the second byte to 0, and why are you clearing the most significant bit of the 4th byte?
>+ }
> IPOIB_EXIT( IPOIB_DBG_RECV );
> return IB_SUCCESS;
> }
>@@ -3110,6 +3180,26 @@
> if( p_ip_hdr->offset ||
> p_ip_hdr->prot != IP_PROT_UDP )
> {
>+ /* Check if this packet is IGMP */
>+ if ( p_ip_hdr->prot == IP_PROT_IGMP )
>+ {
>+ /*
>+ In igmp packet I saw that iph
>arrive in 2 NDIS_BUFFERs:
>+ 1. iph
>+ 2. ip options
>+ So to get the IGMP packet
>we need to skip the ip options NDIS_BUFFER
>+ */
>+ size_t iph_size_in_bytes =
>(p_ip_hdr->ver_hl & 0xf) * 4;
>+ size_t iph_options_size =
>iph_size_in_bytes - buf_len;
>+ buf_len -= sizeof(ip_hdr_t);
>+
>+ /*
>+ Could be a case that arrived igmp
>packet not from type IGMPv2 ,
>+ but IGMPv1 or IGMPv3.
>+ We anyway pass it to
>__send_mgr_filter_igmp_v2().
>+ */
>+ __send_mgr_filter_igmp_v2(p_port,
>p_ip_hdr, iph_options_size, p_buf, buf_len);
You adjust buf_len, but not p_buf - is that then intent?
>+ }
IGMP packet filtering belongs in a different patch.
> /* Not a UDP packet. */
> cl_perf_start( SendTcp );
> status = __send_gen( p_port, p_desc );
>@@ -3131,6 +3221,129 @@
>
>
> static NDIS_STATUS
>+__send_mgr_filter_igmp_v2(
>+ IN ipoib_port_t*
>const p_port,
>+ IN const ip_hdr_t* const
>p_ip_hdr,
>+ IN size_t
>iph_options_size,
>+ IN NDIS_BUFFER*
>p_buf,
>+ IN size_t
>buf_len )
>+{
>+ igmp_v2_hdr_t *p_igmp_v2_hdr = NULL;
>+ NDIS_STATUS endpt_status;
>+ ipoib_endpt_t* p_endpt = NULL;
>+ mac_addr_t fake_mcast_mac;
>+
>+ IPOIB_ENTER( IPOIB_DBG_SEND );
>+
>+ if( !buf_len )
>+ {
>+ // To get the IGMP packet we need to skip the ip
>options NDIS_BUFFER (if exists)
>+ while ( iph_options_size )
>+ {
>+ NdisGetNextBuffer( p_buf, &p_buf );
>+ if( !p_buf )
>+ {
>+ IPOIB_PRINT_EXIT(
>TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("Failed to
>get IGMPv2 header buffer.\n") );
>+ return
>NDIS_STATUS_FAILURE;
>+ }
>+ NdisQueryBufferSafe( p_buf,
>&p_igmp_v2_hdr, &buf_len, NormalPagePriority );
>+ if( !p_igmp_v2_hdr )
>+ {
>+ IPOIB_PRINT_EXIT(
>TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("Failed to
>query IGMPv2 header buffer.\n") );
>+ return
>NDIS_STATUS_FAILURE;
>+ }
>+
>+ iph_options_size-=buf_len;
>+ }
>+
>+ NdisGetNextBuffer( p_buf, &p_buf );
>+ if( !p_buf )
>+ {
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>+ ("Failed to get IGMPv2
>header buffer.\n") );
>+ return NDIS_STATUS_FAILURE;
>+ }
>+ NdisQueryBufferSafe( p_buf, &p_igmp_v2_hdr,
>&buf_len, NormalPagePriority );
>+ if( !p_igmp_v2_hdr )
>+ {
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>+ ("Failed to query IGMPv2
>header buffer.\n") );
>+ return NDIS_STATUS_FAILURE;
>+ }
>+ }
>+ else
>+ {
>+ p_igmp_v2_hdr = (igmp_v2_hdr_t*)(p_ip_hdr + 1);
Ok, this is confusing now. If buf_len was not zero, you set the IGMP header to start right after the IP header, but you don't account for options like you did when you calculated buf_len. If you adjusted p_buf when calling, you could just set p_igmp_vs_hdr = p_buf and be safe.
>+ }
<snip...>
>@@ -3577,7 +3790,22 @@
> return NDIS_STATUS_PENDING;
> }
> }
>+ else if ( p_port->p_adapter->params.mc_garbage_collector &&
Any reason not to always update the timestamp?
>+ status == NDIS_STATUS_SUCCESS &&
>+ ETH_IS_MULTICAST(
>p_eth_hdr->dst.addr ) &&
>+ !ETH_IS_BROADCAST(
>p_eth_hdr->dst.addr ) )
>+ {
>+ CL_ASSERT( (*pp_endpt) );
>+ CL_ASSERT((*pp_endpt)->h_mcast != NULL);
>+ CL_ASSERT((*pp_endpt)->is_mcast_endpoint);
>
>+ if (++((*pp_endpt)->mcast_count) >
>IPOIB_MCAST_TIMESTAMP_THRESHOLD)
>+ {
>+ (*pp_endpt)->mcast_count = 0;
>+ (*pp_endpt)->mcast_send_timestamp =
>cl_get_time_stamp_sec();
>+ }
>+ }
This whole block would make more sense in the __endpt_mgr_ref or maybe even in the endpoint itself.
>+
> IPOIB_EXIT( IPOIB_DBG_SEND );
> return status;
> }
>@@ -4706,6 +4934,8 @@
> ib_query_req_t query;
> ib_user_query_t info;
> ib_portinfo_record_t port_rec;
>+ cl_status_t cl_status;
>+ BOOLEAN success = TRUE;
>
> IPOIB_ENTER( IPOIB_DBG_INIT );
>
>@@ -4740,17 +4970,54 @@
> /* reference the object for the multicast query. */
> ipoib_port_ref( p_port, ref_port_up );
>
>+ __try
>+ {
Your indentation of the whole __try block is wrong. In any case, I don't like the use of the __try here, as you're just using it instead of a goto. Just use a goto and be done with it.
> status = p_port->p_adapter->p_ifc->query(
> p_port->p_adapter->h_al, &query,
>&p_port->ib_mgr.h_query );
> if( status != IB_SUCCESS )
> {
>- KeSetEvent( &p_port->sa_event, EVENT_INCREMENT,
>FALSE );
>- ipoib_set_inactive( p_port->p_adapter );
>- ipoib_port_deref( p_port, ref_port_up );
>+ success = FALSE;
> IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
> ("ib_query returned %s\n",
>
>p_port->p_adapter->p_ifc->get_err_str( status )) );
>- return;
>+ __leave;
>+ }
>+
>+ if (p_port->p_adapter->params.mc_garbage_collector)
>+ {
>+ CL_ASSERT(p_port->mcast_event_init);
>+ cl_status = cl_event_reset(&p_port->mcast_event);
>+ if( cl_status != CL_SUCCESS )
>+ {
>+ success = FALSE;
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("cl_event_reset returned %#x\n", cl_status) );
>+ __leave;
>+ }
>+
>+ cl_status = cl_thread_init(
>+ &p_port->mcast_thread,
>+ __port_mcast_garbage_collector,
>+ p_port,
>+ "mcast_garbage");
>+ if( cl_status != CL_SUCCESS )
>+ {
>+ success = FALSE;
>+ IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+ ("cl_thread_init returned %#x\n", cl_status) );
>+ __leave;
>+ }
>+ p_port->mcast_thread_init = TRUE;
You know, I think it would make a lot more sense to create the garbage collector thread when you initialize the port object. It really simplifies the error handling.
>+ }
>+ }
>+ __finally
>+ {
>+ if (!success)
>+ {
>+ KeSetEvent( &p_port->sa_event, EVENT_INCREMENT, FALSE );
>+ ipoib_set_inactive( p_port->p_adapter );
>+ ipoib_port_deref( p_port, ref_port_up );
>+ }
> }
>
> IPOIB_EXIT( IPOIB_DBG_INIT );
>@@ -5229,6 +5496,16 @@
> return;
> }
>
>+ /* Destroy multicast garbage collector thread */
>+
>+ if(p_port->p_adapter->params.mc_garbage_collector &&
>p_port->mcast_thread_init)
>+ {
>+ CL_ASSERT(p_port->mcast_event_init);
>+ cl_event_signal(&p_port->mcast_event);
>+
>+ cl_thread_destroy(&p_port->mcast_thread);
>+ p_port->mcast_thread_init = FALSE;
>+ }
Again, creating the thread at port object initialization would probably be better. Is there a reason it wasn't done like this? Some synchronization issue?
> KeResetEvent(&p_port->leave_mcast_event);
>
> /* Reset all endpoints so we don't flush our ARP cache. */
>@@ -5656,6 +5933,15 @@
> &p_port->endpt_mgr.lid_endpts,
>p_endpt->dlid, &p_endpt->lid_item );
> CL_ASSERT( p_qitem == &p_endpt->lid_item );
> }
>+ /* Add the endpoint to the multicast endpoints list */
>+ if ( p_port->p_adapter->params.mc_garbage_collector )
>+ {
>+ p_endpt->is_mcast_endpoint = TRUE;
>+ p_endpt->mcast_count = 0;
>+ p_endpt->mcast_send_timestamp =
>cl_get_time_stamp_sec();
Why not just do this always? Is the long term expectation that the garbage collector thread will be there?
>+ }
>+ else
Add the missing curlies...
>+ p_endpt->is_mcast_endpoint = FALSE;
Wait, if there's no garbage collector thread you mark the endpoint as not a multicast endpoint? Why? A comment to explain this would help, because it's very counter intuitive since this is in the __mcast_cb function.
> cl_obj_unlock( &p_port->obj );
>
> /* Try to send all pending sends. */
More information about the ofw
mailing list