[ofw][patch] mcast garbage collector

Fab Tillier ftillier at windows.microsoft.com
Thu Jun 12 10:20:39 PDT 2008


Hi Slava,

>Hi,
>Please review mcast garbage collector, which removes old endpoints of
>ipoib port.
>Rescan time and aging time are parameters and can be configured from
>device property page.
>SA retry count and timeout are also configurable from registry.

It seems that this patch does two things - it adds the mcast garbage collector, as well as adds filtering of IGMP packets.  These two things should be split up.

For the garbage collection thread, can you tell me why I would or wouldn't want to use it?  Is it the kind of thing that should probably become a 'on by default' feature?  Or are there only specific cases where it's useful/needed?

>Index: core/al/al_mcast.c
>===================================================================
>--- core/al/al_mcast.c     (revision 1261)
>+++ core/al/al_mcast.c  (working copy)
>@@ -271,8 +274,14 @@
>            sa_mad_data.p_attr = &h_mcast->member_rec;
>
>            ref_al_obj( &h_mcast->obj );
>-           status = al_send_sa_req(

The above line was fine.  Probably leave it, since you end up duplicating it.

>-                       &h_mcast->sa_dereg_req, h_mcast->port_guid, 500,
>0, &sa_mad_data, 0 );
>+          status =
>+#if defined( CL_KERNEL )
>+                      al_send_sa_req(
>+                      &h_mcast->sa_dereg_req, h_mcast->port_guid,
>g_mc_destr_retr_timeout, g_mc_dest_retr_count, &sa_mad_data, 0 );
>+#else
>+          al_send_sa_req(
>+          &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0,
>&sa_mad_data, 0 );

The indentation of this part is whacky.  If you fix the indentation here you'll reduce the diffs.

>+#endif
>            if( status != IB_SUCCESS )
>                        deref_al_obj( &h_mcast->obj );
>
>Index: inc/complib/cl_types.h
>===================================================================
>--- inc/complib/cl_types.h           (revision 1261)
>+++ inc/complib/cl_types.h         (working copy)
>@@ -46,7 +46,7 @@
>
> #include <complib/cl_types_osd.h>
>
>-
>+typedef uint8_t                         net8_t;

No no no no no no no.  I've given this feedback before (I think on a previous iteration of this patch, even).  There is no such thing as a network order byte.  It is meaningless.  Use uint8_t.

> typedef uint16_t                        net16_t;
> typedef uint32_t                        net32_t;
> typedef uint64_t                        net64_t;

>Index: inc/kernel/ip_packet.h
>===================================================================
>--- inc/kernel/ip_packet.h           (revision 1261)
>+++ inc/kernel/ip_packet.h         (working copy)

This file has nothing to do with garbage collection.  Should be in a different patch.

>Index: ulp/ipoib/kernel/ipoib_driver.c
>===================================================================
>--- ulp/ipoib/kernel/ipoib_driver.c (revision 1261)
>+++ ulp/ipoib/kernel/ipoib_driver.c          (working copy)
>@@ -526,6 +526,38 @@
>            }
>            p_adapter->params.recv_pool_ratio =
>p_param->ParameterData.IntegerData;
>
>+    /* Required: MC garbage collector. */
>+    RtlInitUnicodeString( &keyword, L"MCGarbageCollector" );
>+    NdisReadConfiguration(
>+        &status, &p_param, h_config, &keyword, NdisParameterInteger );
>+    if( status != NDIS_STATUS_SUCCESS )
>+    {
>+        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+            ("MC garbage collector parameter is missing.\n") );
>+        return status;
>+    }
>+    p_adapter->params.mc_garbage_collector =
>(p_param->ParameterData.IntegerData != 0);

Note that sometimes you use spaces for tabs, and sometimes you preserve the tabs...  Not a biggie as long as you keep the tab width at 4, which you seem to be doing.

>+          /* Optional: MC leave rescan (sec) for the MC garable
>collector thread. */
>+          RtlInitUnicodeString( &keyword, L"MCLeaveRescan" );
>+          NdisReadConfiguration(
>+                      &status, &p_param, h_config, &keyword,
>NdisParameterInteger );
>+          if( status != NDIS_STATUS_SUCCESS )
>+          {
>+                      p_adapter->params.mc_leave_rescan = 130;
>+          }
>+          else

If you use curly brackets for the if, use them for the else.  Otherwise it makes it harder to read.

>+                      p_adapter->params.mc_leave_rescan =
>p_param->ParameterData.IntegerData;
>+
>+          /* Optional: MC aging time (sec) */
>+          RtlInitUnicodeString( &keyword, L"MCAgingTime" );
>+          NdisReadConfiguration(
>+                      &status, &p_param, h_config, &keyword,
>NdisParameterInteger );
>+          if( status != NDIS_STATUS_SUCCESS )
>+          {
>+                      p_adapter->params.mc_aging_time = 260;
>+          }
>+          else

Same here.

>+                      p_adapter->params.mc_aging_time =
>p_param->ParameterData.IntegerData;
>            /* required: MTU size. */
>            RtlInitUnicodeString( &keyword, L"PayloadMtu" );
>            NdisReadConfiguration(
>Index: ulp/ipoib/kernel/ipoib_port.c
>===================================================================
>--- ulp/ipoib/kernel/ipoib_port.c   (revision 1261)
>+++ ulp/ipoib/kernel/ipoib_port.c (working copy)
>@@ -290,6 +292,14 @@
>            IN         OUT
>ipoib_send_desc_t* const           p_desc );
>
> static NDIS_STATUS
>+__send_mgr_filter_igmp_v2(
>+          IN                                             ipoib_port_t*
>const                                 p_port,
>+    IN               const    ip_hdr_t* const
>p_ip_hdr,
>+          IN                                             size_t
>iph_options_size,
>+          IN                                             NDIS_BUFFER*
>p_buf,
>+          IN                                             size_t
>buf_len );

This function and its related implementation belongs in a different patch.

>+
>+static NDIS_STATUS
> __send_mgr_filter_udp(
>            IN                                             ipoib_port_t*
>const                                 p_port,
>            IN                     const    ip_hdr_t* const
>p_ip_hdr,
>@@ -579,6 +589,11 @@
>            KeInitializeEvent( &p_port->sa_event, NotificationEvent,
>TRUE );
>            KeInitializeEvent( &p_port->leave_mcast_event,
>NotificationEvent, TRUE );
>
>+          p_port->mcast_event_init = FALSE;

You don't need this - if you called cl_event_construct it is always safe to call cl_event_destroy.  In fact, it would almost be better to just use KeInitializeEvent here instead of complib.

>+          cl_event_construct(&p_port->mcast_event);
>+
>+          p_port->mcast_thread_init = FALSE;
>+          cl_thread_construct(&p_port->mcast_thread);
>            IPOIB_EXIT( IPOIB_DBG_INIT );
> }
>
>@@ -653,6 +668,18 @@
>                        return status;
>            }
>
>+    if (p_port->p_adapter->params.mc_garbage_collector)
>+    {
>+        /* Initialize multicast garbage collector event */
>+        cl_status = cl_event_init(&p_port->mcast_event, TRUE);
>+        if( cl_status != CL_SUCCESS )
>+        {
>+            IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+                ("cl_event_init returned %#x\n", cl_status) );
>+            return IB_ERROR;
>+        }

Use of KeInitializeEvent would eliminate this status check.  Event initialization cannot fail.

>+        p_port->mcast_event_init = TRUE;
>+          }
>            /* We only ever destroy from the PnP callback thread. */
>            cl_status = cl_obj_init( &p_port->obj, CL_DESTROY_SYNC,
>                        __port_destroying, __port_cleanup, __port_free
>);
>@@ -2131,6 +2177,30 @@
>            p_eth->hdr.src = p_src->mac;
>            p_eth->hdr.dst = p_dst->mac;
>
>+          /* Check if multicast packet and update endpoint timestamp if
>needed */
>+
>+          if ( ETH_IS_MULTICAST(p_eth->hdr.dst.addr) &&
>+                      p_eth->hdr.type == ETH_PROT_TYPE_IP &&
>+                      !ETH_IS_BROADCAST(p_eth->hdr.dst.addr) )

Why not just do the check on p_dst->is_mcast_endpoint instead of the above?

>+          {
>+                      /*
>+
>p_port->p_adapter->params.mc_garbage_collector doesn't
>+                                  exist in this context , so we use
>p_dst->is_mcast_endpoint
>+                                  as indicator for mc_garbage collector
>activity ( enable/disable )
>+                      */
>+        if ( p_dst->is_mcast_endpoint &&
>+                                  ++(p_dst->mcast_count) >
>IPOIB_MCAST_TIMESTAMP_THRESHOLD)
>+        {
>+                                  CL_ASSERT(p_dst->h_mcast != NULL);
>+                                  CL_ASSERT(p_dst->is_mcast_endpoint);
>+
>+            p_dst->mcast_count = 0;
>+            p_dst->mcast_send_timestamp = cl_get_time_stamp_sec();
>+        }
>+
>+                      p_eth->hdr.dst.addr[1] = 0;
>+                      p_eth->hdr.dst.addr[3] = p_eth->hdr.dst.addr[3] &
>0x7f;

A comment explaining what you're doing here would help.  Why are you setting the second byte to 0, and why are you clearing the most significant bit of the 4th byte?

>+          }
>            IPOIB_EXIT( IPOIB_DBG_RECV );
>            return IB_SUCCESS;
> }
>@@ -3110,6 +3180,26 @@
>            if( p_ip_hdr->offset ||
>                        p_ip_hdr->prot != IP_PROT_UDP )
>            {
>+                      /* Check if this packet is IGMP */
>+                      if ( p_ip_hdr->prot == IP_PROT_IGMP )
>+                      {
>+                                  /*
>+                                      In igmp packet I saw that iph
>arrive in 2 NDIS_BUFFERs:
>+                                              1. iph
>+                                              2. ip options
>+                                              So to get the IGMP packet
>we need to skip the ip options NDIS_BUFFER
>+                                  */
>+                                  size_t iph_size_in_bytes =
>(p_ip_hdr->ver_hl & 0xf) * 4;
>+                                  size_t iph_options_size =
>iph_size_in_bytes - buf_len;
>+                                  buf_len -= sizeof(ip_hdr_t);
>+
>+                                  /*
>+                                      Could be a case that arrived igmp
>packet not from type IGMPv2 ,
>+                                              but IGMPv1 or IGMPv3.
>+                                              We anyway pass it to
>__send_mgr_filter_igmp_v2().
>+                                  */
>+                                  __send_mgr_filter_igmp_v2(p_port,
>p_ip_hdr, iph_options_size, p_buf, buf_len);

You adjust buf_len, but not p_buf - is that then intent?

>+                      }

IGMP packet filtering belongs in a different patch.

>                        /* Not a UDP packet. */
>                        cl_perf_start( SendTcp );
>                        status = __send_gen( p_port, p_desc );
>@@ -3131,6 +3221,129 @@
>
>
> static NDIS_STATUS
>+__send_mgr_filter_igmp_v2(
>+          IN                                             ipoib_port_t*
>const                                 p_port,
>+    IN               const    ip_hdr_t* const
>p_ip_hdr,
>+          IN                                             size_t
>iph_options_size,
>+          IN                                             NDIS_BUFFER*
>p_buf,
>+          IN                                             size_t
>buf_len )
>+{
>+    igmp_v2_hdr_t                     *p_igmp_v2_hdr = NULL;
>+          NDIS_STATUS                          endpt_status;
>+          ipoib_endpt_t*               p_endpt = NULL;
>+          mac_addr_t                               fake_mcast_mac;
>+
>+          IPOIB_ENTER( IPOIB_DBG_SEND );
>+
>+    if( !buf_len )
>+          {
>+                      // To get the IGMP packet we need to skip the ip
>options NDIS_BUFFER (if exists)
>+                      while ( iph_options_size )
>+                      {
>+                                  NdisGetNextBuffer( p_buf, &p_buf );
>+                                  if( !p_buf )
>+                                  {
>+                                              IPOIB_PRINT_EXIT(
>TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+                                                          ("Failed to
>get IGMPv2 header buffer.\n") );
>+                                              return
>NDIS_STATUS_FAILURE;
>+                                  }
>+                                  NdisQueryBufferSafe( p_buf,
>&p_igmp_v2_hdr, &buf_len, NormalPagePriority );
>+                                  if( !p_igmp_v2_hdr )
>+                                  {
>+                                              IPOIB_PRINT_EXIT(
>TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+                                                          ("Failed to
>query IGMPv2 header buffer.\n") );
>+                                              return
>NDIS_STATUS_FAILURE;
>+                                  }
>+
>+                                  iph_options_size-=buf_len;
>+                      }
>+
>+                      NdisGetNextBuffer( p_buf, &p_buf );
>+                      if( !p_buf )
>+                      {
>+                                  IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>+                                              ("Failed to get IGMPv2
>header buffer.\n") );
>+                                  return NDIS_STATUS_FAILURE;
>+                      }
>+                      NdisQueryBufferSafe( p_buf, &p_igmp_v2_hdr,
>&buf_len, NormalPagePriority );
>+                      if( !p_igmp_v2_hdr )
>+                      {
>+                                  IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>+                                              ("Failed to query IGMPv2
>header buffer.\n") );
>+                                  return NDIS_STATUS_FAILURE;
>+                      }
>+          }
>+          else
>+          {
>+                      p_igmp_v2_hdr = (igmp_v2_hdr_t*)(p_ip_hdr + 1);

Ok, this is confusing now.  If buf_len was not zero, you set the IGMP header to start right after the IP header, but you don't account for options like you did when you calculated buf_len.  If you adjusted p_buf when calling, you could just set p_igmp_vs_hdr = p_buf and be safe.

>+          }

<snip...>

>@@ -3577,7 +3790,22 @@
>                                    return NDIS_STATUS_PENDING;
>                        }
>            }
>+          else if ( p_port->p_adapter->params.mc_garbage_collector &&

Any reason not to always update the timestamp?

>+                            status == NDIS_STATUS_SUCCESS &&
>+                                    ETH_IS_MULTICAST(
>p_eth_hdr->dst.addr ) &&
>+                                    !ETH_IS_BROADCAST(
>p_eth_hdr->dst.addr ) )
>+          {
>+                      CL_ASSERT( (*pp_endpt) );
>+                      CL_ASSERT((*pp_endpt)->h_mcast != NULL);
>+                      CL_ASSERT((*pp_endpt)->is_mcast_endpoint);
>
>+                      if (++((*pp_endpt)->mcast_count) >
>IPOIB_MCAST_TIMESTAMP_THRESHOLD)
>+                      {
>+                                  (*pp_endpt)->mcast_count = 0;
>+                                  (*pp_endpt)->mcast_send_timestamp =
>cl_get_time_stamp_sec();
>+                      }
>+          }

This whole block would make more sense in the __endpt_mgr_ref or maybe even in the endpoint itself.

>+
>            IPOIB_EXIT( IPOIB_DBG_SEND );
>            return status;
> }
>@@ -4706,6 +4934,8 @@
>            ib_query_req_t                          query;
>            ib_user_query_t                        info;
>            ib_portinfo_record_t       port_rec;
>+    cl_status_t             cl_status;
>+    BOOLEAN                 success = TRUE;
>
>            IPOIB_ENTER( IPOIB_DBG_INIT );
>
>@@ -4740,17 +4970,54 @@
>            /* reference the object for the multicast query. */
>            ipoib_port_ref( p_port, ref_port_up );
>
>+    __try
>+    {

Your indentation of the whole __try block is wrong.  In any case, I don't like the use of the __try here, as you're just using it instead of a goto.  Just use a goto and be done with it.

>            status = p_port->p_adapter->p_ifc->query(
>                        p_port->p_adapter->h_al, &query,
>&p_port->ib_mgr.h_query );
>            if( status != IB_SUCCESS )
>            {
>-                       KeSetEvent( &p_port->sa_event, EVENT_INCREMENT,
>FALSE );
>-                       ipoib_set_inactive( p_port->p_adapter );
>-                       ipoib_port_deref( p_port, ref_port_up );
>+            success = FALSE;
>                        IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR,
>IPOIB_DBG_ERROR,
>                                    ("ib_query returned %s\n",
>
>p_port->p_adapter->p_ifc->get_err_str( status )) );
>-                       return;
>+            __leave;
>+        }
>+
>+        if (p_port->p_adapter->params.mc_garbage_collector)
>+        {
>+            CL_ASSERT(p_port->mcast_event_init);
>+            cl_status = cl_event_reset(&p_port->mcast_event);
>+            if( cl_status != CL_SUCCESS )
>+            {
>+                success = FALSE;
>+                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+                    ("cl_event_reset returned %#x\n", cl_status) );
>+                __leave;
>+            }
>+
>+            cl_status = cl_thread_init(
>+                &p_port->mcast_thread,
>+                __port_mcast_garbage_collector,
>+                p_port,
>+                "mcast_garbage");
>+            if( cl_status != CL_SUCCESS )
>+            {
>+                success = FALSE;
>+                IPOIB_PRINT_EXIT( TRACE_LEVEL_ERROR, IPOIB_DBG_ERROR,
>+                    ("cl_thread_init returned %#x\n", cl_status) );
>+                __leave;
>+            }
>+            p_port->mcast_thread_init = TRUE;

You know, I think it would make a lot more sense to create the garbage collector thread when you initialize the port object.  It really simplifies the error handling.

>+        }
>+    }
>+    __finally
>+    {
>+        if (!success)
>+        {
>+            KeSetEvent( &p_port->sa_event, EVENT_INCREMENT, FALSE );
>+            ipoib_set_inactive( p_port->p_adapter );
>+            ipoib_port_deref( p_port, ref_port_up );
>+        }
>            }
>
>            IPOIB_EXIT( IPOIB_DBG_INIT );
>@@ -5229,6 +5496,16 @@
>                        return;
>            }
>
>+    /* Destroy multicast garbage collector thread */
>+
>+    if(p_port->p_adapter->params.mc_garbage_collector &&
>p_port->mcast_thread_init)
>+    {
>+        CL_ASSERT(p_port->mcast_event_init);
>+        cl_event_signal(&p_port->mcast_event);
>+
>+        cl_thread_destroy(&p_port->mcast_thread);
>+        p_port->mcast_thread_init = FALSE;
>+    }

Again, creating the thread at port object initialization would probably be better.  Is there a reason it wasn't done like this?  Some synchronization issue?

>            KeResetEvent(&p_port->leave_mcast_event);
>
>            /* Reset all endpoints so we don't flush our ARP cache. */
>@@ -5656,6 +5933,15 @@
>                                    &p_port->endpt_mgr.lid_endpts,
>p_endpt->dlid, &p_endpt->lid_item );
>                        CL_ASSERT( p_qitem == &p_endpt->lid_item );
>            }
>+    /* Add the endpoint to the multicast endpoints list */
>+          if ( p_port->p_adapter->params.mc_garbage_collector )
>+          {
>+                      p_endpt->is_mcast_endpoint = TRUE;
>+                      p_endpt->mcast_count = 0;
>+                      p_endpt->mcast_send_timestamp =
>cl_get_time_stamp_sec();

Why not just do this always?  Is the long term expectation that the garbage collector thread will be there?

>+          }
>+          else

Add the missing curlies...

>+                      p_endpt->is_mcast_endpoint = FALSE;

Wait, if there's no garbage collector thread you mark the endpoint as not a multicast endpoint?  Why?  A comment to explain this would help, because it's very counter intuitive since this is in the __mcast_cb function.

>            cl_obj_unlock( &p_port->obj );
>
>            /* Try to send all pending sends. */



More information about the ofw mailing list