[ofw][patch] kernel igmp support

Slava Strebkov slavas at voltaire.com
Thu Jun 12 06:53:27 PDT 2008


Hi,

The attached patch is designed to work with the user-mode MCE (multicast
offloading).

The data path goes through the user-mode DLL. IPoIB retains support for
IGMP messages but does not perform ATTACH, in order to prevent
unnecessary data copying.

 

Index: core/al/al_init.c

===================================================================

--- core/al/al_init.c         (revision 1261)

+++ core/al/al_init.c       (working copy)

@@ -49,6 +49,10 @@

 

 uint32_t                                                g_al_dbg_level
= TRACE_LEVEL_ERROR;

 uint32_t                                                g_al_dbg_flags
= 0xf0;

+#ifdef CL_KERNEL

+          extern void igmp_list_init();

+          extern void igmp_list_destroy();

+#endif

 /*

  * Device driver initialization routine.

  */

@@ -111,7 +115,9 @@

                                    ("init_al_mgr: status = 0x%x.\n",
status) );

                        return status;

            }

-

+#ifdef CL_KERNEL

+          igmp_list_init();

+#endif

            AL_EXIT( AL_DBG_DEV );

            return status;

 }

@@ -168,6 +174,8 @@

                        cl_free( gp_async_proc_mgr );

                        gp_async_proc_mgr = NULL;

            }

-

+#ifdef CL_KERNEL

+    igmp_list_destroy();

+#endif

            AL_PRINT_EXIT( TRACE_LEVEL_WARNING, AL_DBG_DEV, ("Goodbye
Cruel World =(\n") );

 }

Index: core/al/al_mcast.c

===================================================================

--- core/al/al_mcast.c     (revision 1261)

+++ core/al/al_mcast.c  (working copy)

@@ -96,10 +96,254 @@

 static void

 __free_attach(

            IN                                             al_obj_t
*p_obj );

-#endif

+struct 

+{

+          cl_spinlock_t     mc_list_lock;

+          cl_qlist_t                mc_group_list;

+          LONG            initialized;

+}g_mc_g;

 

 

+/**********************************************************************
*********************

+*         name:   igmp_list_init

+*         input:    no

+*   return:        void

+*   initializes list and spin lock for igmp list maintenance

+***********************************************************************
*********************/ 

+void igmp_list_init()

+{

+          AL_ENTER( AL_DBG_MCAST );

 

+          if (0 == InterlockedCompareExchange(&g_mc_g.initialized,1,0))

+          {

+                      cl_spinlock_construct( &g_mc_g.mc_list_lock );

+                      cl_spinlock_init( &g_mc_g.mc_list_lock );

+                      cl_qlist_init( &g_mc_g.mc_group_list );

+          } 

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+/**********************************************************************
*********************

+*         name:   igmp_list_destroy

+*         input:    no

+*   return:        void

+*   Destroys the spin lock of the igmp list (the list is expected to be empty)

+***********************************************************************
*********************/ 

+void igmp_list_destroy()

+{

+          AL_ENTER( AL_DBG_MCAST );

+

+          CL_ASSERT(0 == cl_qlist_count( &g_mc_g.mc_group_list ));

+

+          if (1 == InterlockedCompareExchange(&g_mc_g.initialized,0,1))

+          {

+                      cl_spinlock_destroy( &g_mc_g.mc_list_lock );

+          }   

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_add_group

+*         input:    ib_gid_t, ib_qp_handle_t,boolean_t

+*   return:        ib_api_status_t

+*   Adds an MC group description to the list; if the group is already
listed, returns IB_SUCCESS without changing the existing record

+***********************************************************************
*********************/ 

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user)

+          {

+                      cl_list_item_t
*pItem;

+                      join_record_t
*mc_gr = NULL;

+                      boolean_t
found = FALSE;

+                      (h_qp);

+

+                      AL_ENTER( AL_DBG_MCAST );

+                      CL_ASSERT(g_mc_g.initialized == 1);

+

+                      cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+                      for( pItem = cl_qlist_head( &g_mc_g.mc_group_list
);

+                                  pItem != cl_qlist_end(
&g_mc_g.mc_group_list );

+                                  pItem = cl_qlist_next( pItem ) )

+                      {

+

+                                  mc_gr = CONTAINING_RECORD( pItem,
join_record_t, entry );

+                                  if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+                                  {

+                                              found = TRUE;

+                                              break;

+                                  }

+                      }

+                      if (found)

+              {

+
cl_spinlock_release(&g_mc_g.mc_list_lock);

+                                  return IB_SUCCESS;

+                      }

+                      

+

+                      mc_gr = cl_zalloc( sizeof(join_record_t) );

+                      if (! mc_gr) 

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_add_mcast : cl_zalloc
failed\n"));

+
cl_spinlock_release(&g_mc_g.mc_list_lock);

+                                  return IB_INSUFFICIENT_MEMORY;

+                      }

+
RtlCopyMemory(&mc_gr->mgid,p_mgid,sizeof(ib_gid_t));

+                      mc_gr->h_qp = h_qp;

+                      mc_gr->j_state = STATE_NOT_JOINED;

+                      mc_gr->src_req = is_user ? JOIN_SRC_MCE :
JOIN_SRC_IPOIB;

+                      mc_gr->time_stamp = cl_get_time_stamp();

+                      cl_qlist_insert_head(&g_mc_g.mc_group_list,
&mc_gr->entry);

+                      cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+
("al_igmp_list_add_group : ADDED IP = %d.%d.%d.%d\n",0xE0 |
mc_gr->mgid.multicast.raw_group_id[10],            

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+

+                      AL_EXIT( AL_DBG_MCAST );

+

+                      return IB_SUCCESS;

+}

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_skip_attach

+*         input:    ib_gid_t *p_mgid

+*   return:        boolean_t

+*   Checks if p_mgid group already in the list and joined to Mcast
through MCE

+*         used in decision to do attach or not

+***********************************************************************
*********************/ 

+static boolean_t al_igmp_list_skip_attach(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+          cl_list_item_t
*pItem;

+          boolean_t  ret = FALSE;

+

+          AL_ENTER( AL_DBG_MCAST );

+          CL_ASSERT(g_mc_g.initialized == 1);

+

+          cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+

+          for( pItem = cl_qlist_head( &g_mc_g.mc_group_list );

+                      pItem != cl_qlist_end( &g_mc_g.mc_group_list );

+                      pItem = cl_qlist_next( pItem ) )

+          {

+                      mc_gr = CONTAINING_RECORD( pItem, join_record_t,
entry );

+                      if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+                      {

+                         if ((mc_gr->j_state == STATE_JOINED) &&
(mc_gr->src_req == JOIN_SRC_MCE))

+                         {

+                                     ret = TRUE;

+                                     AL_PRINT( TRACE_LEVEL_INFORMATION,
AL_DBG_MCAST,

+
("ATTACH Skipped for %d.%d.%d.%d\n",0xE0 |
mc_gr->mgid.multicast.raw_group_id[10],            

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+

+                                     break;

+                         }

+             }

+          }

+

+          cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+          AL_EXIT( AL_DBG_MCAST );

+

+          return ret;

+}

+

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_change_status_to_joined

+*         input:    ib_gid_t *p_mgid

+*   return:        void

+*   Changes MC status of p_mgid group to JOINED

+***********************************************************************
*********************/ 

+static void al_igmp_list_change_status_to_joined(const ib_gid_t
*p_mgid)

+{

+          join_record_t                                         *mc_gr;

+          cl_list_item_t
*pItem;

+

+          AL_ENTER( AL_DBG_MCAST );

+          CL_ASSERT(g_mc_g.initialized == 1);

+

+          cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+          for( pItem = cl_qlist_head( &g_mc_g.mc_group_list );

+                      pItem != cl_qlist_end( &g_mc_g.mc_group_list );

+                      pItem = cl_qlist_next( pItem ) )

+          {

+                      mc_gr = CONTAINING_RECORD( pItem, join_record_t,
entry );

+                      if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+                      {

+                 mc_gr->ref_cnt++;

+                         mc_gr->j_state = STATE_JOINED;

+                         AL_PRINT( TRACE_LEVEL_INFORMATION,
AL_DBG_MCAST,

+                                                          ("igmp group
%d.%d.%d.%d set to STATE_JOINED\n",0xE0 |
mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+

+                         break;    

+             }

+          }

+          cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_remove_record

+*         input:    ib_gid_t *p_mgid

+*   return:        void

+*   Remove MC record with p_mgid from the list if ref count reaches 0

+***********************************************************************
*********************/ 

+void al_igmp_list_remove_record(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+          cl_list_item_t
*pItem;

+

+          AL_ENTER( AL_DBG_MCAST );

+          CL_ASSERT(g_mc_g.initialized == 1);

+

+          cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+          for( pItem = cl_qlist_head( &g_mc_g.mc_group_list );

+                      pItem != cl_qlist_end( &g_mc_g.mc_group_list );

+                      pItem = cl_qlist_next( pItem ) )

+          {

+                      mc_gr = CONTAINING_RECORD( pItem, join_record_t,
entry );

+                      if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+             {

+                         CL_ASSERT((mc_gr->j_state == STATE_NOT_JOINED)
|| (mc_gr->ref_cnt > 0));

+                         if ((--mc_gr->ref_cnt) > 0)

+                         {

+                                     if ((1 ==
mc_gr->ref_cnt)&&(mc_gr->src_req == JOIN_SRC_MCE))

+                                     {

+                                                 /* last reference
usually igmp through ipoib */

+                                                 mc_gr->src_req =
JOIN_SRC_IPOIB;

+                                     }

+                                     break;

+                         }

+                         else

+                         {

+                                     AL_PRINT( TRACE_LEVEL_INFORMATION,
AL_DBG_MCAST,

+
("igmp group %d.%d.%d.%d removed\n",0xE0 |
mc_gr->mgid.multicast.raw_group_id[10],            

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+                                     cl_qlist_remove_item(
&g_mc_g.mc_group_list, &mc_gr->entry );

+                                     cl_free(mc_gr);

+                         }

+                         break;

+             }

+          }

+          cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+          AL_EXIT( AL_DBG_MCAST );

+

+}

+

+#endif //CL_KERNEL

+

+

+

 ib_api_status_t

 al_join_mcast(

            IN                     const    ib_qp_handle_t FUNC_PTR64
h_qp,

@@ -112,6 +356,17 @@

 

            AL_ENTER( AL_DBG_MCAST );

 

+#ifdef CL_KERNEL

+

+                      status =
al_igmp_list_add_group(&p_mcast_req->member_rec.mgid, NULL,FALSE); 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_join_mcast :
al_igmp_list_add_group FAILED status: %s\n", ib_get_err_str(status)) );

+                                  return status;

+                      }

+      

+#endif

            /*

             * Validate the port GUID.  There is no need to validate the
pkey index as

             * the user could change it later to make it invalid.  There
is also no

@@ -468,6 +723,9 @@

            ib_mcast_rec_t                          mcast_rec;

            boolean_t                                              sync;

 

+#if defined( CL_KERNEL )

+          boolean_t skip_attach;

+#endif

            AL_ENTER( AL_DBG_MCAST );

 

            h_mcast = PARENT_STRUCT( p_item, ib_mcast_t, async );

@@ -497,13 +755,29 @@

                        /* Ensure that the user wants the join operation
to proceed. */

                        if( h_mcast->state == SA_REG_STARTING )

                        {

+#if defined( CL_KERNEL )

+                                  /* It's a IPoIB join callback - check
for attach */

+                                  skip_attach =
al_igmp_list_skip_attach(&h_mcast->member_rec.mgid);

+                                  if (skip_attach)

+                                  {

+                                              AL_PRINT(
TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ATTACH Skipped\n") );

+                                  }

+

+                                  /* It's a IPoIB - change status to
JOINED and inc ref count*/

+
al_igmp_list_change_status_to_joined(&h_mcast->member_rec.mgid);

+#endif

                                    /*

                                     * Change the state here so that we
avoid trying to cancel

                                     * the request if the verb operation
fails.

                                     */

                                    h_mcast->state = SA_REG_ACTIVE;

                                    /* Attach the QP to the multicast
group. */

-
if(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER)

+                                  if( 

+#if defined( CL_KERNEL )

+                                              (!skip_attach) && 

+#endif

+
(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER))

                                    {

                                                status =
verbs_attach_mcast(h_mcast);

                                                if( status != IB_SUCCESS
)

@@ -559,7 +833,15 @@

                                    ("IB_INVALID_MCAST_HANDLE\n") );

                        return IB_INVALID_MCAST_HANDLE;

            }

+#if defined( CL_KERNEL ) 

+          AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ib_leave_mcast KERNEL:IP :
%d.%d.%d.%d\n",0xE0 |
h_mcast->member_rec.mgid.multicast.raw_group_id[10],   

+
h_mcast->member_rec.mgid.multicast.raw_group_id[11],h_mcast->member_rec.
mgid.multicast.raw_group_id[12],

+
h_mcast->member_rec.mgid.multicast.raw_group_id[13]));

 

+          al_igmp_list_remove_record(&h_mcast->member_rec.mgid);

+

+#endif

            /* Record that we're already leaving the multicast group. */

            ref_al_obj( &h_mcast->obj );

            h_mcast->obj.pfn_destroy( &h_mcast->obj, pfn_destroy_cb );

@@ -647,6 +929,8 @@

                        h_attach->obj.pfn_destroy( &h_attach->obj, NULL
);

                        return status;

            }

+    /* called from user mode MC callback. By calling the function below
we indicate that user mode(MCE) is joined and attached*/

+          al_igmp_list_change_status_to_joined(p_mcast_gid);

 

            /* The proxy will release the reference taken in
init_al_obj. */

            *ph_attach = h_attach;

Index: core/al/ib_common.h

===================================================================

--- core/al/ib_common.h (revision 1261)

+++ core/al/ib_common.h          (working copy)

@@ -35,6 +35,7 @@

 

 

 #include <complib/cl_types.h>

+#include <complib/cl_qlist.h>

 #include <iba/ib_types.h>

 

 

@@ -47,4 +48,29 @@

            IN                                             ib_ca_attr_t*
const                                 p_dest,

            IN                     const    ib_ca_attr_t* const
p_src );

 

+typedef enum   _JOIN_STATE

+{

+          STATE_NOT_JOINED = 0,

+          STATE_JOINED

+}JOIN_STATE;

+

+typedef enum

+{

+          JOIN_SRC_MCE = 1,

+          JOIN_SRC_IPOIB

+}JOIN_SRC;

+

+typedef struct _JOIN_RECORD

+{

+          cl_list_item_t      entry;

+          ib_gid_t        mgid;

+          ib_qp_handle_t  h_qp;

+          JOIN_STATE                 j_state;

+          int32_t        ref_cnt;

+          JOIN_SRC                    src_req;

+          uint64_t     time_stamp;

+}join_record_t;

+

+void al_igmp_list_remove_record(const ib_gid_t *p_mgid);

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user);

 #endif /* __IB_COMMON_H__ */

Index: core/al/kernel/al_proxy_subnet.c

===================================================================

--- core/al/kernel/al_proxy_subnet.c        (revision 1261)

+++ core/al/kernel/al_proxy_subnet.c      (working copy)

@@ -289,7 +289,22 @@

            p_sa_req->pfn_sa_req_cb = __proxy_sa_req_cb;

 

            p_ioctl->in.sa_req.p_attr = p_ioctl->in.attr;

+          if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_SET))

+          {

+                      status =
al_igmp_list_add_group(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid,
NULL,TRUE); 

 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR, ("Failed to add mc group\n") );

+                                  goto proxy_send_sa_req_err2;

+                      }

+          }

+          else if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_DELETE))

+    {

+
al_igmp_list_remove_record(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid);

+          }

            /*

             * We never pass the user-mode flag when sending SA requests
- the

             * I/O manager will perform all synchronization to make this
IRP sync

 

Slava Strebkov

SW Engineer

Voltaire

099718750

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080612/d590147c/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: igmp_support.diff
Type: application/octet-stream
Size: 14586 bytes
Desc: igmp_support.diff
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080612/d590147c/attachment.obj>


More information about the ofw mailing list