[ofw] [owf][patch] user multicast offload support

Sean Hefty sean.hefty at intel.com
Mon Jun 30 09:59:09 PDT 2008


The patch refers to ipoib support, but all of the code changes are entirely
within ibal.  I think this needs an entirely different approach because of that.

 

- Sean

 

Please review the code that adds support for a user-mode component performing
multicast offload for applications.

The idea is to allow the host to issue IGMP messages through IPoIB. No data
flows over IPoIB, since no ATTACH is performed when user multicast offload is in
use. This prevents data from being duplicated on the receive path.

An IB leave is always sent if issued from a user-mode application; if the leave
was requested from IPoIB, it is sent only when the reference count reaches 0.

 

 

 

 

 

Index: core/al/al_init.c

===================================================================

--- core/al/al_init.c         (revision 1302)

+++ core/al/al_init.c       (working copy)

@@ -49,6 +49,9 @@

 

 uint32_t                                                g_al_dbg_level =
TRACE_LEVEL_ERROR;

 uint32_t                                                g_al_dbg_flags = 0xf0;

+extern void igmp_list_init();

+extern void igmp_list_destroy();

+

 /*

  * Device driver initialization routine.

  */

@@ -112,6 +115,8 @@

                        return status;

            }

 

+          igmp_list_init();

+

            AL_EXIT( AL_DBG_DEV );

            return status;

 }

@@ -169,5 +174,7 @@

                        gp_async_proc_mgr = NULL;

            }

 

+    igmp_list_destroy();

+

            AL_PRINT_EXIT( TRACE_LEVEL_WARNING, AL_DBG_DEV, ("Goodbye Cruel
World =(\n") );

 }

Index: core/al/al_mcast.c

===================================================================

--- core/al/al_mcast.c     (revision 1302)

+++ core/al/al_mcast.c  (working copy)

@@ -96,10 +96,229 @@

 static void

 __free_attach(

            IN                                             al_obj_t
*p_obj );

-#endif

+struct 

+{

+          cl_spinlock_t     mc_list_lock;

+          cl_qlist_t                mc_group_list;

+}g_mc_g;

 

 

+/******************************************************************************
*************

+*         name:   igmp_list_init

+*         input:    no

+*   return:        void

+*   initializes list and spin lock for igmp list maintenance

+*******************************************************************************
*************/ 

+void igmp_list_init()

+{

+          AL_ENTER( AL_DBG_MCAST );

 

+          cl_spinlock_construct( &g_mc_g.mc_list_lock );

+          cl_spinlock_init( &g_mc_g.mc_list_lock );

+          cl_qlist_init( &g_mc_g.mc_group_list );

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+/******************************************************************************
*************

+*         name:   igmp_list_destroy

+*         input:    no

+*   return:        void

+*   Release spin lock of igmp list 

+*******************************************************************************
*************/ 

+void igmp_list_destroy()

+{

+          AL_ENTER( AL_DBG_MCAST );

+

+          CL_ASSERT(0 == cl_qlist_count( &g_mc_g.mc_group_list ));

+

+          cl_spinlock_destroy( &g_mc_g.mc_list_lock );

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+

+static join_record_t* igmp_list_find_group(const ib_gid_t *p_mgid)

+{

+          cl_list_item_t                                          *pItem;

+          join_record_t                                         *mc_gr = NULL;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+          for( pItem = cl_qlist_head( &g_mc_g.mc_group_list );

+                      pItem != cl_qlist_end( &g_mc_g.mc_group_list );

+                      pItem = cl_qlist_next( pItem ) )

+          {

+

+                      mc_gr = CONTAINING_RECORD( pItem, join_record_t, entry );

+                      if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+                      {

+                                  cl_spinlock_release(&g_mc_g.mc_list_lock);

+                                  AL_EXIT( AL_DBG_MCAST );

+                                  return mc_gr;

+                      }

+          }

+          cl_spinlock_release(&g_mc_g.mc_list_lock);

+          AL_EXIT( AL_DBG_MCAST );

+          return NULL;

+}

+

+/******************************************************************************
*************

+*         name:   al_igmp_list_add_group

+*         input:    ib_gid_t, ib_qp_handle_t,boolean_t

+*   return:        ib_api_status_t

+*   Adds MC group description in the list, increments ref count if exists

+*******************************************************************************
*************/ 

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user)

+          {

+                      join_record_t
*mc_gr = NULL;

+

+                      AL_ENTER( AL_DBG_MCAST );

+

+                      mc_gr = igmp_list_find_group(p_mgid);

+                      if (mc_gr)

+                                  return IB_SUCCESS;

+                      

+                      mc_gr = cl_zalloc( sizeof(join_record_t) );

+                      if (! mc_gr) 

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_add_mcast : cl_zalloc failed\n"));

+                                  return IB_INSUFFICIENT_MEMORY;

+                      }

+

+                      RtlCopyMemory(&mc_gr->mgid,p_mgid,sizeof(ib_gid_t));

+                      mc_gr->h_qp = h_qp;

+                      mc_gr->join_state = STATE_NOT_JOINED;

+                      mc_gr->join_source = is_user ? JOIN_SRC_MCE :
JOIN_SRC_IPOIB;

+                      mc_gr->time_stamp = cl_get_time_stamp();

+                      cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+                      cl_qlist_insert_head(&g_mc_g.mc_group_list,
&mc_gr->entry);

+                      cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                                ("al_igmp_list_add_group :
ADDED IP = %d.%d.%d.%d from %s\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],


+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_id[12],

+
mc_gr->mgid.multicast.raw_group_id[13],is_user ? "MCE" : "IPoIB"));

+

+                      AL_EXIT( AL_DBG_MCAST );

+

+                      return IB_SUCCESS;

+}

+

+/******************************************************************************
*************

+*         name:   al_igmp_list_skip_attach

+*         input:    ib_gid_t *p_mgid

+*   return:        boolean_t

+*   Checks if p_mgid group already in the list and joined to Mcast through MCE

+*         used in decision to do attach or not

+*******************************************************************************
*************/ 

+static boolean_t al_igmp_list_skip_attach(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          mc_gr = igmp_list_find_group(p_mgid);

+          if(! mc_gr)

+                      return FALSE;

+

+          if ((mc_gr->join_state == STATE_JOINED) && (mc_gr->join_source ==
JOIN_SRC_MCE))

+          {

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                              ("ATTACH Skipped for
%d.%d.%d.%d\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_id[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+

+              return TRUE;

+          }

+

+          AL_EXIT( AL_DBG_MCAST );

+

+          return FALSE;

+}

+

+

+/******************************************************************************
*************

+*         name:   al_igmp_list_change_status_to_joined

+*         input:    ib_gid_t *p_mgid

+*   return:        void

+*   Changes MC status of p_mgid group to JOINED

+*******************************************************************************
*************/ 

+static void al_igmp_list_change_status_to_joined(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          mc_gr = igmp_list_find_group(p_mgid);

+          if(! mc_gr)

+                      return;

+

+          mc_gr->ref_cnt++;

+          mc_gr->join_state = STATE_JOINED;

+          AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                  ("igmp group %d.%d.%d.%d set to
STATE_JOINED\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_id[12],mc
_gr->mgid.multicast.raw_group_id[13]));

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+

+/******************************************************************************
*************

+*         name:   al_igmp_list_remove_record

+*         input:    ib_gid_t *p_mgid

+*   return:        int32_t ref count

+*   Remove MC record with p_mgid from the list if ref count reaches 0

+*******************************************************************************
*************/ 

+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+          mc_gr = igmp_list_find_group(p_mgid);

+

+          if(! mc_gr)

+                      return 0;

+

+          CL_ASSERT((mc_gr->join_state == STATE_NOT_JOINED) || (mc_gr->ref_cnt
> 0));

+          if ((--mc_gr->ref_cnt) > 0)

+          {

+                      if ((1 == mc_gr->ref_cnt)&&(mc_gr->join_source ==
JOIN_SRC_MCE))

+                      {

+                                  /* set the last reference to ipoib - will not
prevent ATTACH */

+                                  mc_gr->join_source = JOIN_SRC_IPOIB;

+                      }

+                      return mc_gr->ref_cnt;

+          }

+          else

+          {

+                      cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+                      cl_qlist_remove_item( &g_mc_g.mc_group_list,
&mc_gr->entry );

+                      cl_spinlock_release(&g_mc_g.mc_list_lock); 

+

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                              ("igmp group %d.%d.%d.%d
removed\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_id[12],mc
_gr->mgid.multicast.raw_group_id[13]));

+

+                      cl_free(mc_gr);

+          }

+          AL_EXIT( AL_DBG_MCAST );

+          return 0;

+}

+#else

+void igmp_list_init()

+{

+          return;

+}

+void igmp_list_destroy()

+{

+          return;

+}

+

+#endif //CL_KERNEL

+

 ib_api_status_t

 al_join_mcast(

            IN                     const    ib_qp_handle_t FUNC_PTR64
h_qp,

@@ -112,6 +331,17 @@

 

            AL_ENTER( AL_DBG_MCAST );

 

+#ifdef CL_KERNEL

+

+                      status =
al_igmp_list_add_group(&p_mcast_req->member_rec.mgid, NULL,FALSE); 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_join_mcast : al_igmp_list_add_group
FAILED status: %s\n", ib_get_err_str(status)) );

+                                  return status;

+                      }

+      

+#endif

            /*

             * Validate the port GUID.  There is no need to validate the pkey
index as

             * the user could change it later to make it invalid.  There is also
no

@@ -271,8 +501,14 @@

            sa_mad_data.p_attr = &h_mcast->member_rec;

 

            ref_al_obj( &h_mcast->obj );

-           status = al_send_sa_req(

-                       &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0,
&sa_mad_data, 0 );

+          status = 

+#if defined( CL_KERNEL )

+                      al_send_sa_req(

+                      &h_mcast->sa_dereg_req, h_mcast->port_guid,
g_mc_destr_retr_timeout, g_mc_destr_retr_count, &sa_mad_data, 0 );

+#else

+          al_send_sa_req(

+          &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0, &sa_mad_data, 0
);

+#endif

            if( status != IB_SUCCESS )

                        deref_al_obj( &h_mcast->obj );

 

@@ -468,6 +704,9 @@

            ib_mcast_rec_t                          mcast_rec;

            boolean_t                                              sync;

 

+#if defined( CL_KERNEL )

+          boolean_t skip_attach = FALSE;

+#endif

            AL_ENTER( AL_DBG_MCAST );

 

            h_mcast = PARENT_STRUCT( p_item, ib_mcast_t, async );

@@ -497,18 +736,40 @@

                        /* Ensure that the user wants the join operation to
proceed. */

                        if( h_mcast->state == SA_REG_STARTING )

                        {

+#if defined( CL_KERNEL )

+                                  /* It's a IPoIB join callback - check for
attach */

+                                  skip_attach =
al_igmp_list_skip_attach(&h_mcast->member_rec.mgid);

+                                  if (skip_attach)

+                                  {

+                                              AL_PRINT(
TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ATTACH Skipped\n") );

+                                  }

+

+                                  /* It's a IPoIB - change status to JOINED and
inc ref count*/

+
al_igmp_list_change_status_to_joined(&h_mcast->member_rec.mgid);

+#endif

                                    /*

                                     * Change the state here so that we avoid
trying to cancel

                                     * the request if the verb operation fails.

                                     */

                                    h_mcast->state = SA_REG_ACTIVE;

                                    /* Attach the QP to the multicast group. */

+#if defined( CL_KERNEL )

+                                  if( (!skip_attach) &&

+
(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER))

+                                  {

+                                              status =
verbs_attach_mcast(h_mcast);

+                                              if( status != IB_SUCCESS )

+                                                          AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );

+                                  }

+#else

 
if(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER)

                                    {

                                                status =
verbs_attach_mcast(h_mcast);

                                                if( status != IB_SUCCESS )

                                                            AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );

                                    }

+#endif

                                    mcast_rec.h_mcast = h_mcast;

                                    

                        }

@@ -559,7 +820,16 @@

                                    ("IB_INVALID_MCAST_HANDLE\n") );

                        return IB_INVALID_MCAST_HANDLE;

            }

+#if defined( CL_KERNEL ) 

+          AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ib_leave_mcast KERNEL:IP :
%d.%d.%d.%d\n",0xE0 | h_mcast->member_rec.mgid.multicast.raw_group_id[10],   

+
h_mcast->member_rec.mgid.multicast.raw_group_id[11],h_mcast->member_rec.mgid.mul
ticast.raw_group_id[12],

+
h_mcast->member_rec.mgid.multicast.raw_group_id[13]));

 

+          /* when ref count is not 0, don't send leave from IPoIB */

+          if (al_igmp_list_remove_record(&h_mcast->member_rec.mgid))

+                      return IB_SUCCESS;

+#endif

            /* Record that we're already leaving the multicast group. */

            ref_al_obj( &h_mcast->obj );

            h_mcast->obj.pfn_destroy( &h_mcast->obj, pfn_destroy_cb );

@@ -647,6 +917,8 @@

                        h_attach->obj.pfn_destroy( &h_attach->obj, NULL );

                        return status;

            }

+    /* called from user mode MC callback. By calling the function below we
indicate that user mode(MCE) is joined and attached*/

+          al_igmp_list_change_status_to_joined(p_mcast_gid);

 

            /* The proxy will release the reference taken in init_al_obj. */

            *ph_attach = h_attach;

Index: core/al/ib_common.h

===================================================================

--- core/al/ib_common.h (revision 1302)

+++ core/al/ib_common.h          (working copy)

@@ -35,6 +35,7 @@

 

 

 #include <complib/cl_types.h>

+#include <complib/cl_qlist.h>

 #include <iba/ib_types.h>

 

 

@@ -47,4 +48,29 @@

            IN                                             ib_ca_attr_t* const
p_dest,

            IN                     const    ib_ca_attr_t* const
p_src );

 

+typedef enum   _JOIN_STATE

+{

+          STATE_NOT_JOINED = 0,

+          STATE_JOINED

+}JOIN_STATE;

+

+typedef enum

+{

+          JOIN_SRC_MCE = 1,

+          JOIN_SRC_IPOIB

+}JOIN_SRC;

+

+typedef struct _JOIN_RECORD

+{

+          cl_list_item_t      entry;

+          ib_gid_t        mgid;

+          ib_qp_handle_t  h_qp;

+          JOIN_STATE                 join_state;

+          int32_t         ref_cnt;

+          JOIN_SRC                    join_source;

+          uint64_t     time_stamp;

+}join_record_t;

+

+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid);

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user);

 #endif /* __IB_COMMON_H__ */

Index: core/al/kernel/al_proxy_subnet.c

===================================================================

--- core/al/kernel/al_proxy_subnet.c        (revision 1302)

+++ core/al/kernel/al_proxy_subnet.c      (working copy)

@@ -289,7 +289,23 @@

            p_sa_req->pfn_sa_req_cb = __proxy_sa_req_cb;

 

            p_ioctl->in.sa_req.p_attr = p_ioctl->in.attr;

+          if ((p_ioctl->in.sa_req.attr_id == IB_MAD_ATTR_MCMEMBER_RECORD) && \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_SET))

+          {

+                      status =
al_igmp_list_add_group(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid, NULL,TRUE); 

 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR, ("Failed to add mc group\n") );

+                                  goto proxy_send_sa_req_err2;

+                      }

+          }

+          else if ((p_ioctl->in.sa_req.attr_id == IB_MAD_ATTR_MCMEMBER_RECORD)
&& \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_DELETE))

+    {

+                      /* always send leave when call from user mode */

+
al_igmp_list_remove_record(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid);

+          }

            /*

             * We never pass the user-mode flag when sending SA requests - the

             * I/O manager will perform all synchronization to make this IRP
sync

 

Slava 

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080630/9273a349/attachment.html>


More information about the ofw mailing list