[ofw] [owf][patch] user multicast offload support

Slava Strebkov slavas at voltaire.com
Tue Jul 1 04:21:50 PDT 2008


Voltaire has developed an LSP for multicast offload. The fast data path
will use this service provider, which works over IB using ibal.dll.

The original TCP/IPoIB service provider is used to transfer IGMP requests
to Ethernet routers, which may be connected to the IB host through an IPR.

IB joins will come both from user mode and from IPoIB. The common place
for them is ibal.lib.

To improve HCA performance on the receive path, IPoIB does not issue
ATTACH to a group that has already been attached from user mode by the LSP.

The code below implements this feature.

Slava

 

________________________________

From: Sean Hefty [mailto:sean.hefty at intel.com] 
Sent: Monday, June 30, 2008 7:59 PM
To: Slava Strebkov; ofw at lists.openfabrics.org
Subject: RE: [ofw] [owf][patch] user multicast offload support

 

The patch refers to ipoib support, but all of the code changes are
entirely within ibal.  I think this needs an entirely different approach
because of that.

 

- Sean

 

Please review the code that adds support for a user-mode component
that performs multicast offload for applications.

The idea is to allow the host to issue IGMP messages through IPoIB. No data
flows over IPoIB, since there is no ATTACH when user-mode mcast offload is
in use. This prevents data duplication on the receive path.

An IB leave is always sent if it is issued from a user-mode application; if
the leave is requested from IPoIB, it is sent only when the reference count
reaches 0.

 

 

 

 

 

Index: core/al/al_init.c

===================================================================

--- core/al/al_init.c         (revision 1302)

+++ core/al/al_init.c       (working copy)

@@ -49,6 +49,9 @@

 

 uint32_t                                                g_al_dbg_level
= TRACE_LEVEL_ERROR;

 uint32_t                                                g_al_dbg_flags
= 0xf0;

+extern void igmp_list_init();

+extern void igmp_list_destroy();

+

 /*

  * Device driver initialization routine.

  */

@@ -112,6 +115,8 @@

                        return status;

            }

 

+          igmp_list_init();

+

            AL_EXIT( AL_DBG_DEV );

            return status;

 }

@@ -169,5 +174,7 @@

                        gp_async_proc_mgr = NULL;

            }

 

+    igmp_list_destroy();

+

            AL_PRINT_EXIT( TRACE_LEVEL_WARNING, AL_DBG_DEV, ("Goodbye
Cruel World =(\n") );

 }

Index: core/al/al_mcast.c

===================================================================

--- core/al/al_mcast.c     (revision 1302)

+++ core/al/al_mcast.c  (working copy)

@@ -96,10 +96,229 @@

 static void

 __free_attach(

            IN                                             al_obj_t
*p_obj );

-#endif

+struct 

+{

+          cl_spinlock_t     mc_list_lock;

+          cl_qlist_t                mc_group_list;

+}g_mc_g;

 

 

+/**********************************************************************
*********************

+*         name:   igmp_list_init

+*         input:    no

+*   return:        void

+*   initializes list and spin lock for igmp list maintenance

+***********************************************************************
*********************/ 

+void igmp_list_init()

+{

+          AL_ENTER( AL_DBG_MCAST );

 

+          cl_spinlock_construct( &g_mc_g.mc_list_lock );

+          cl_spinlock_init( &g_mc_g.mc_list_lock );

+          cl_qlist_init( &g_mc_g.mc_group_list );

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+/**********************************************************************
*********************

+*         name:   igmp_list_destroy

+*         input:    no

+*   return:        void

+*   Release spin lock of igmp list 

+***********************************************************************
*********************/ 

+void igmp_list_destroy()

+{

+          AL_ENTER( AL_DBG_MCAST );

+

+          CL_ASSERT(0 == cl_qlist_count( &g_mc_g.mc_group_list ));

+

+          cl_spinlock_destroy( &g_mc_g.mc_list_lock );

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+

+static join_record_t* igmp_list_find_group(const ib_gid_t *p_mgid)

+{

+          cl_list_item_t
*pItem;

+          join_record_t                                         *mc_gr
= NULL;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+          for( pItem = cl_qlist_head( &g_mc_g.mc_group_list );

+                      pItem != cl_qlist_end( &g_mc_g.mc_group_list );

+                      pItem = cl_qlist_next( pItem ) )

+          {

+

+                      mc_gr = CONTAINING_RECORD( pItem, join_record_t,
entry );

+                      if ( sizeof(mc_gr->mgid.raw) ==
RtlCompareMemory(mc_gr->mgid.raw,p_mgid->raw,sizeof(mc_gr->mgid.raw)))

+                      {

+
cl_spinlock_release(&g_mc_g.mc_list_lock);

+                                  AL_EXIT( AL_DBG_MCAST );

+                                  return mc_gr;

+                      }

+          }

+          cl_spinlock_release(&g_mc_g.mc_list_lock);

+          AL_EXIT( AL_DBG_MCAST );

+          return NULL;

+}

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_add_group

+*         input:    ib_gid_t, ib_qp_handle_t,boolean_t

+*   return:        ib_api_status_t

+*   Adds MC group description in the list, increments ref count if
exists

+***********************************************************************
*********************/ 

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user)

+          {

+                      join_record_t
*mc_gr = NULL;

+

+                      AL_ENTER( AL_DBG_MCAST );

+

+                      mc_gr = igmp_list_find_group(p_mgid);

+                      if (mc_gr)

+                                  return IB_SUCCESS;

+                      

+                      mc_gr = cl_zalloc( sizeof(join_record_t) );

+                      if (! mc_gr) 

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_add_mcast : cl_zalloc
failed\n"));

+                                  return IB_INSUFFICIENT_MEMORY;

+                      }

+

+
RtlCopyMemory(&mc_gr->mgid,p_mgid,sizeof(ib_gid_t));

+                      mc_gr->h_qp = h_qp;

+                      mc_gr->join_state = STATE_NOT_JOINED;

+                      mc_gr->join_source = is_user ? JOIN_SRC_MCE :
JOIN_SRC_IPOIB;

+                      mc_gr->time_stamp = cl_get_time_stamp();

+                      cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+                      cl_qlist_insert_head(&g_mc_g.mc_group_list,
&mc_gr->entry);

+                      cl_spinlock_release(&g_mc_g.mc_list_lock);

+

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+
("al_igmp_list_add_group : ADDED IP = %d.%d.%d.%d from %s\n",0xE0 |
mc_gr->mgid.multicast.raw_group_id[10],    

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13],is_user ? "MCE" : "IPoIB"));

+

+                      AL_EXIT( AL_DBG_MCAST );

+

+                      return IB_SUCCESS;

+}

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_skip_attach

+*         input:    ib_gid_t *p_mgid

+*   return:        boolean_t

+*   Checks if p_mgid group already in the list and joined to Mcast
through MCE

+*         used in decision to do attach or not

+***********************************************************************
*********************/ 

+static boolean_t al_igmp_list_skip_attach(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          mc_gr = igmp_list_find_group(p_mgid);

+          if(! mc_gr)

+                      return FALSE;

+

+          if ((mc_gr->join_state == STATE_JOINED) &&
(mc_gr->join_source == JOIN_SRC_MCE))

+          {

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                              ("ATTACH Skipped for
%d.%d.%d.%d\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],

+
mc_gr->mgid.multicast.raw_group_id[13]));

+

+              return TRUE;

+          }

+

+          AL_EXIT( AL_DBG_MCAST );

+

+          return FALSE;

+}

+

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_change_status_to_joined

+*         input:    ib_gid_t *p_mgid

+*   return:        void

+*   Changes MC status of p_mgid group to JOINED

+***********************************************************************
*********************/ 

+static void al_igmp_list_change_status_to_joined(const ib_gid_t
*p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+

+          mc_gr = igmp_list_find_group(p_mgid);

+          if(! mc_gr)

+                      return;

+

+          mc_gr->ref_cnt++;

+          mc_gr->join_state = STATE_JOINED;

+          AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                  ("igmp group %d.%d.%d.%d set to
STATE_JOINED\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],mc_gr->mgid.multicast.raw_group_id[13]));

+

+          AL_EXIT( AL_DBG_MCAST );

+}

+

+

+/**********************************************************************
*********************

+*         name:   al_igmp_list_remove_record

+*         input:    ib_gid_t *p_mgid

+*   return:        int32_t ref count

+*   Remove MC record with p_mgid from the list if ref count reaches 0

+***********************************************************************
*********************/ 

+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid)

+{

+          join_record_t                                         *mc_gr;

+

+          AL_ENTER( AL_DBG_MCAST );

+          mc_gr = igmp_list_find_group(p_mgid);

+

+          if(! mc_gr)

+                      return 0;

+

+          CL_ASSERT((mc_gr->join_state == STATE_NOT_JOINED) ||
(mc_gr->ref_cnt > 0));

+          if ((--mc_gr->ref_cnt) > 0)

+          {

+                      if ((1 == mc_gr->ref_cnt)&&(mc_gr->join_source ==
JOIN_SRC_MCE))

+                      {

+                                  /* set the last reference to ipoib -
will not prevent ATTACH */

+                                  mc_gr->join_source = JOIN_SRC_IPOIB;

+                      }

+                      return mc_gr->ref_cnt;

+          }

+          else

+          {

+                      cl_spinlock_acquire(&g_mc_g.mc_list_lock);

+                      cl_qlist_remove_item( &g_mc_g.mc_group_list,
&mc_gr->entry );

+                      cl_spinlock_release(&g_mc_g.mc_list_lock); 

+

+                      AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                              ("igmp group %d.%d.%d.%d
removed\n",0xE0 | mc_gr->mgid.multicast.raw_group_id[10],          

+
mc_gr->mgid.multicast.raw_group_id[11],mc_gr->mgid.multicast.raw_group_i
d[12],mc_gr->mgid.multicast.raw_group_id[13]));

+

+                      cl_free(mc_gr);

+          }

+          AL_EXIT( AL_DBG_MCAST );

+          return 0;

+}

+#else

+void igmp_list_init()

+{

+          return;

+}

+void igmp_list_destroy()

+{

+          return;

+}

+

+#endif //CL_KERNEL

+

 ib_api_status_t

 al_join_mcast(

            IN                     const    ib_qp_handle_t FUNC_PTR64
h_qp,

@@ -112,6 +331,17 @@

 

            AL_ENTER( AL_DBG_MCAST );

 

+#ifdef CL_KERNEL

+

+                      status =
al_igmp_list_add_group(&p_mcast_req->member_rec.mgid, NULL,FALSE); 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,

+                                  ("al_join_mcast :
al_igmp_list_add_group FAILED status: %s\n", ib_get_err_str(status)) );

+                                  return status;

+                      }

+      

+#endif

            /*

             * Validate the port GUID.  There is no need to validate the
pkey index as

             * the user could change it later to make it invalid.  There
is also no

@@ -271,8 +501,14 @@

            sa_mad_data.p_attr = &h_mcast->member_rec;

 

            ref_al_obj( &h_mcast->obj );

-           status = al_send_sa_req(

-                       &h_mcast->sa_dereg_req, h_mcast->port_guid, 500,
0, &sa_mad_data, 0 );

+          status = 

+#if defined( CL_KERNEL )

+                      al_send_sa_req(

+                      &h_mcast->sa_dereg_req, h_mcast->port_guid,
g_mc_destr_retr_timeout, g_mc_destr_retr_count, &sa_mad_data, 0 );

+#else

+          al_send_sa_req(

+          &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0,
&sa_mad_data, 0 );

+#endif

            if( status != IB_SUCCESS )

                        deref_al_obj( &h_mcast->obj );

 

@@ -468,6 +704,9 @@

            ib_mcast_rec_t                          mcast_rec;

            boolean_t                                              sync;

 

+#if defined( CL_KERNEL )

+          boolean_t skip_attach = FALSE;

+#endif

            AL_ENTER( AL_DBG_MCAST );

 

            h_mcast = PARENT_STRUCT( p_item, ib_mcast_t, async );

@@ -497,18 +736,40 @@

                        /* Ensure that the user wants the join operation
to proceed. */

                        if( h_mcast->state == SA_REG_STARTING )

                        {

+#if defined( CL_KERNEL )

+                                  /* It's a IPoIB join callback - check
for attach */

+                                  skip_attach =
al_igmp_list_skip_attach(&h_mcast->member_rec.mgid);

+                                  if (skip_attach)

+                                  {

+                                              AL_PRINT(
TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ATTACH Skipped\n") );

+                                  }

+

+                                  /* It's a IPoIB - change status to
JOINED and inc ref count*/

+
al_igmp_list_change_status_to_joined(&h_mcast->member_rec.mgid);

+#endif

                                    /*

                                     * Change the state here so that we
avoid trying to cancel

                                     * the request if the verb operation
fails.

                                     */

                                    h_mcast->state = SA_REG_ACTIVE;

                                    /* Attach the QP to the multicast
group. */

+#if defined( CL_KERNEL )

+                                  if( (!skip_attach) &&

+
(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER))

+                                  {

+                                              status =
verbs_attach_mcast(h_mcast);

+                                              if( status != IB_SUCCESS
)

+                                                          AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );

+                                  }

+#else

 
if(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER)

                                    {

                                                status =
verbs_attach_mcast(h_mcast);

                                                if( status != IB_SUCCESS
)

                                                            AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );

                                    }

+#endif

                                    mcast_rec.h_mcast = h_mcast;

                                    

                        }

@@ -559,7 +820,16 @@

                                    ("IB_INVALID_MCAST_HANDLE\n") );

                        return IB_INVALID_MCAST_HANDLE;

            }

+#if defined( CL_KERNEL ) 

+          AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,

+                                    ("ib_leave_mcast KERNEL:IP :
%d.%d.%d.%d\n",0xE0 |
h_mcast->member_rec.mgid.multicast.raw_group_id[10],   

+
h_mcast->member_rec.mgid.multicast.raw_group_id[11],h_mcast->member_rec.
mgid.multicast.raw_group_id[12],

+
h_mcast->member_rec.mgid.multicast.raw_group_id[13]));

 

+          /* when ref count is not 0, don't send leave from IPoIB */

+          if (al_igmp_list_remove_record(&h_mcast->member_rec.mgid))

+                      return IB_SUCCESS;

+#endif

            /* Record that we're already leaving the multicast group. */

            ref_al_obj( &h_mcast->obj );

            h_mcast->obj.pfn_destroy( &h_mcast->obj, pfn_destroy_cb );

@@ -647,6 +917,8 @@

                        h_attach->obj.pfn_destroy( &h_attach->obj, NULL
);

                        return status;

            }

+    /* called from user mode MC callback. By calling the function below
we indicate that user mode(MCE) is joined and attached*/

+          al_igmp_list_change_status_to_joined(p_mcast_gid);

 

            /* The proxy will release the reference taken in
init_al_obj. */

            *ph_attach = h_attach;

Index: core/al/ib_common.h

===================================================================

--- core/al/ib_common.h (revision 1302)

+++ core/al/ib_common.h          (working copy)

@@ -35,6 +35,7 @@

 

 

 #include <complib/cl_types.h>

+#include <complib/cl_qlist.h>

 #include <iba/ib_types.h>

 

 

@@ -47,4 +48,29 @@

            IN                                             ib_ca_attr_t*
const                                 p_dest,

            IN                     const    ib_ca_attr_t* const
p_src );

 

+typedef enum   _JOIN_STATE

+{

+          STATE_NOT_JOINED = 0,

+          STATE_JOINED

+}JOIN_STATE;

+

+typedef enum

+{

+          JOIN_SRC_MCE = 1,

+          JOIN_SRC_IPOIB

+}JOIN_SRC;

+

+typedef struct _JOIN_RECORD

+{

+          cl_list_item_t      entry;

+          ib_gid_t        mgid;

+          ib_qp_handle_t  h_qp;

+          JOIN_STATE                 join_state;

+          int32_t         ref_cnt;

+          JOIN_SRC                    join_source;

+          uint64_t     time_stamp;

+}join_record_t;

+

+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid);

+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t      h_qp, boolean_t is_user);

 #endif /* __IB_COMMON_H__ */

Index: core/al/kernel/al_proxy_subnet.c

===================================================================

--- core/al/kernel/al_proxy_subnet.c        (revision 1302)

+++ core/al/kernel/al_proxy_subnet.c      (working copy)

@@ -289,7 +289,23 @@

            p_sa_req->pfn_sa_req_cb = __proxy_sa_req_cb;

 

            p_ioctl->in.sa_req.p_attr = p_ioctl->in.attr;

+          if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_SET))

+          {

+                      status =
al_igmp_list_add_group(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid,
NULL,TRUE); 

 

+                      if (status != IB_SUCCESS)

+                      {

+                                  AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR, ("Failed to add mc group\n") );

+                                  goto proxy_send_sa_req_err2;

+                      }

+          }

+          else if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \

+                                  (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_DELETE))

+    {

+                      /* always send leave when call from user mode */

+
al_igmp_list_remove_record(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid);

+          }

            /*

             * We never pass the user-mode flag when sending SA requests
- the

             * I/O manager will perform all synchronization to make this
IRP sync

 

Slava 

 

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080701/ef3cdb64/attachment.html>


More information about the ofw mailing list