[ofw] [owf][patch] user multicast offload support
Slava Strebkov
slavas at voltaire.com
Mon Jun 30 01:19:51 PDT 2008
Please review the code that adds support for a user-mode component
that performs multicast offload for applications.
The idea is to allow the host to issue IGMP messages through IPoIB. No data
flows over IPoIB, because no ATTACH is performed when user mcast
offload is in use. This prevents data duplication on the receive path.
An IB leave is always sent when it is issued from a user-mode application;
when the leave is requested from IPoIB, it is sent only once the
reference count reaches 0.
Index: core/al/al_init.c
===================================================================
--- core/al/al_init.c (revision 1302)
+++ core/al/al_init.c (working copy)
@@ -49,6 +49,9 @@
uint32_t g_al_dbg_level
= TRACE_LEVEL_ERROR;
uint32_t g_al_dbg_flags
= 0xf0;
+extern void igmp_list_init();
+extern void igmp_list_destroy();
+
/*
* Device driver initialization routine.
*/
@@ -112,6 +115,8 @@
return status;
}
+ igmp_list_init();
+
AL_EXIT( AL_DBG_DEV );
return status;
}
@@ -169,5 +174,7 @@
gp_async_proc_mgr = NULL;
}
+ igmp_list_destroy();
+
AL_PRINT_EXIT( TRACE_LEVEL_WARNING, AL_DBG_DEV, ("Goodbye
Cruel World =(\n") );
}
Index: core/al/al_mcast.c
===================================================================
--- core/al/al_mcast.c (revision 1302)
+++ core/al/al_mcast.c (working copy)
@@ -96,10 +96,229 @@
static void
__free_attach(
IN al_obj_t
*p_obj );
-#endif
+struct /* Global bookkeeping for the IGMP multicast group list. */
+{
+ cl_spinlock_t mc_list_lock; /* Held while walking or modifying mc_group_list. */
+ cl_qlist_t mc_group_list; /* List of join_record_t, linked through their 'entry' member. */
+}g_mc_g;
+/*
+ * igmp_list_init
+ *
+ * Prepares the global multicast bookkeeping state: the group list and the
+ * spin lock that is taken around list operations.
+ * Takes no arguments and returns nothing.
+ */
+void igmp_list_init()
+{
+	AL_ENTER( AL_DBG_MCAST );
+
+	/* The list and the lock are independent objects; set up both. */
+	cl_qlist_init( &g_mc_g.mc_group_list );
+
+	cl_spinlock_construct( &g_mc_g.mc_list_lock );
+	cl_spinlock_init( &g_mc_g.mc_list_lock );
+
+	AL_EXIT( AL_DBG_MCAST );
+}
+
+/*
+ * igmp_list_destroy
+ *
+ * Destroys the spin lock that protects the IGMP group list.  The list is
+ * expected to be empty at this point; the CL_ASSERT below checks that.
+ * Takes no arguments and returns nothing.
+ */
+void igmp_list_destroy()
+{
+	AL_ENTER( AL_DBG_MCAST );
+
+	CL_ASSERT( cl_qlist_count( &g_mc_g.mc_group_list ) == 0 );
+	cl_spinlock_destroy( &g_mc_g.mc_list_lock );
+
+	AL_EXIT( AL_DBG_MCAST );
+}
+
+
+/*
+ * igmp_list_find_group
+ *
+ * Walks the global group list, with mc_list_lock held, looking for a
+ * record whose MGID bytes equal p_mgid->raw.
+ *
+ * Returns the matching record, or NULL when no record matches.
+ *
+ * NOTE(review): the lock is released before the pointer is handed back,
+ * so the caller touches the record without the lock held - confirm that
+ * callers cannot race with al_igmp_list_remove_record freeing it.
+ */
+static join_record_t* igmp_list_find_group(const ib_gid_t *p_mgid)
+{
+	cl_list_item_t	*p_cur;
+	join_record_t	*p_rec;
+	join_record_t	*p_found = NULL;
+
+	AL_ENTER( AL_DBG_MCAST );
+
+	cl_spinlock_acquire( &g_mc_g.mc_list_lock );
+	p_cur = cl_qlist_head( &g_mc_g.mc_group_list );
+	while( p_cur != cl_qlist_end( &g_mc_g.mc_group_list ) )
+	{
+		p_rec = CONTAINING_RECORD( p_cur, join_record_t, entry );
+		/* RtlCompareMemory yields the count of matching bytes. */
+		if( RtlCompareMemory( p_rec->mgid.raw, p_mgid->raw,
+			sizeof(p_rec->mgid.raw) ) == sizeof(p_rec->mgid.raw) )
+		{
+			p_found = p_rec;
+			break;
+		}
+		p_cur = cl_qlist_next( p_cur );
+	}
+	cl_spinlock_release( &g_mc_g.mc_list_lock );
+
+	AL_EXIT( AL_DBG_MCAST );
+	return p_found;
+}
+
+/*
+ * al_igmp_list_add_group
+ *
+ * Ensures that a record for p_mgid exists in the global group list.
+ *
+ *	p_mgid  - multicast GID identifying the group.
+ *	h_qp    - QP handle stored in a newly created record.
+ *	is_user - TRUE when the join originates from user mode (recorded as
+ *	          JOIN_SRC_MCE), FALSE for IPoIB (recorded as JOIN_SRC_IPOIB).
+ *
+ * A new record starts in STATE_NOT_JOINED.  This function never changes
+ * the reference count of a record that already exists.
+ *
+ * Returns IB_SUCCESS if the record already existed or was added, and
+ * IB_INSUFFICIENT_MEMORY if the record could not be allocated.
+ */
+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const ib_qp_handle_t h_qp, boolean_t is_user)
+{
+	join_record_t	*mc_gr;
+	cl_list_item_t	*p_item;
+	boolean_t		exists = FALSE;
+
+	AL_ENTER( AL_DBG_MCAST );
+
+	/* Cheap pre-check so the "already known" case never allocates. */
+	if( igmp_list_find_group( p_mgid ) )
+	{
+		AL_EXIT( AL_DBG_MCAST );
+		return IB_SUCCESS;
+	}
+
+	mc_gr = cl_zalloc( sizeof(join_record_t) );
+	if( !mc_gr )
+	{
+		AL_PRINT_EXIT( TRACE_LEVEL_ERROR, AL_DBG_ERROR,
+			("al_igmp_list_add_group : cl_zalloc failed\n") );
+		return IB_INSUFFICIENT_MEMORY;
+	}
+
+	RtlCopyMemory( &mc_gr->mgid, p_mgid, sizeof(ib_gid_t) );
+	mc_gr->h_qp = h_qp;
+	mc_gr->join_state = STATE_NOT_JOINED;
+	mc_gr->join_source = is_user ? JOIN_SRC_MCE : JOIN_SRC_IPOIB;
+	mc_gr->time_stamp = cl_get_time_stamp();
+
+	/*
+	 * Re-check and insert under one hold of the lock.  Another caller may
+	 * have added the same group between the pre-check above and this
+	 * point; inserting blindly would leave two records for one MGID.
+	 */
+	cl_spinlock_acquire( &g_mc_g.mc_list_lock );
+	for( p_item = cl_qlist_head( &g_mc_g.mc_group_list );
+		p_item != cl_qlist_end( &g_mc_g.mc_group_list );
+		p_item = cl_qlist_next( p_item ) )
+	{
+		join_record_t *p_cur =
+			CONTAINING_RECORD( p_item, join_record_t, entry );
+
+		if( RtlCompareMemory( p_cur->mgid.raw, p_mgid->raw,
+			sizeof(p_cur->mgid.raw) ) == sizeof(p_cur->mgid.raw) )
+		{
+			exists = TRUE;
+			break;
+		}
+	}
+	if( !exists )
+		cl_qlist_insert_head( &g_mc_g.mc_group_list, &mc_gr->entry );
+	cl_spinlock_release( &g_mc_g.mc_list_lock );
+
+	if( exists )
+	{
+		/* Lost the race: the group is already tracked, drop our copy. */
+		cl_free( mc_gr );
+		AL_EXIT( AL_DBG_MCAST );
+		return IB_SUCCESS;
+	}
+
+	/*
+	 * Trace from the caller's GID rather than from the record: once the
+	 * record is on the list and the lock is released, it is no longer
+	 * exclusively owned by this function.
+	 */
+	AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+		("al_igmp_list_add_group : ADDED IP = %d.%d.%d.%d from %s\n",
+		0xE0 | p_mgid->multicast.raw_group_id[10],
+		p_mgid->multicast.raw_group_id[11],
+		p_mgid->multicast.raw_group_id[12],
+		p_mgid->multicast.raw_group_id[13],
+		is_user ? "MCE" : "IPoIB") );
+
+	AL_EXIT( AL_DBG_MCAST );
+
+	return IB_SUCCESS;
+}
+
+/*
+ * al_igmp_list_skip_attach
+ *
+ * Decides whether the QP attach should be skipped for group p_mgid.
+ *
+ * Returns TRUE only when a record for p_mgid is on the list, its state is
+ * STATE_JOINED and its join source is JOIN_SRC_MCE.  Returns FALSE in all
+ * other cases, including when no record exists.
+ *
+ * The record is inspected while mc_list_lock is held so that it cannot be
+ * unlinked and freed underneath this function.
+ */
+static boolean_t al_igmp_list_skip_attach(const ib_gid_t *p_mgid)
+{
+	cl_list_item_t	*p_item;
+	join_record_t	*mc_gr;
+	boolean_t		skip = FALSE;
+
+	AL_ENTER( AL_DBG_MCAST );
+
+	cl_spinlock_acquire( &g_mc_g.mc_list_lock );
+	for( p_item = cl_qlist_head( &g_mc_g.mc_group_list );
+		p_item != cl_qlist_end( &g_mc_g.mc_group_list );
+		p_item = cl_qlist_next( p_item ) )
+	{
+		mc_gr = CONTAINING_RECORD( p_item, join_record_t, entry );
+		if( RtlCompareMemory( mc_gr->mgid.raw, p_mgid->raw,
+			sizeof(mc_gr->mgid.raw) ) == sizeof(mc_gr->mgid.raw) )
+		{
+			if( (mc_gr->join_state == STATE_JOINED) &&
+				(mc_gr->join_source == JOIN_SRC_MCE) )
+			{
+				skip = TRUE;
+			}
+			break;
+		}
+	}
+	cl_spinlock_release( &g_mc_g.mc_list_lock );
+
+	if( skip )
+	{
+		/* p_mgid holds the same bytes as the matched record's MGID. */
+		AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+			("ATTACH Skipped for %d.%d.%d.%d\n",
+			0xE0 | p_mgid->multicast.raw_group_id[10],
+			p_mgid->multicast.raw_group_id[11],
+			p_mgid->multicast.raw_group_id[12],
+			p_mgid->multicast.raw_group_id[13]) );
+	}
+
+	/* Single exit so that every AL_ENTER is paired with an AL_EXIT. */
+	AL_EXIT( AL_DBG_MCAST );
+
+	return skip;
+}
+
+
+/*
+ * al_igmp_list_change_status_to_joined
+ *
+ * Marks the record for group p_mgid as STATE_JOINED and takes one
+ * reference on it.  Does nothing when no record for p_mgid is on the list.
+ *
+ * The lookup and the update happen under one hold of mc_list_lock, so the
+ * reference count change cannot interleave with another update and the
+ * record cannot be freed between being found and being modified.
+ */
+static void al_igmp_list_change_status_to_joined(const ib_gid_t *p_mgid)
+{
+	cl_list_item_t	*p_item;
+	boolean_t		updated = FALSE;
+
+	AL_ENTER( AL_DBG_MCAST );
+
+	cl_spinlock_acquire( &g_mc_g.mc_list_lock );
+	for( p_item = cl_qlist_head( &g_mc_g.mc_group_list );
+		p_item != cl_qlist_end( &g_mc_g.mc_group_list );
+		p_item = cl_qlist_next( p_item ) )
+	{
+		join_record_t *mc_gr =
+			CONTAINING_RECORD( p_item, join_record_t, entry );
+
+		if( RtlCompareMemory( mc_gr->mgid.raw, p_mgid->raw,
+			sizeof(mc_gr->mgid.raw) ) == sizeof(mc_gr->mgid.raw) )
+		{
+			mc_gr->ref_cnt++;
+			mc_gr->join_state = STATE_JOINED;
+			updated = TRUE;
+			break;
+		}
+	}
+	cl_spinlock_release( &g_mc_g.mc_list_lock );
+
+	if( updated )
+	{
+		/* p_mgid holds the same bytes as the matched record's MGID. */
+		AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+			("igmp group %d.%d.%d.%d set to STATE_JOINED\n",
+			0xE0 | p_mgid->multicast.raw_group_id[10],
+			p_mgid->multicast.raw_group_id[11],
+			p_mgid->multicast.raw_group_id[12],
+			p_mgid->multicast.raw_group_id[13]) );
+	}
+
+	/* Single exit so that every AL_ENTER is paired with an AL_EXIT. */
+	AL_EXIT( AL_DBG_MCAST );
+}
+
+
+/*
+ * al_igmp_list_remove_record
+ *
+ * Drops one reference on the record for group p_mgid and removes the
+ * record from the list once no references remain.
+ *
+ * Returns the remaining reference count when it is still positive.
+ * Returns 0 when the record was removed and freed, or when no record for
+ * p_mgid exists.
+ *
+ * When exactly one reference remains on a record whose join source is
+ * JOIN_SRC_MCE, the source is switched to JOIN_SRC_IPOIB so that the last
+ * reference will not prevent ATTACH.
+ *
+ * The lookup, decrement and unlink happen under one hold of mc_list_lock
+ * so that two concurrent callers cannot both unlink and free the record.
+ */
+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid)
+{
+	cl_list_item_t	*p_item;
+	join_record_t	*mc_gr = NULL;
+	int32_t			ref_cnt;
+
+	AL_ENTER( AL_DBG_MCAST );
+
+	cl_spinlock_acquire( &g_mc_g.mc_list_lock );
+	for( p_item = cl_qlist_head( &g_mc_g.mc_group_list );
+		p_item != cl_qlist_end( &g_mc_g.mc_group_list );
+		p_item = cl_qlist_next( p_item ) )
+	{
+		join_record_t *p_cur =
+			CONTAINING_RECORD( p_item, join_record_t, entry );
+
+		if( RtlCompareMemory( p_cur->mgid.raw, p_mgid->raw,
+			sizeof(p_cur->mgid.raw) ) == sizeof(p_cur->mgid.raw) )
+		{
+			mc_gr = p_cur;
+			break;
+		}
+	}
+
+	if( !mc_gr )
+	{
+		cl_spinlock_release( &g_mc_g.mc_list_lock );
+		AL_EXIT( AL_DBG_MCAST );
+		return 0;
+	}
+
+	CL_ASSERT( (mc_gr->join_state == STATE_NOT_JOINED) ||
+		(mc_gr->ref_cnt > 0) );
+
+	ref_cnt = --mc_gr->ref_cnt;
+	if( ref_cnt > 0 )
+	{
+		if( (1 == ref_cnt) && (mc_gr->join_source == JOIN_SRC_MCE) )
+		{
+			/* set the last reference to ipoib - will not prevent ATTACH */
+			mc_gr->join_source = JOIN_SRC_IPOIB;
+		}
+		cl_spinlock_release( &g_mc_g.mc_list_lock );
+		AL_EXIT( AL_DBG_MCAST );
+		return ref_cnt;
+	}
+
+	/* No references left: unlink under the lock, free after releasing it. */
+	cl_qlist_remove_item( &g_mc_g.mc_group_list, &mc_gr->entry );
+	cl_spinlock_release( &g_mc_g.mc_list_lock );
+
+	AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+		("igmp group %d.%d.%d.%d removed\n",
+		0xE0 | mc_gr->mgid.multicast.raw_group_id[10],
+		mc_gr->mgid.multicast.raw_group_id[11],
+		mc_gr->mgid.multicast.raw_group_id[12],
+		mc_gr->mgid.multicast.raw_group_id[13]) );
+
+	cl_free( mc_gr );
+
+	AL_EXIT( AL_DBG_MCAST );
+	return 0;
+}
+#else
+/* Builds without CL_KERNEL keep no IGMP group list: nothing to set up. */
+void igmp_list_init()
+{
+}
+
+/* Builds without CL_KERNEL keep no IGMP group list: nothing to tear down. */
+void igmp_list_destroy()
+{
+}
+
+#endif //CL_KERNEL
+
ib_api_status_t
al_join_mcast(
IN const ib_qp_handle_t FUNC_PTR64
h_qp,
@@ -112,6 +331,17 @@
AL_ENTER( AL_DBG_MCAST );
+#ifdef CL_KERNEL
+
+ status =
al_igmp_list_add_group(&p_mcast_req->member_rec.mgid, NULL,FALSE);
+ if (status != IB_SUCCESS)
+ {
+ AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR,
+ ("al_join_mcast :
al_igmp_list_add_group FAILED status: %s\n", ib_get_err_str(status)) );
+ return status;
+ }
+
+#endif
/*
* Validate the port GUID. There is no need to validate the
pkey index as
* the user could change it later to make it invalid. There
is also no
@@ -271,8 +501,14 @@
sa_mad_data.p_attr = &h_mcast->member_rec;
ref_al_obj( &h_mcast->obj );
- status = al_send_sa_req(
- &h_mcast->sa_dereg_req, h_mcast->port_guid, 500,
0, &sa_mad_data, 0 );
+ status =
+#if defined( CL_KERNEL )
+ al_send_sa_req(
+ &h_mcast->sa_dereg_req, h_mcast->port_guid,
g_mc_destr_retr_timeout, g_mc_destr_retr_count, &sa_mad_data, 0 );
+#else
+ al_send_sa_req(
+ &h_mcast->sa_dereg_req, h_mcast->port_guid, 500, 0,
&sa_mad_data, 0 );
+#endif
if( status != IB_SUCCESS )
deref_al_obj( &h_mcast->obj );
@@ -468,6 +704,9 @@
ib_mcast_rec_t mcast_rec;
boolean_t sync;
+#if defined( CL_KERNEL )
+ boolean_t skip_attach = FALSE;
+#endif
AL_ENTER( AL_DBG_MCAST );
h_mcast = PARENT_STRUCT( p_item, ib_mcast_t, async );
@@ -497,18 +736,40 @@
/* Ensure that the user wants the join operation
to proceed. */
if( h_mcast->state == SA_REG_STARTING )
{
+#if defined( CL_KERNEL )
+ /* It's a IPoIB join callback - check
for attach */
+ skip_attach =
al_igmp_list_skip_attach(&h_mcast->member_rec.mgid);
+ if (skip_attach)
+ {
+ AL_PRINT(
TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+ ("ATTACH Skipped\n") );
+ }
+
+ /* It's a IPoIB - change status to
JOINED and inc ref count*/
+
al_igmp_list_change_status_to_joined(&h_mcast->member_rec.mgid);
+#endif
/*
* Change the state here so that we
avoid trying to cancel
* the request if the verb operation
fails.
*/
h_mcast->state = SA_REG_ACTIVE;
/* Attach the QP to the multicast
group. */
+#if defined( CL_KERNEL )
+ if( (!skip_attach) &&
+
(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER))
+ {
+ status =
verbs_attach_mcast(h_mcast);
+ if( status != IB_SUCCESS
)
+ AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );
+ }
+#else
if(ib_member_get_state(mcast_rec.p_member_rec->scope_state) ==
IB_MC_REC_STATE_FULL_MEMBER)
{
status =
verbs_attach_mcast(h_mcast);
if( status != IB_SUCCESS
)
AL_PRINT(
TRACE_LEVEL_ERROR, AL_DBG_MCAST, ("attach_mcast failed\n") );
}
+#endif
mcast_rec.h_mcast = h_mcast;
}
@@ -559,7 +820,16 @@
("IB_INVALID_MCAST_HANDLE\n") );
return IB_INVALID_MCAST_HANDLE;
}
+#if defined( CL_KERNEL )
+ AL_PRINT( TRACE_LEVEL_INFORMATION, AL_DBG_MCAST,
+ ("ib_leave_mcast KERNEL:IP :
%d.%d.%d.%d\n",0xE0 |
h_mcast->member_rec.mgid.multicast.raw_group_id[10],
+
h_mcast->member_rec.mgid.multicast.raw_group_id[11],h_mcast->member_rec.
mgid.multicast.raw_group_id[12],
+
h_mcast->member_rec.mgid.multicast.raw_group_id[13]));
+ /* when ref count is not 0, don't send leave from IPoIB */
+ if (al_igmp_list_remove_record(&h_mcast->member_rec.mgid))
+ return IB_SUCCESS;
+#endif
/* Record that we're already leaving the multicast group. */
ref_al_obj( &h_mcast->obj );
h_mcast->obj.pfn_destroy( &h_mcast->obj, pfn_destroy_cb );
@@ -647,6 +917,8 @@
h_attach->obj.pfn_destroy( &h_attach->obj, NULL
);
return status;
}
+ /* called from user mode MC callback. By calling the function below
we indicate that user mode(MCE) is joined and attached*/
+ al_igmp_list_change_status_to_joined(p_mcast_gid);
/* The proxy will release the reference taken in
init_al_obj. */
*ph_attach = h_attach;
Index: core/al/ib_common.h
===================================================================
--- core/al/ib_common.h (revision 1302)
+++ core/al/ib_common.h (working copy)
@@ -35,6 +35,7 @@
#include <complib/cl_types.h>
+#include <complib/cl_qlist.h>
#include <iba/ib_types.h>
@@ -47,4 +48,29 @@
IN ib_ca_attr_t*
const p_dest,
IN const ib_ca_attr_t* const
p_src );
+typedef enum _JOIN_STATE /* Join progress of a tracked multicast group. */
+{
+ STATE_NOT_JOINED = 0, /* Value given to a record when al_igmp_list_add_group creates it. */
+ STATE_JOINED /* Set by al_igmp_list_change_status_to_joined. */
+}JOIN_STATE;
+
+typedef enum /* Which component requested the join. */
+{
+ JOIN_SRC_MCE = 1, /* Recorded when al_igmp_list_add_group is called with is_user == TRUE. */
+ JOIN_SRC_IPOIB /* Recorded when al_igmp_list_add_group is called with is_user == FALSE. */
+}JOIN_SRC;
+
+typedef struct _JOIN_RECORD /* One tracked multicast group on the IGMP group list. */
+{
+ cl_list_item_t entry; /* Link used to place the record on the group list. */
+ ib_gid_t mgid; /* Multicast GID; records are looked up by comparing mgid.raw. */
+ ib_qp_handle_t h_qp; /* QP handle passed to al_igmp_list_add_group. */
+ JOIN_STATE join_state; /* STATE_NOT_JOINED until the group is marked joined. */
+ int32_t ref_cnt; /* Incremented when marked joined, decremented by al_igmp_list_remove_record. */
+ JOIN_SRC join_source; /* Origin of the join; may be switched to JOIN_SRC_IPOIB on removal. */
+ uint64_t time_stamp; /* cl_get_time_stamp() value taken when the record was created. */
+}join_record_t;
+
+int32_t al_igmp_list_remove_record(const ib_gid_t *p_mgid);
+ib_api_status_t al_igmp_list_add_group(const ib_gid_t *p_mgid, const
ib_qp_handle_t h_qp, boolean_t is_user);
#endif /* __IB_COMMON_H__ */
Index: core/al/kernel/al_proxy_subnet.c
===================================================================
--- core/al/kernel/al_proxy_subnet.c (revision 1302)
+++ core/al/kernel/al_proxy_subnet.c (working copy)
@@ -289,7 +289,23 @@
p_sa_req->pfn_sa_req_cb = __proxy_sa_req_cb;
p_ioctl->in.sa_req.p_attr = p_ioctl->in.attr;
+ if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \
+ (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_SET))
+ {
+ status =
al_igmp_list_add_group(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid,
NULL,TRUE);
+ if (status != IB_SUCCESS)
+ {
+ AL_PRINT_EXIT( TRACE_LEVEL_ERROR,
AL_DBG_ERROR, ("Failed to add mc group\n") );
+ goto proxy_send_sa_req_err2;
+ }
+ }
+ else if ((p_ioctl->in.sa_req.attr_id ==
IB_MAD_ATTR_MCMEMBER_RECORD) && \
+ (p_ioctl->in.sa_req.method ==
IB_MAD_METHOD_DELETE))
+ {
+ /* always send leave when call from user mode */
+
al_igmp_list_remove_record(&((ib_member_rec_t*)p_ioctl->in.attr)->mgid);
+ }
/*
* We never pass the user-mode flag when sending SA requests
- the
* I/O manager will perform all synchronization to make this
IRP sync
Slava
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080630/2d186249/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: mcast_offload_support.diff
Type: application/octet-stream
Size: 14273 bytes
Desc: mcast_offload_support.diff
URL: <http://lists.openfabrics.org/pipermail/ofw/attachments/20080630/2d186249/attachment.obj>
More information about the ofw
mailing list