[ofa-general] [PATCH] opensm: improve multicast re-routing requests processing

Eitan Zahavi eitan at mellanox.co.il
Wed Sep 9 11:27:49 PDT 2009


Hi All,
 
Sorry I am not following the exact details of the changes made.
But one thing must be clear:The deferred multicast handling is mandatory
for the SM.
The reason is that many requests may arrive together. The deferred
execution enables immediate response to the client 
and reducing the computation time dramatically. 
 
The need to respond immediately and not wait for the MFT calculation and
setting arises from the fact there
is only one timeout value for MAD requests. Increasing that timeout to
handle the expected delay on N MFT changes will make error handling too
slow.
 
The reduced computation comes from the fact that the routing is not
recalculated for each request but only after a set of changes were made.
This happens automatically since when the deferred execution happens all
the changes made from last MFT routing are handled together.
 
I am not sure what is the change but please make sure to test it on a
large cluster when all nodes joins or leaves at once multiple groups.
 
Eitan
 
Eitan Zahavi 
Senior Engineering Director
Mellanox Technologies LTD 
Tel:+972-4-9097208
Fax:+972-4-9593245 
P.O. Box 586 Yokneam 20692 ISRAEL 

 


________________________________

	From: Hal Rosenstock [mailto:hal.rosenstock at gmail.com] 
	Sent: Wednesday, September 09, 2009 1:23 AM
	To: Sasha Khapyorsky
	Cc: OpenIB; Eli Dorfman; Eitan Zahavi; Yevgeny Kliteynik
	Subject: Re: [ofa-general] [PATCH] opensm: improve multicast
re-routing requests processing
	
	


	On Sun, Sep 6, 2009 at 1:39 PM, Sasha Khapyorsky
<sashak at voltaire.com> wrote:
	


		When we have two or more changes in a same multicast
group multiple
		multicast rerouting requests will be created and
processed. To prevent
		this we will use array of requests indexed by mlid value
minus
		IB_LID_MCAST_START_HO and for each multicast group
change we will just
		mark that specific mlid requires re-routing and
"duplicated" requests
		will be merged there.
		
		Also in this way we will be able to process multicast
group routing
		entries deletion for already removed groups by just
knowing its MLID
		and not using its content - this will let us to not
delay mutlicast
		groups deletion ('to_be_deleted' flag) and will simplify
many multicast
		related code flows.
		

	 
	While the delay adds complexity, it is a feature. Delayed
deletion (and join) is allowed by IBA and is needed in a fast changing
subnet when there are a lot of groups changing. This was seen quite a
while ago and was how OpenSM evolved based on field experience and other
testing. Eitan is the expert here. IMO support for this needs to be
added (back in).
	 
	-- Hal
	 
	 

		
		Signed-off-by: Sasha Khapyorsky <sashak at voltaire.com>
		---
		 opensm/include/opensm/osm_sm.h |    4 +-
		 opensm/opensm/osm_mcast_mgr.c  |   27
+++++++++------------
		 opensm/opensm/osm_sm.c         |   49
+++++++++++++---------------------------
		 3 files changed, 30 insertions(+), 50 deletions(-)
		
		diff --git a/opensm/include/opensm/osm_sm.h
b/opensm/include/opensm/osm_sm.h
		index 0914a95..986143a 100644
		--- a/opensm/include/opensm/osm_sm.h
		+++ b/opensm/include/opensm/osm_sm.h
		@@ -126,8 +126,8 @@ typedef struct osm_sm {
		       cl_dispatcher_t *p_disp;
		       cl_plock_t *p_lock;
		       atomic32_t sm_trans_id;
		-       cl_spinlock_t mgrp_lock;
		-       cl_qlist_t mgrp_list;
		+       unsigned mlids_req_max;
		+       uint8_t *mlids_req;
		       osm_sm_mad_ctrl_t mad_ctrl;
		       osm_lid_mgr_t lid_mgr;
		       osm_ucast_mgr_t ucast_mgr;
		diff --git a/opensm/opensm/osm_mcast_mgr.c
b/opensm/opensm/osm_mcast_mgr.c
		index d7c5ce1..dd504ef 100644
		--- a/opensm/opensm/osm_mcast_mgr.c
		+++ b/opensm/opensm/osm_mcast_mgr.c
		@@ -1116,7 +1116,6 @@ static int
mcast_mgr_set_mftables(osm_sm_t * sm)
		 int osm_mcast_mgr_process(osm_sm_t * sm)
		 {
		       cl_qmap_t *p_sw_tbl;
		-       cl_qlist_t *p_list = &sm->mgrp_list;
		       osm_mgrp_t *p_mgrp;
		       int i, ret = 0;
		
		@@ -1150,16 +1149,14 @@ int
osm_mcast_mgr_process(osm_sm_t * sm)
		                       mcast_mgr_process_mgrp(sm,
p_mgrp);
		       }
		
		+       memset(sm->mlids_req, 0, sm->mlids_req_max);
		+       sm->mlids_req_max = 0;
		+
		       /*
		          Walk the switches and download the tables for
each.
		        */
		       ret = mcast_mgr_set_mftables(sm);
		
		-       while (!cl_is_qlist_empty(p_list)) {
		-               cl_list_item_t *p =
cl_qlist_remove_head(p_list);
		-               free(p);
		-       }
		-
		 exit:
		       CL_PLOCK_RELEASE(sm->p_lock);
		
		@@ -1174,11 +1171,10 @@ exit:
	
**********************************************************************/
		 int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
		 {
		-       cl_qlist_t *p_list = &sm->mgrp_list;
		       osm_mgrp_t *p_mgrp;
		       ib_net16_t mlid;
		-       osm_mcast_mgr_ctxt_t *ctx;
		       int ret = 0;
		+       unsigned i;
		
		       OSM_LOG_ENTER(sm->p_log);
		
		@@ -1192,14 +1188,12 @@ int
osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
		               goto exit;
		       }
		
		-       while (!cl_is_qlist_empty(p_list)) {
		-               ctx = (osm_mcast_mgr_ctxt_t *)
cl_qlist_remove_head(p_list);
		-
		-               /* nice copy no warning on size diff */
		-               memcpy(&mlid, &ctx->mlid, sizeof(mlid));
		+       for (i = 0; i <= sm->mlids_req_max; i++) {
		+               if (!sm->mlids_req[i])
		+                       continue;
		+               sm->mlids_req[i] = 0;
		
		-               /* we can destroy the context now */
		-               free(ctx);
		+               mlid = cl_hton16(i +
IB_LID_MCAST_START_HO);
		
		               /* since we delayed the execution we
prefer to pass the
		                  mlid as the mgrp identifier and then
find it or abort */
		@@ -1223,6 +1217,9 @@ int
osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
		               mcast_mgr_process_mgrp(sm, p_mgrp);
		       }
		
		+       memset(sm->mlids_req, 0, sm->mlids_req_max);
		+       sm->mlids_req_max = 0;
		+
		       /*
		          Walk the switches and download the tables for
each.
		        */
		diff --git a/opensm/opensm/osm_sm.c
b/opensm/opensm/osm_sm.c
		index 50aee91..e446c9d 100644
		--- a/opensm/opensm/osm_sm.c
		+++ b/opensm/opensm/osm_sm.c
		@@ -166,7 +166,6 @@ void osm_sm_construct(IN osm_sm_t *
p_sm)
		       cl_event_construct(&p_sm->subnet_up_event);
	
cl_event_wheel_construct(&p_sm->trap_aging_tracker);
		       cl_thread_construct(&p_sm->sweeper);
		-       cl_spinlock_construct(&p_sm->mgrp_lock);
		       osm_sm_mad_ctrl_construct(&p_sm->mad_ctrl);
		       osm_lid_mgr_construct(&p_sm->lid_mgr);
		       osm_ucast_mgr_construct(&p_sm->ucast_mgr);
		@@ -234,8 +233,8 @@ void osm_sm_destroy(IN osm_sm_t *
p_sm)
		       cl_event_destroy(&p_sm->signal_event);
		       cl_event_destroy(&p_sm->subnet_up_event);
		       cl_spinlock_destroy(&p_sm->signal_lock);
		-       cl_spinlock_destroy(&p_sm->mgrp_lock);
		       cl_spinlock_destroy(&p_sm->state_lock);
		+       free(p_sm->mlids_req);
		
		       osm_log(p_sm->p_log, OSM_LOG_SYS, "Exiting
SM\n");      /* Format Waived */
		       OSM_LOG_EXIT(p_sm->p_log);
		@@ -288,11 +287,14 @@ ib_api_status_t osm_sm_init(IN
osm_sm_t * p_sm, IN osm_subn_t * p_subn,
		       if (status != CL_SUCCESS)
		               goto Exit;
		
		-       cl_qlist_init(&p_sm->mgrp_list);
		-
		-       status = cl_spinlock_init(&p_sm->mgrp_lock);
		-       if (status != CL_SUCCESS)
		+       p_sm->mlids_req_max = 0;
		+       p_sm->mlids_req = malloc((IB_LID_MCAST_END_HO -
IB_LID_MCAST_START_HO +
		+                                 1) *
sizeof(p_sm->mlids_req[0]));
		+       if (!p_sm->mlids_req)
		               goto Exit;
		+       memset(p_sm->mlids_req, 0,
		+              (IB_LID_MCAST_END_HO -
IB_LID_MCAST_START_HO +
		+               1) * sizeof(p_sm->mlids_req[0]));
		
		       status = osm_sm_mad_ctrl_init(&p_sm->mad_ctrl,
p_sm->p_subn,
		                                     p_sm->p_mad_pool,
p_sm->p_vl15,
		@@ -441,32 +443,15 @@ Exit:
		
	
/**********************************************************************
	
**********************************************************************/
		-static ib_api_status_t sm_mgrp_process(IN osm_sm_t *
p_sm,
		-                                      IN osm_mgrp_t *
p_mgrp)
		+static void request_mlid(osm_sm_t * sm, uint16_t mlid)
		 {
		-       osm_mcast_mgr_ctxt_t *ctx;
		-
		-       /*
		-        * 'Schedule' all the QP0 traffic for when the
state manager
		-        * isn't busy trying to do something else.
		-        */
		-       ctx = malloc(sizeof(*ctx));
		-       if (!ctx)
		-               return IB_ERROR;
		-       memset(ctx, 0, sizeof(*ctx));
		-       ctx->mlid = p_mgrp->mlid;
		-
		-       cl_spinlock_acquire(&p_sm->mgrp_lock);
		-       cl_qlist_insert_tail(&p_sm->mgrp_list,
&ctx->list_item);
		-       cl_spinlock_release(&p_sm->mgrp_lock);
		-
		-       osm_sm_signal(p_sm,
OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST);
		-
		-       return IB_SUCCESS;
		+       mlid -= IB_LID_MCAST_START_HO;
		+       sm->mlids_req[mlid] = 1;
		+       if (sm->mlids_req_max < mlid)
		+               sm->mlids_req_max = mlid;
		+       osm_sm_signal(sm,
OSM_SIGNAL_IDLE_TIME_PROCESS_REQUEST);
		 }
		
	
-/**********************************************************************
		-
**********************************************************************/
		 ib_api_status_t osm_sm_mcgrp_join(IN osm_sm_t * p_sm,
IN osm_mgrp_t *mgrp,
		                                 IN const ib_net64_t
port_guid)
		 {
		@@ -519,7 +504,7 @@ ib_api_status_t osm_sm_mcgrp_join(IN
osm_sm_t * p_sm, IN osm_mgrp_t *mgrp,
		               goto Exit;
		       }
		
		-       status = sm_mgrp_process(p_sm, mgrp);
		+       request_mlid(p_sm, cl_ntoh16(mgrp->mlid));
		 Exit:
		       CL_PLOCK_RELEASE(p_sm->p_lock);
		       OSM_LOG_EXIT(p_sm->p_log);
		@@ -527,8 +512,6 @@ Exit:
		       return status;
		 }
		
	
-/**********************************************************************
		-
**********************************************************************/
		 ib_api_status_t osm_sm_mcgrp_leave(IN osm_sm_t * p_sm,
IN osm_mgrp_t *mgrp,
		                                  IN const ib_net64_t
port_guid)
		 {
		@@ -557,7 +540,7 @@ ib_api_status_t
osm_sm_mcgrp_leave(IN osm_sm_t * p_sm, IN osm_mgrp_t *mgrp,
		
		       osm_port_remove_mgrp(p_port, mgrp);
		
		-       status = sm_mgrp_process(p_sm, mgrp);
		+       request_mlid(p_sm, cl_hton16(mgrp->mlid));
		 Exit:
		       CL_PLOCK_RELEASE(p_sm->p_lock);
		       OSM_LOG_EXIT(p_sm->p_log);
		--
		1.6.4.2
		
		_______________________________________________
		general mailing list
		general at lists.openfabrics.org
	
http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
		
		To unsubscribe, please visit
http://openib.org/mailman/listinfo/openib-general
		


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20090909/5a4e141f/attachment.html>


More information about the general mailing list