[ofa-general] Re: [PATCH RFC] opensm: mcast mgr improvements

Sasha Khapyorsky sashak at voltaire.com
Tue Jan 1 07:20:37 PST 2008


On 15:50 Tue 01 Jan 2008, Yevgeny Kliteynik wrote:
> > @@ -1395,79 +1382,79 @@ osm_mgrp_t *__get_mgrp_by_mlid(IN osm_mcast_mgr_t * const p_mgr,
> >   /**********************************************************************
> >    This is the function that is invoked during idle time to handle the
> > -  process request. Context1 is simply the osm_mcast_mgr_t*, Context2
> > -  hold the mlid, port guid and action (join/leave/delete) required.
> > +  process request for mcast groups where join/leave/delete was required.
> >   **********************************************************************/
> > -osm_signal_t
> > -osm_mcast_mgr_process_mgrp_cb(IN void *const Context1, IN void *const Context2)
> > +osm_signal_t osm_mcast_mgr_process_mgroups(osm_mcast_mgr_t * p_mgr)
> >  {
> > -	osm_mcast_mgr_t *p_mgr = (osm_mcast_mgr_t *) Context1;
> > +	cl_qlist_t *p_list = &p_mgr->p_subn->p_osm->sm.mgrp_list;
> > +	osm_switch_t *p_sw;
> > +	cl_qmap_t *p_sw_tbl;
> >  	osm_mgrp_t *p_mgrp;
> >  	ib_net16_t mlid;
> > -	osm_signal_t signal = OSM_SIGNAL_DONE;
> > -	osm_mcast_mgr_ctxt_t *p_ctxt = (osm_mcast_mgr_ctxt_t *) Context2;
> > -	osm_mcast_req_type_t req_type = p_ctxt->req_type;
> > -	ib_net64_t port_guid = p_ctxt->port_guid;
> > -
> > -	OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process_mgrp_cb);
> > -
> > -	/* nice copy no warning on size diff */
> > -	memcpy(&mlid, &p_ctxt->mlid, sizeof(mlid));
> > +	osm_signal_t ret, signal = OSM_SIGNAL_DONE;
> > +	osm_mcast_mgr_ctxt_t *ctx;
> > +	osm_mcast_req_type_t req_type;
> > +	ib_net64_t port_guid;
> >
> > -	/* we can destroy the context now */
> > -	free(p_ctxt);
> > +	OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process_mgroups);
> >
> >  	/* we need a lock to make sure the p_mgrp is not change other ways */
> >  	CL_PLOCK_EXCL_ACQUIRE(p_mgr->p_lock);
> > -	p_mgrp = __get_mgrp_by_mlid(p_mgr, mlid);
> >
> > -	/* since we delayed the execution we prefer to pass the
> > -	   mlid as the mgrp identifier and then find it or abort */
> > +	if (cl_is_qlist_empty(p_list)) {
> > +		CL_PLOCK_RELEASE(p_mgr->p_lock);
> > +		return OSM_SIGNAL_NONE;
> > +	}
> > +
> > +	while (!cl_is_qlist_empty(p_list)) {
> > +		ctx = (osm_mcast_mgr_ctxt_t *) cl_qlist_remove_head(p_list);
> > +		req_type = ctx->req_type;
> > +		port_guid = ctx->port_guid;
> > +
> > +		/* nice copy no warning on size diff */
> > +		memcpy(&mlid, &ctx->mlid, sizeof(mlid));
> >
> > -	if (p_mgrp) {
> > +		/* we can destroy the context now */
> > +		free(ctx);
> > +
> > +		/* since we delayed the execution we prefer to pass the
> > +		   mlid as the mgrp identifier and then find it or abort */
> > +		p_mgrp = __get_mgrp_by_mlid(p_mgr, mlid);
> > +		if (!p_mgrp)
> > +			continue;
> >
> > -		/* if there was no change from the last time we processed the group
> > -		   we can skip doing anything
> > +		/* if there was no change from the last time
> > +		 * we processed the group we can skip doing anything
> >  		 */
> >  		if (p_mgrp->last_change_id == p_mgrp->last_tree_id) {
> >  			osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > -				"osm_mcast_mgr_process_mgrp_cb: "
> > +				"osm_mcast_mgr_process_mgroups: "
> >  				"Skip processing mgrp with lid:0x%X change id:%u\n",
> >  				cl_ntoh16(mlid), p_mgrp->last_change_id);
> > -		} else {
> > -			osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > -				"osm_mcast_mgr_process_mgrp_cb: "
> > -				"Processing mgrp with lid:0x%X change id:%u\n",
> > -				cl_ntoh16(mlid), p_mgrp->last_change_id);
> > -
> > -			signal =
> > -			    osm_mcast_mgr_process_mgrp(p_mgr, p_mgrp, req_type,
> > -						       port_guid);
> > -			p_mgrp->last_tree_id = p_mgrp->last_change_id;
> > +			continue;
> >  		}
> >
> > -		/* Remove MGRP only if osm_mcm_port_t count is 0 and
> > -		 * Not a well known group
> > -		 */
> > -		if ((0x0 == cl_qmap_count(&p_mgrp->mcm_port_tbl)) &&
> > -		    (p_mgrp->well_known == FALSE)) {
> > -			osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > -				"osm_mcast_mgr_process_mgrp_cb: "
> > -				"Destroying mgrp with lid:0x%X\n",
> > -				cl_ntoh16(mlid));
> > -
> > -			/* Send a Report to any InformInfo registered for
> > -			   Trap 67 : MCGroup delete */
> > -			osm_mgrp_send_delete_notice(p_mgr->p_subn, p_mgr->p_log,
> > -						    p_mgrp);
> > -
> > -			cl_qmap_remove_item(&p_mgr->p_subn->mgrp_mlid_tbl,
> > -					    (cl_map_item_t *) p_mgrp);
> > +		osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > +			"osm_mcast_mgr_process_mgroups: "
> > +			"Processing mgrp with lid:0x%X change id:%u\n",
> > +			cl_ntoh16(mlid), p_mgrp->last_change_id);
> > +		mcast_mgr_process_mgrp(p_mgr, p_mgrp, req_type, port_guid);
> > +	}
> >
> > -			osm_mgrp_delete(p_mgrp);
> > -		}
> > +	/*
> > +	   Walk the switches and download the tables for each.
> > +	 */
> > +	p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
> > +	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> > +	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> > +		ret = __osm_mcast_mgr_set_tbl(p_mgr, p_sw);
> > +		if (ret == OSM_SIGNAL_DONE_PENDING)
> > +			signal = ret;
> > +		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> >  	}
> 
>  So basically what you do here is process all the idle queue requests
>  one by one, but actually update the mcast tables on the switches only
>  once, at the end of the idle queue processing. This should speed up
>  mcast handling significantly, which is great.
> 
>  However, let's get back to the original problem that triggered this
>  change. During a join/leave "burst" in a big fabric (let's say a couple
>  of thousand CAs) with hundreds of mcast groups this can still be a
>  problem.
> 
>  If there are hundreds of requests in the idle queue, the function
>  osm_mcast_mgr_process_mgroups() will return only when it finishes
>  processing the whole idle queue,

Right, but the difference is that re-routing for all mcast groups will
be done in one pass. So assuming that the most time consuming operation
here is the switches' MFT update (which by itself needs improvement,
but that is a different story), the time needed for this should be
comparable to processing a single mcast group request before the patch.
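
To make the cost difference concrete, here is an illustrative sketch
(dequeue_request(), process_mgrp(), first_switch(), next_switch() and
set_mft() are made-up names for this example, not the patch code):

	/* Before: every queued request paid for a full MFT download
	   across all switches. */
	while ((ctx = dequeue_request(queue)) != NULL) {
		process_mgrp(ctx);		/* recompute mcast tree */
		for (sw = first_switch(); sw; sw = next_switch(sw))
			set_mft(sw);		/* expensive SMP traffic */
	}

	/* After: recompute the trees for the whole queue first, then
	   download the switch MFTs in a single pass. */
	while ((ctx = dequeue_request(queue)) != NULL)
		process_mgrp(ctx);
	for (sw = first_switch(); sw; sw = next_switch(sw))
		set_mft(sw);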

> so if during that time there is a topology change in the fabric, and
>  an immediate heavy sweep is requested, the OSM state mgr will notice
>  it only after it finishes processing the idle queue, right?

Right, but note the second patch too (v2 of your state_mgr patch). It
will process only the list of mcast groups which were requested *before*
the heavy sweep request (the rest will be handled by the heavy sweep
itself, and the remainder of the list will be purged there).

>  Any idea how fast the processing is right now?

Hard to give exact numbers - I have seen significant differences between
simulated and real cluster times in the past. But in general I think
the queue processing time should be comparable to the processing time
of a single request.
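
As a rough model (the symbols are assumptions, not measurements): with
R queued requests, S switches and a per-switch MFT download time t, the
old path cost about R * S * t in MFT updates, while the batched path
costs about S * t plus the (much cheaper) tree recomputations. So on a
deep queue the expected speedup is roughly the queue depth R.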

>  In the original problem I saw osm busy with mcast for more than 10
>  minutes. How long will it take now? Even if it's 10 times faster,
>  1 minute of unattended topology change is still too long.

I think the real life scenario here is as follows: some nodes are
rebooted, OpenSM starts to get mcast join/leave requests and quickly
begins processing the first few of them, including updating switch
MFTs; during this time (while it waits for the NO_PENDED_TRANSACTION
signal), it gets a re-sweep request and continues with the heavy sweep.

Sasha


