[ofa-general] Re: [PATCH 2/3] OpenSM: Fix incorrect reporting of routing engine/algorithm used

Sasha Khapyorsky sashak at voltaire.com
Sun Dec 9 12:48:22 PST 2007


Hi Al,

On 16:55 Fri 07 Dec     , Al Chu wrote:
> 
> I noticed that when a routing algorithm failed and defaulted back to
> 'minhop', the logs and the console did not report this change.  This is
> because most of that code outputs the routing algorithm name that was
> stored during configuration/setup.

Right, as well as routing engine methods. So next cycle (sweep) this
routing engine method will run again and could be more successful.
So we cannot say that whole routing engine does fallback, just particular
callback in this particular cycle do.

So I'm not sure that just "renaming" helps a lot. I think better solution
in general would be to define something like routing engine chains - so
an user could specify which routing engine to use and in which order it
(whole engine) should fallback. Something like:

	-R ftree updn minhops

could mean - try ftree and if it fails switch to updn, etc..

Sasha

> The name isn't adjusted depending on
> the routing algorithm's success/failure.
> 
> There are several ways this could be fixed.  I decided to easiest was to
> stick a new routed_name field + lock into struct osm_routing_engine, and
> set/use this new field respectively.
> 
> Note that within osm_ucast_mgr_process(), there is a slight logic change
> from what was there before.  If the routing engine's call to
> build_lid_matrices() failed, I've changed the logic to not call the
> routing engine's ucast_build_fwd_tables() function.  This felt like the
> correct logic and seems to be fine given all the routing algorithms in
> OpenSM.  PLMK if there is some behavior subtlety I missed.
> 
> Thanks,
> Al
> -- 
> Albert Chu
> chu11 at llnl.gov
> 925-422-5311
> Computer Scientist
> High Performance Systems Division
> Lawrence Livermore National Laboratory

> From d83332b4c6cb1cdb0fc960808f9fa53615f61201 Mon Sep 17 00:00:00 2001
> From: Albert L. Chu <chu11 at llnl.gov>
> Date: Fri, 7 Dec 2007 13:44:04 -0800
> Subject: [PATCH] fix incorrect reporting of routing engine
> 
> 
> Signed-off-by: Albert L. Chu <chu11 at llnl.gov>
> ---
>  opensm/include/opensm/osm_opensm.h |    9 +++++++
>  opensm/opensm/osm_console.c        |    6 +++-
>  opensm/opensm/osm_opensm.c         |    6 +++++
>  opensm/opensm/osm_ucast_mgr.c      |   43 ++++++++++++++++++++++++++----------
>  4 files changed, 50 insertions(+), 14 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_opensm.h b/opensm/include/opensm/osm_opensm.h
> index 1b5edb8..3fc70fd 100644
> --- a/opensm/include/opensm/osm_opensm.h
> +++ b/opensm/include/opensm/osm_opensm.h
> @@ -107,6 +107,8 @@ struct osm_routing_engine {
>  	int (*ucast_build_fwd_tables) (void *context);
>  	void (*ucast_dump_tables) (void *context);
>  	void (*delete) (void *context);
> +	const char *routed_name;
> +	cl_plock_t routed_name_lock;
>  };
>  /*
>  * FIELDS
> @@ -129,6 +131,13 @@ struct osm_routing_engine {
>  *	delete
>  *		The delete method, may be used for routing engine
>  *		internals cleanup.
> +*
> +*	routed_name
> +*		The routing engine name used for routing (for example,
> +*		the specified one failed and we used the default)
> +*
> +*	routed_name_lock
> +*		Shared lock guarding reads and writes to routed_name.
>  */
>  
>  typedef struct _osm_console_t {
> diff --git a/opensm/opensm/osm_console.c b/opensm/opensm/osm_console.c
> index f669240..d8d16db 100644
> --- a/opensm/opensm/osm_console.c
> +++ b/opensm/opensm/osm_console.c
> @@ -380,9 +380,11 @@ static void print_status(osm_opensm_t * p_osm, FILE * out)
>  			sm_state_mgr_str(p_osm->sm.state_mgr.state));
>  		fprintf(out, "   SA State           : %s\n",
>  			sa_state_str(p_osm->sa.state));
> +                cl_plock_acquire(&p_osm->routing_engine.routed_name_lock);
>  		fprintf(out, "   Routing Engine     : %s\n",
> -			p_osm->routing_engine.name ? p_osm->routing_engine.
> -			name : "null (minhop)");
> +			p_osm->routing_engine.routed_name ? 
> +			p_osm->routing_engine.routed_name : "null (minhop)");
> +                cl_plock_release(&p_osm->routing_engine.routed_name_lock);
>  #ifdef ENABLE_OSM_PERF_MGR
>  		fprintf(out, "\n   PerfMgr state/sweep state : %s/%s\n",
>  			osm_perfmgr_get_state_str(&(p_osm->perfmgr)),
> diff --git a/opensm/opensm/osm_opensm.c b/opensm/opensm/osm_opensm.c
> index 26b9969..b55a638 100644
> --- a/opensm/opensm/osm_opensm.c
> +++ b/opensm/opensm/osm_opensm.c
> @@ -188,6 +188,8 @@ void osm_opensm_destroy(IN osm_opensm_t * const p_osm)
>  
>  	cl_plock_destroy(&p_osm->lock);
>  
> +	cl_plock_destroy(&p_osm->routing_engine.routed_name_lock);
> +
>  	osm_log_destroy(&p_osm->log);
>  }
>  
> @@ -224,6 +226,10 @@ osm_opensm_init(IN osm_opensm_t * const p_osm,
>  	if (status != IB_SUCCESS)
>  		goto Exit;
>  
> +	status = cl_plock_init(&p_osm->routing_engine.routed_name_lock);
> +	if (status != IB_SUCCESS)
> +		goto Exit;
> +
>  	if (p_opt->single_thread) {
>  		osm_log(&p_osm->log, OSM_LOG_INFO,
>  			"osm_opensm_init: Forcing single threaded dispatcher\n");
> diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> index 8bb4739..fe15666 100644
> --- a/opensm/opensm/osm_ucast_mgr.c
> +++ b/opensm/opensm/osm_ucast_mgr.c
> @@ -769,6 +769,9 @@ osm_signal_t osm_ucast_mgr_process(IN osm_ucast_mgr_t * const p_mgr)
>  	struct osm_routing_engine *p_routing_eng;
>  	osm_signal_t signal = OSM_SIGNAL_DONE;
>  	cl_qmap_t *p_sw_guid_tbl;
> +        const char *routed_name = NULL;
> +        int blm = 0;
> +        int ubft = 0;
>  
>  	OSM_LOG_ENTER(p_mgr->p_log, osm_ucast_mgr_process);
>  
> @@ -789,23 +792,39 @@ osm_signal_t osm_ucast_mgr_process(IN osm_ucast_mgr_t * const p_mgr)
>  
>  	p_mgr->any_change = FALSE;
>  
> -	if (!p_routing_eng->build_lid_matrices ||
> -	    p_routing_eng->build_lid_matrices(p_routing_eng->context) != 0)
> -		osm_ucast_mgr_build_lid_matrices(p_mgr);
> -
> -	osm_log(p_mgr->p_log, OSM_LOG_INFO,
> -		"osm_ucast_mgr_process: "
> -		"%s tables configured on all switches\n",
> -		p_routing_eng->name ? p_routing_eng->name : "null (minhop)");
> +	if (p_routing_eng->build_lid_matrices) {
> +            blm = p_routing_eng->build_lid_matrices(p_routing_eng->context);
> +            if (blm)
> +                osm_ucast_mgr_build_lid_matrices(p_mgr);
> +        }
> +        else
> +            osm_ucast_mgr_build_lid_matrices(p_mgr);
>  
>  	/*
>  	   Now that the lid matrices have been built, we can
>  	   build and download the switch forwarding tables.
>  	 */
> -	if (!p_routing_eng->ucast_build_fwd_tables ||
> -	    p_routing_eng->ucast_build_fwd_tables(p_routing_eng->context))
> -		cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> -				   p_mgr);
> +	if (!blm && p_routing_eng->ucast_build_fwd_tables) {
> +            ubft = p_routing_eng->ucast_build_fwd_tables(p_routing_eng->context);
> +            if (ubft)
> +                cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> +                                   p_mgr);
> +        }
> +        else
> +            cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> +                               p_mgr);
> +
> +        if (!blm && !ubft)
> +            routed_name = p_routing_eng->name;
> +
> +        CL_PLOCK_EXCL_ACQUIRE(&p_routing_eng->routed_name_lock);
> +        p_routing_eng->routed_name = routed_name;
> +        CL_PLOCK_RELEASE(&p_routing_eng->routed_name_lock);
> +
> +	osm_log(p_mgr->p_log, OSM_LOG_INFO,
> +		"osm_ucast_mgr_process: "
> +		"%s tables configured on all switches\n",
> +		routed_name ? routed_name : "null (minhop)");
>  
>  	if (p_mgr->any_change) {
>  		signal = OSM_SIGNAL_DONE_PENDING;
> -- 
> 1.5.1
> 




More information about the general mailing list