[ofa-general] Re: [PATCH 2/3] OpenSM: Fix incorrect reporting of routing engine/algorithm used

Sasha Khapyorsky sashak at voltaire.com
Thu Dec 13 03:51:23 PST 2007


Al, Hi again,

On 16:55 Fri 07 Dec     , Al Chu wrote:
> Hey Sasha,
> 
> I noticed that when a routing algorithm failed and defaulted back to
> 'minhop', the logs and the console did not report this change.  This is
> because most of that code outputs the routing algorithm name that was
> stored during configuration/setup.  The name isn't adjusted depending on
> the routing algorithm's success/failure.
> 
> There are several ways this could be fixed.  I decided to easiest was to
> stick a new routed_name field + lock into struct osm_routing_engine, and
> set/use this new field respectively.

Do we need special lock for this?

> Note that within osm_ucast_mgr_process(), there is a slight logic change
> from what was there before.  If the routing engine's call to
> build_lid_matrices() failed, I've changed the logic to not call the
> routing engine's ucast_build_fwd_tables() function.  This felt like the
> correct logic and seems to be fine given all the routing algorithms in
> OpenSM.  PLMK if there is some behavior subtlety I missed.

I think it will break "file" routing engine, where lid matrices and/or
LFTs can be loaded from a dump files. With this "engine" an user can
decide to load only LFTs from a file and use calculated lid matrices
(actually it is most common usage), then only LFTs dump file is
provided and lid matrix creator falls back legally.

> 
> Thanks,
> Al
> -- 
> Albert Chu
> chu11 at llnl.gov
> 925-422-5311
> Computer Scientist
> High Performance Systems Division
> Lawrence Livermore National Laboratory

> From d83332b4c6cb1cdb0fc960808f9fa53615f61201 Mon Sep 17 00:00:00 2001
> From: Albert L. Chu <chu11 at llnl.gov>
> Date: Fri, 7 Dec 2007 13:44:04 -0800
> Subject: [PATCH] fix incorrect reporting of routing engine
> 
> 
> Signed-off-by: Albert L. Chu <chu11 at llnl.gov>
> ---
>  opensm/include/opensm/osm_opensm.h |    9 +++++++
>  opensm/opensm/osm_console.c        |    6 +++-
>  opensm/opensm/osm_opensm.c         |    6 +++++
>  opensm/opensm/osm_ucast_mgr.c      |   43 ++++++++++++++++++++++++++----------
>  4 files changed, 50 insertions(+), 14 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_opensm.h b/opensm/include/opensm/osm_opensm.h
> index 1b5edb8..3fc70fd 100644
> --- a/opensm/include/opensm/osm_opensm.h
> +++ b/opensm/include/opensm/osm_opensm.h
> @@ -107,6 +107,8 @@ struct osm_routing_engine {
>  	int (*ucast_build_fwd_tables) (void *context);
>  	void (*ucast_dump_tables) (void *context);
>  	void (*delete) (void *context);
> +	const char *routed_name;
> +	cl_plock_t routed_name_lock;
>  };
>  /*
>  * FIELDS
> @@ -129,6 +131,13 @@ struct osm_routing_engine {
>  *	delete
>  *		The delete method, may be used for routing engine
>  *		internals cleanup.
> +*
> +*	routed_name
> +*		The routing engine name used for routing (for example,
> +*		the specified one failed and we used the default)
> +*
> +*	routed_name_lock
> +*		Shared lock guarding reads and writes to routed_name.
>  */
>  
>  typedef struct _osm_console_t {
> diff --git a/opensm/opensm/osm_console.c b/opensm/opensm/osm_console.c
> index f669240..d8d16db 100644
> --- a/opensm/opensm/osm_console.c
> +++ b/opensm/opensm/osm_console.c
> @@ -380,9 +380,11 @@ static void print_status(osm_opensm_t * p_osm, FILE * out)
>  			sm_state_mgr_str(p_osm->sm.state_mgr.state));
>  		fprintf(out, "   SA State           : %s\n",
>  			sa_state_str(p_osm->sa.state));
> +                cl_plock_acquire(&p_osm->routing_engine.routed_name_lock);

Another variables are not protected by any locks here. Probably
p_osm->lock should be used for all racy variables in this function
(instead of just protecting routed_name)?

>  		fprintf(out, "   Routing Engine     : %s\n",
> -			p_osm->routing_engine.name ? p_osm->routing_engine.
> -			name : "null (minhop)");
> +			p_osm->routing_engine.routed_name ? 
> +			p_osm->routing_engine.routed_name : "null (minhop)");

What about to always setup routed_name (initially yet in opensm_init())?
Then here in in 3/3 patch it will not be necessary to check it for NULL.

> +                cl_plock_release(&p_osm->routing_engine.routed_name_lock);
>  #ifdef ENABLE_OSM_PERF_MGR
>  		fprintf(out, "\n   PerfMgr state/sweep state : %s/%s\n",
>  			osm_perfmgr_get_state_str(&(p_osm->perfmgr)),
> diff --git a/opensm/opensm/osm_opensm.c b/opensm/opensm/osm_opensm.c
> index 26b9969..b55a638 100644
> --- a/opensm/opensm/osm_opensm.c
> +++ b/opensm/opensm/osm_opensm.c
> @@ -188,6 +188,8 @@ void osm_opensm_destroy(IN osm_opensm_t * const p_osm)
>  
>  	cl_plock_destroy(&p_osm->lock);
>  
> +	cl_plock_destroy(&p_osm->routing_engine.routed_name_lock);
> +
>  	osm_log_destroy(&p_osm->log);
>  }
>  
> @@ -224,6 +226,10 @@ osm_opensm_init(IN osm_opensm_t * const p_osm,
>  	if (status != IB_SUCCESS)
>  		goto Exit;
>  
> +	status = cl_plock_init(&p_osm->routing_engine.routed_name_lock);
> +	if (status != IB_SUCCESS)
> +		goto Exit;
> +
>  	if (p_opt->single_thread) {
>  		osm_log(&p_osm->log, OSM_LOG_INFO,
>  			"osm_opensm_init: Forcing single threaded dispatcher\n");
> diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> index 8bb4739..fe15666 100644
> --- a/opensm/opensm/osm_ucast_mgr.c
> +++ b/opensm/opensm/osm_ucast_mgr.c
> @@ -769,6 +769,9 @@ osm_signal_t osm_ucast_mgr_process(IN osm_ucast_mgr_t * const p_mgr)
>  	struct osm_routing_engine *p_routing_eng;
>  	osm_signal_t signal = OSM_SIGNAL_DONE;
>  	cl_qmap_t *p_sw_guid_tbl;
> +        const char *routed_name = NULL;
> +        int blm = 0;
> +        int ubft = 0;
>  
>  	OSM_LOG_ENTER(p_mgr->p_log, osm_ucast_mgr_process);
>  
> @@ -789,23 +792,39 @@ osm_signal_t osm_ucast_mgr_process(IN osm_ucast_mgr_t * const p_mgr)
>  
>  	p_mgr->any_change = FALSE;
>  
> -	if (!p_routing_eng->build_lid_matrices ||
> -	    p_routing_eng->build_lid_matrices(p_routing_eng->context) != 0)
> -		osm_ucast_mgr_build_lid_matrices(p_mgr);
> -
> -	osm_log(p_mgr->p_log, OSM_LOG_INFO,
> -		"osm_ucast_mgr_process: "
> -		"%s tables configured on all switches\n",
> -		p_routing_eng->name ? p_routing_eng->name : "null (minhop)");
> +	if (p_routing_eng->build_lid_matrices) {
> +            blm = p_routing_eng->build_lid_matrices(p_routing_eng->context);
> +            if (blm)
> +                osm_ucast_mgr_build_lid_matrices(p_mgr);
> +        }
> +        else
> +            osm_ucast_mgr_build_lid_matrices(p_mgr);
>  
>  	/*
>  	   Now that the lid matrices have been built, we can
>  	   build and download the switch forwarding tables.
>  	 */
> -	if (!p_routing_eng->ucast_build_fwd_tables ||
> -	    p_routing_eng->ucast_build_fwd_tables(p_routing_eng->context))
> -		cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> -				   p_mgr);
> +	if (!blm && p_routing_eng->ucast_build_fwd_tables) {
> +            ubft = p_routing_eng->ucast_build_fwd_tables(p_routing_eng->context);
> +            if (ubft)
> +                cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> +                                   p_mgr);
> +        }
> +        else
> +            cl_qmap_apply_func(p_sw_guid_tbl, __osm_ucast_mgr_process_tbl,
> +                               p_mgr);
> +
> +        if (!blm && !ubft)
> +            routed_name = p_routing_eng->name;
> +
> +        CL_PLOCK_EXCL_ACQUIRE(&p_routing_eng->routed_name_lock);
> +        p_routing_eng->routed_name = routed_name;
> +        CL_PLOCK_RELEASE(&p_routing_eng->routed_name_lock);

Basically this section is already protected by p_mgr->p_lock (which is
reference to &osm.lock).

Sasha

> +
> +	osm_log(p_mgr->p_log, OSM_LOG_INFO,
> +		"osm_ucast_mgr_process: "
> +		"%s tables configured on all switches\n",
> +		routed_name ? routed_name : "null (minhop)");
>  
>  	if (p_mgr->any_change) {
>  		signal = OSM_SIGNAL_DONE_PENDING;
> -- 
> 1.5.1
> 




More information about the general mailing list