[ofa-general] [PATCH v3] Add a Node Description check on light sweep to ensure that the ND has been found for each node.

Sasha Khapyorsky sashak at voltaire.com
Sun Aug 17 13:54:12 PDT 2008


Hi Ira,

On 09:08 Fri 08 Aug     , Ira Weiny wrote:
> On Fri, 8 Aug 2008 08:58:15 -0400
> "Hal Rosenstock" <hal.rosenstock at gmail.com> wrote:
> 
> > On Thu, Aug 7, 2008 at 5:53 PM, Ira Weiny <weiny2 at llnl.gov> wrote:
> > > >From 123a950a8bf0fc43331a1e715f0cdd756529437c Mon Sep 17 00:00:00 2001
> > > From: Ira K. Weiny <weiny2 at llnl.gov>
> 
> <snip>
> 
> > > +       if (status != IB_SUCCESS)
> > > +               OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> > > +                       "__osm_state_mgr_get_node_desc: ERR 3315: "
> > > +                       "Failure initiating NodeDescription request (%s)\n",
> > > +                       ib_get_err_str(status));
> > 
> > 
> > Aren't error codes 3314 and 3315 already taken ?
> > 
> > -- Hal
> > 
> 
> <sigh...>  Yes, I forgot you mentioned that.  v3 attached.
> Ira
> 
> 
> >From 6470536504e0bb6c6ff86619f3801235e022a99d Mon Sep 17 00:00:00 2001
> From: Ira K. Weiny <weiny2 at llnl.gov>
> Date: Wed, 30 Jul 2008 17:28:30 -0700
> Subject: [PATCH] Add a Node Description check on light sweep to ensure that the ND has been
>  found for each node.  This case covers the condition where a ND message is
>  dropped/lost for some reason and OpenSM is left with a valid configured node
>  which is not named correctly.

Please use one line patch summary as subject and patch description in
email body.

((15) of /usr/src/linux/Documentation/SubmittingPatches).

> 
> This is not the same as a node which has changed it's Node Descriptioin.  In
> this case the node needs to send a trap.
> 
> Signed-off-by: Ira Weiny <weiny2 at llnl.gov>
> ---
>  opensm/include/opensm/osm_base.h |   11 ++++++++
>  opensm/opensm/osm_node.c         |    2 +-
>  opensm/opensm/osm_state_mgr.c    |   53 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 65 insertions(+), 1 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_base.h b/opensm/include/opensm/osm_base.h
> index 3793804..2e8def7 100644
> --- a/opensm/include/opensm/osm_base.h
> +++ b/opensm/include/opensm/osm_base.h
> @@ -640,6 +640,17 @@ BEGIN_C_DECLS
>  */
>  #define OSM_NO_PATH			0xFF
>  /**********/
> +/****d* OpenSM: Base/OSM_NODE_DESC_UNKNOWN
> +* NAME
> +*	OSM_NODE_DESC_UNKNOWN
> +*
> +* DESCRIPTION
> +*	Value indicating the Node Description is not set and is "unknown"
> +*
> +* SYNOPSIS
> +*/
> +#define OSM_NODE_DESC_UNKNOWN "<unknown>"
> +/**********/
>  /****d* OpenSM: Base/osm_thread_state_t
>  * NAME
>  *	osm_thread_state_t
> diff --git a/opensm/opensm/osm_node.c b/opensm/opensm/osm_node.c
> index d99c656..123feb8 100644
> --- a/opensm/opensm/osm_node.c
> +++ b/opensm/opensm/osm_node.c
> @@ -136,7 +136,7 @@ osm_node_t *osm_node_new(IN const osm_madw_t * const p_madw)
>  	osm_node_init_physp(p_node, p_madw);
>  	if (p_ni->node_type == IB_NODE_TYPE_SWITCH)
>  		node_init_physp0(p_node, p_madw);
> -	p_node->print_desc = strdup("<unknown>");
> +	p_node->print_desc = strdup(OSM_NODE_DESC_UNKNOWN);
>  
>  	return (p_node);
>  }
> diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> index 3cdb2cf..7502287 100644
> --- a/opensm/opensm/osm_state_mgr.c
> +++ b/opensm/opensm/osm_state_mgr.c
> @@ -516,6 +516,53 @@ static void query_sm_info(cl_map_item_t *item, void *cxt)
>  }
>  
>  /**********************************************************************
> + During a light sweep check each node to see if the node descriptor is valid
> + if not issue a ND query.
> +**********************************************************************/
> +static void __osm_state_mgr_get_node_desc(IN cl_map_item_t * const p_object,
> +					IN void *context)
> +{
> +	osm_physp_t *p_physp = NULL;
> +	osm_node_t *const p_node = (osm_node_t *) p_object;
> +	ib_api_status_t status = IB_SUCCESS;
> +	osm_madw_context_t mad_context;
> +	osm_sm_t *sm = (osm_sm_t *)context;
> +
> +	OSM_LOG_ENTER(sm->p_log);
> +
> +	CL_ASSERT(p_node);
> +
> +	if (p_node->print_desc && strcmp(p_node->print_desc, OSM_NODE_DESC_UNKNOWN))
> +		/* if ND is valid, do nothing */
> +		goto exit;
> +
> +	OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> +		"ERR 3319: Unknown node description \"%s\" for node "
> +		"0x%016" PRIx64 ".  Reissuing ND query\n",
> +		p_node->print_desc ? p_node->print_desc : OSM_NODE_DESC_UNKNOWN,

Actually this is not needed due to condition above. Just
OSM_NODE_DESC_UNKNOWN (or just "Unknown node description for node....")
could be printer instead.

> +		cl_ntoh64(osm_node_get_node_guid (p_node)));
> +
> +	/* get a physp to request from. */
> +	p_physp = osm_node_get_any_physp_ptr(p_node);
> +
> +	mad_context.nd_context.node_guid = osm_node_get_node_guid(p_node);
> +
> +	status = osm_req_get(sm,
> +			     osm_physp_get_dr_path_ptr(p_physp),
> +			     IB_MAD_ATTR_NODE_DESC,
> +			     0, CL_DISP_MSGID_NONE, &mad_context);
> +	if (status != IB_SUCCESS)
> +		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> +			"__osm_state_mgr_get_node_desc: ERR 331B: "

OSM_LOG() macro includes function name, so you don't need to specify it
again in format string.

> +			"Failure initiating NodeDescription request (%s)\n",
> +			ib_get_err_str(status));
> +
> +exit:
> +	OSM_LOG_EXIT(sm->p_log);
> +}
> +
> +
> +/**********************************************************************
>   Initiates a lightweight sweep of the subnet.
>   Used during normal sweeps after the subnet is up.
>  **********************************************************************/
> @@ -524,6 +571,7 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
>  	ib_api_status_t status = IB_SUCCESS;
>  	osm_bind_handle_t h_bind;
>  	cl_qmap_t *p_sw_tbl;
> +	cl_qmap_t *p_node_tbl;
>  	cl_map_item_t *p_next;
>  	osm_node_t *p_node;
>  	osm_physp_t *p_physp;
> @@ -532,6 +580,7 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
>  	OSM_LOG_ENTER(sm->p_log);
>  
>  	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
> +	p_node_tbl = &sm->p_subn->node_guid_tbl;

Seems like unneeded variable - it is used only once below:

>  
>  	/*
>  	 * First, get the bind handle.
> @@ -550,6 +599,10 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
>  	cl_qmap_apply_func(p_sw_tbl, __osm_state_mgr_get_sw_info, sm);
>  	CL_PLOCK_RELEASE(sm->p_lock);
>  
> +	CL_PLOCK_ACQUIRE(sm->p_lock);
> +	cl_qmap_apply_func(p_node_tbl, __osm_state_mgr_get_node_desc, sm);

	cl_qmap_apply_func(&sm->p_subn->node_guid_tbl,
			   __osm_state_mgr_get_node_desc, sm)

> +	CL_PLOCK_RELEASE(sm->p_lock);
> +
>  	/* now scan the list of physical ports that were not down but have no remote port */
>  	CL_PLOCK_ACQUIRE(sm->p_lock);
>  	p_next = cl_qmap_head(&sm->p_subn->node_guid_tbl);
> -- 
> 1.5.4.5

Sasha



More information about the general mailing list