[ofa-general] [PATCH V2] OpenSM: Add a Node Description check on light sweep to ensure that the ND has been found for each node.

Ira Weiny weiny2 at llnl.gov
Thu Jul 31 10:05:23 PDT 2008


On Thu, 31 Jul 2008 07:33:37 -0400
"Hal Rosenstock" <hal.rosenstock at gmail.com> wrote:

> On Wed, Jul 30, 2008 at 8:40 PM, Ira Weiny <weiny2 at llnl.gov> wrote:
> > >From d2e52a8b9de02521b01de3414562f45e476cafbf Mon Sep 17 00:00:00 2001
> > From: Ira K. Weiny <weiny2 at llnl.gov>
> > Date: Wed, 30 Jul 2008 17:28:30 -0700
> > Subject: [PATCH] Add a Node Description check on light sweep to ensure that the ND has been
> >  found for each node.  This case covers the condition where a ND message is
> >  dropped/lost for some reason and OpenSM is left with a valid configured node
> >  which is not named correctly.
> 
> A couple of nits below.
> 
> > This is not the same as a node which has changed it's Node Descriptioin.  In
> > this case the node needs to send a trap.
> 
> Is that case currently handled ?

In OpenSM, yes, Sasha accepted my patches a while ago.  I have not gotten
around to changing the kernel code to send the trap.  However, there is the
diag test utility which will send this trap if one wanted to use it.

> 
> > Signed-off-by: Ira K. Weiny <weiny2 at llnl.gov>
> > ---
> >  opensm/opensm/osm_state_mgr.c |   54 +++++++++++++++++++++++++++++++++++++++++
> >  1 files changed, 54 insertions(+), 0 deletions(-)
> >
> > diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
> > index b599582..15124c9 100644
> > --- a/opensm/opensm/osm_state_mgr.c
> > +++ b/opensm/opensm/osm_state_mgr.c
> > @@ -506,6 +506,54 @@ Exit:
> >  }
> >
> >  /**********************************************************************
> > + During a light sweep check each node to see if the node descriptor is valid
> > + if not issue a ND query.
> > +**********************************************************************/
> > +static void __osm_state_mgr_get_node_desc(IN cl_map_item_t * const p_object,
> > +                                       IN void *context)
> > +{
> > +       osm_physp_t *p_physp = NULL;
> > +       osm_node_t *const p_node = (osm_node_t *) p_object;
> > +       ib_api_status_t status = IB_SUCCESS;
> > +       osm_madw_context_t mad_context;
> > +       osm_sm_t *sm = (osm_sm_t *)context;
> > +
> > +       OSM_LOG_ENTER(sm->p_log);
> > +
> > +       CL_ASSERT(p_node);
> > +
> > +       if (p_node->print_desc && strcmp(p_node->print_desc, "<unknown>"))
> > +               /* if ND is valid, do nothing */
> > +               goto exit;
> > +
> > +       OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> > +               "__osm_state_mgr_get_node_desc: "
> > +               "Unknown node description \"%s\" for node 0x%016" PRIx64
> > +               ".  Reissuing ND query\n",
> > +               p_node->print_desc ? p_node->print_desc : "<unknown>",
> > +               cl_ntoh64(osm_node_get_node_guid (p_node)));
> 
> Needs ERR code.

Ah, yes, I forgot.  Originally I made this a VERBOSE message but later changed it
to ERROR.  What method do we use for assigning the error codes?

> 
> > +
> > +       /* get a physp to request from. */
> > +       p_physp = osm_node_get_any_physp_ptr(p_node);
> > +
> > +       mad_context.nd_context.node_guid = osm_node_get_node_guid(p_node);
> > +
> > +       status = osm_req_get(sm,
> > +                            osm_physp_get_dr_path_ptr(p_physp),
> > +                            IB_MAD_ATTR_NODE_DESC,
> > +                            0, CL_DISP_MSGID_NONE, &mad_context);
> > +       if (status != IB_SUCCESS)
> > +               OSM_LOG(sm->p_log, OSM_LOG_ERROR,
> > +                       "__osm_ni_rcv_get_node_desc: ERR 0D03: "
> 
> Shouldn't routine name be __osm_state_mgr_get_node_desc here ? Is ERR
> code right ?

Sorry, copy and paste error, yes it should be changed.

It looks like 3314 and 3315 could be used for the error codes.  If these are
ok, the new patch is below.  If not, let me know and I can change them for you.

Ira


>From 6f7ab6c6ffe64b1da76e5ba68babaaa3356c09fe Mon Sep 17 00:00:00 2001
From: Ira K. Weiny <weiny2 at llnl.gov>
Date: Wed, 30 Jul 2008 17:28:30 -0700
Subject: [PATCH] Add a Node Description check on light sweep to ensure that the ND has been
 found for each node.  This case covers the condition where a ND message is
 dropped/lost for some reason and OpenSM is left with a valid configured node
 which is not named correctly.

This is not the same as a node which has changed its Node Description.  In
that case the node needs to send a trap.

Signed-off-by: Ira K. Weiny <weiny2 at llnl.gov>
---
 opensm/opensm/osm_state_mgr.c |   54 +++++++++++++++++++++++++++++++++++++++++
 1 files changed, 54 insertions(+), 0 deletions(-)

diff --git a/opensm/opensm/osm_state_mgr.c b/opensm/opensm/osm_state_mgr.c
index b599582..a769f52 100644
--- a/opensm/opensm/osm_state_mgr.c
+++ b/opensm/opensm/osm_state_mgr.c
@@ -506,6 +506,54 @@ Exit:
 }
 
 /**********************************************************************
+ During a light sweep check each node to see if the node descriptor is valid
+ if not issue a ND query.
+**********************************************************************/
+static void __osm_state_mgr_get_node_desc(IN cl_map_item_t * const p_object,
+					IN void *context)
+{
+	osm_physp_t *p_physp = NULL;
+	osm_node_t *const p_node = (osm_node_t *) p_object;
+	ib_api_status_t status = IB_SUCCESS;
+	osm_madw_context_t mad_context;
+	osm_sm_t *sm = (osm_sm_t *)context;
+
+	OSM_LOG_ENTER(sm->p_log);
+
+	CL_ASSERT(p_node);
+
+	if (p_node->print_desc && strcmp(p_node->print_desc, "<unknown>"))
+		/* if ND is valid, do nothing */
+		goto exit;
+
+	OSM_LOG(sm->p_log, OSM_LOG_ERROR,
+		"__osm_state_mgr_get_node_desc: ERR 3314: "
+		"Unknown node description \"%s\" for node 0x%016" PRIx64
+		".  Reissuing ND query\n",
+		p_node->print_desc ? p_node->print_desc : "<unknown>",
+		cl_ntoh64(osm_node_get_node_guid (p_node)));
+
+	/* get a physp to request from. */
+	p_physp = osm_node_get_any_physp_ptr(p_node);
+
+	mad_context.nd_context.node_guid = osm_node_get_node_guid(p_node);
+
+	status = osm_req_get(sm,
+			     osm_physp_get_dr_path_ptr(p_physp),
+			     IB_MAD_ATTR_NODE_DESC,
+			     0, CL_DISP_MSGID_NONE, &mad_context);
+	if (status != IB_SUCCESS)
+		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
+			"__osm_state_mgr_get_node_desc: ERR 3315: "
+			"Failure initiating NodeDescription request (%s)\n",
+			ib_get_err_str(status));
+
+exit:
+	OSM_LOG_EXIT(sm->p_log);
+}
+
+
+/**********************************************************************
  Initiates a lightweight sweep of the subnet.
  Used during normal sweeps after the subnet is up.
 **********************************************************************/
@@ -514,6 +562,7 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
 	ib_api_status_t status = IB_SUCCESS;
 	osm_bind_handle_t h_bind;
 	cl_qmap_t *p_sw_tbl;
+	cl_qmap_t *p_node_tbl;
 	cl_map_item_t *p_next;
 	osm_node_t *p_node;
 	osm_physp_t *p_physp;
@@ -522,6 +571,7 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
 	OSM_LOG_ENTER(sm->p_log);
 
 	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
+	p_node_tbl = &sm->p_subn->node_guid_tbl;
 
 	/*
 	 * First, get the bind handle.
@@ -540,6 +590,10 @@ static ib_api_status_t __osm_state_mgr_light_sweep_start(IN osm_sm_t * sm)
 	cl_qmap_apply_func(p_sw_tbl, __osm_state_mgr_get_sw_info, sm);
 	CL_PLOCK_RELEASE(sm->p_lock);
 
+	CL_PLOCK_ACQUIRE(sm->p_lock);
+	cl_qmap_apply_func(p_node_tbl, __osm_state_mgr_get_node_desc, sm);
+	CL_PLOCK_RELEASE(sm->p_lock);
+
 	/* now scan the list of physical ports that were not down but have no remote port */
 	CL_PLOCK_ACQUIRE(sm->p_lock);
 	p_next = cl_qmap_head(&sm->p_subn->node_guid_tbl);
-- 
1.5.4.5





More information about the general mailing list