[ofa-general] [Fwd: Re: [PATCH ] opensm: MFT tables are not set after non full member re-join]

Eli Dorfman (Voltaire) dorfman.eli at gmail.com
Sat Jun 13 23:47:38 PDT 2009


Hi Sasha,

Resending. I haven't seen a response to this patch.

Eli

-------- Original Message --------
Subject: Re: [PATCH ] opensm: MFT tables are not set after non full member re-join
Date: Tue, 19 May 2009 11:56:36 +0300
From: Eli Dorfman (Voltaire) <dorfman.eli at gmail.com>
To: Sasha Khapyorsky <sashak at voltaire.com>
CC: OpenIB <general at lists.openfabrics.org>
References: <4A1019F6.5060900 at gmail.com>

Eli Dorfman (Voltaire) wrote:
> MFT tables are not set after non full member re-join
> 
> In case of non full member re-join MFT tables are not set.
> No need to set or check non full member reference to mlid (port->mcm_list).
> This list should be used only for full members for cleanup when port goes down.
> 
> A simple scenarion to reproduce this:
> 1. Full member creates group
> 2. Non-member join - MFT sent
> 3. Full member leave
>         a. group is deleted but non member port has still reference to the MLID
> 4. Full member re-creates the group
> 5. Non member re-joins - MFT *NOT* sent to switches
> 
> Signed-off-by: Eli Dorfman <elid at voltaire.com>
> ---
>  opensm/include/opensm/osm_sm.h         |    3 ++-
>  opensm/opensm/osm_sa_mcmember_record.c |    6 +++---
>  opensm/opensm/osm_sm.c                 |   22 +++++++++++++++++++++-
>  3 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_sm.h b/opensm/include/opensm/osm_sm.h
> index cc8321d..1a8a577 100644
> --- a/opensm/include/opensm/osm_sm.h
> +++ b/opensm/include/opensm/osm_sm.h
> @@ -539,7 +539,8 @@ osm_resp_send(IN osm_sm_t * sm,
>  ib_api_status_t
>  osm_sm_mcgrp_join(IN osm_sm_t * const p_sm,
>  		  IN const ib_net16_t mlid,
> -		  IN const ib_net64_t port_guid);
> +		  IN const ib_net64_t port_guid,
> +				  IN uint8_t scope_state);
>  /*
>  * PARAMETERS
>  *	p_sm
> diff --git a/opensm/opensm/osm_sa_mcmember_record.c b/opensm/opensm/osm_sa_mcmember_record.c
> index 5543221..fe29dd6 100644
> --- a/opensm/opensm/osm_sa_mcmember_record.c
> +++ b/opensm/opensm/osm_sa_mcmember_record.c
> @@ -1039,7 +1039,7 @@ static void mcmr_rcv_leave_mgrp(IN osm_sa_t * sa, IN osm_madw_t * p_madw)
>  	if (!p_mgrp) {
>  		char gid_str[INET6_ADDRSTRLEN];
>  		CL_PLOCK_RELEASE(sa->p_lock);
> -		OSM_LOG(sa->p_log, OSM_LOG_DEBUG,
> +		OSM_LOG(sa->p_log, OSM_LOG_INFO,
>  			"Failed since multicast group %s not present\n",
>  			inet_ntop(AF_INET6, p_recvd_mcmember_rec->mgid.raw,
>  				  gid_str, sizeof gid_str));
> @@ -1309,8 +1309,8 @@ static void mcmr_rcv_join_mgrp(IN osm_sa_t * sa, IN osm_madw_t * p_madw)
>  
>  	/* do the actual routing (actually schedule the update) */
>  	status = osm_sm_mcgrp_join(sa->sm, mlid,
> -				   p_recvd_mcmember_rec->port_gid.unicast.
> -				   interface_id);
> +							   p_recvd_mcmember_rec->port_gid.unicast.interface_id,
> +							   p_recvd_mcmember_rec->scope_state);
>  
>  	if (status != IB_SUCCESS) {
>  		OSM_LOG(sa->p_log, OSM_LOG_ERROR, "ERR 1B14: "
> diff --git a/opensm/opensm/osm_sm.c b/opensm/opensm/osm_sm.c
> index daa60ff..b334d39 100644
> --- a/opensm/opensm/osm_sm.c
> +++ b/opensm/opensm/osm_sm.c
> @@ -468,7 +468,7 @@ static ib_api_status_t sm_mgrp_process(IN osm_sm_t * p_sm,
>  /**********************************************************************
>   **********************************************************************/
>  ib_api_status_t osm_sm_mcgrp_join(IN osm_sm_t * p_sm, IN const ib_net16_t mlid,
> -				  IN const ib_net64_t port_guid)
> +				  IN const ib_net64_t port_guid, IN uint8_t scope_state)
>  {
>  	osm_mgrp_t *p_mgrp;
>  	osm_port_t *p_port;
> @@ -515,6 +515,25 @@ ib_api_status_t osm_sm_mcgrp_join(IN osm_sm_t * p_sm, IN const ib_net16_t mlid,
>  		goto Exit;
>  	}
>  
> +	/* if there was no change from the last time
> +	 * we processed the group we can skip doing anything
> +	 */
> +	if (p_mgrp->last_change_id == p_mgrp->last_tree_id) {
> +		OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
> +			"Skip processing mgrp with lid:0x%X last change id:%u\n",
> +			cl_ntoh16(mlid), p_mgrp->last_change_id);
> +		goto Exit;
> +	} else {
> +		OSM_LOG(p_sm->p_log, OSM_LOG_DEBUG,
> +			"processing mgrp with lid:0x%X port: 0x%016" PRIx64 " last change id:%u tree id:%u\n",
> +			cl_ntoh16(mlid), cl_ntoh64(port_guid), 
> +			p_mgrp->last_change_id, p_mgrp->last_tree_id);
> +	}
> +
> +	/* add mgrp only to FULL member port. used for cleanup when port goes down */
> +	if (!(scope_state & IB_JOIN_STATE_FULL))
> +		goto MgrpProcess;
> +
>  	/*
>  	 * Check if the object (according to mlid) already exists on this port.
>  	 * If it does - then no need to update it again, and no need to
> @@ -543,6 +562,7 @@ ib_api_status_t osm_sm_mcgrp_join(IN osm_sm_t * p_sm, IN const ib_net16_t mlid,
>  		goto Exit;
>  	}
>  
> +MgrpProcess:
>  	status = sm_mgrp_process(p_sm, p_mgrp);
>  	CL_PLOCK_RELEASE(p_sm->p_lock);
>  

The following fixes a bug in the above PATCH that will lock the opensm 
when multicast group was not changed.

diff --git a/opensm/opensm/osm_sm.c b/opensm/opensm/osm_sm.c
index b334d39..28cd76f 100644
--- a/opensm/opensm/osm_sm.c
+++ b/opensm/opensm/osm_sm.c
@@ -519,6 +519,7 @@ ib_api_status_t osm_sm_mcgrp_join(IN osm_sm_t * p_sm, IN const ib_net16_t mlid,
 	 * we processed the group we can skip doing anything
 	 */
 	if (p_mgrp->last_change_id == p_mgrp->last_tree_id) {
+		CL_PLOCK_RELEASE(p_sm->p_lock);
 		OSM_LOG(p_sm->p_log, OSM_LOG_VERBOSE,
 			"Skip processing mgrp with lid:0x%X last change id:%u\n",
 			cl_ntoh16(mlid), p_mgrp->last_change_id);




More information about the general mailing list