[ofa-general] Re: [OpenSM] [PATCH 3/3] implement port_offsetting option

Al Chu chu11 at llnl.gov
Tue Jun 3 15:46:03 PDT 2008


Hey Sasha,

Here are two patches to fix the minor comments you made earlier.

Sanity tested on my small cluster, but did not have the chance to test
on a big one.  However, these patches are tiny, so I doubt there'd be an
issue.

Al

On Tue, 2008-06-03 at 04:20 +0300, Sasha Khapyorsky wrote:
> Hi Al,
> 
> Sorry about huge delay. Some questions are below.
> 
> On 14:12 Thu 10 Apr     , Al Chu wrote:
> > This is the primary patch that fiddles with the path recommendation
> > code.  A few notes:
> > 
> > 1) b/c I want to keep track of how many remote destinations there can
> > be, the 'remote_guids' array now stores all remote destinations, not
> > just the ones we have already forwarded to.
> > 
> > 2) b/c I may need to free memory, I now "goto Exit" instead of just
> > calling 'return' many times.
> > 
> > 3) Although the option is called 'port_offsetting', I actually "offset"
> > both the remote destination I send to and the port pointing towards that
> > remote destination.
> > 
> > Al
> > 
> > -- 
> > Albert Chu
> > chu11 at llnl.gov
> > 925-422-5311
> > Computer Scientist
> > High Performance Systems Division
> > Lawrence Livermore National Laboratory
> 
> > From 57eb4d9bf55fbbbf39dc1c7ddfeeb2cae4776ef0 Mon Sep 17 00:00:00 2001
> > From: Albert L. Chu <chu11 at llnl.gov>
> > Date: Thu, 20 Mar 2008 16:23:13 -0700
> > Subject: [PATCH] implement port_offsetting
> > 
> > 
> > Signed-off-by: Albert L. Chu <chu11 at llnl.gov>
> > ---
> >  opensm/include/opensm/osm_switch.h |    7 +-
> >  opensm/opensm/osm_dump.c           |    3 +-
> >  opensm/opensm/osm_switch.c         |  246 ++++++++++++++++++++++++++++++++----
> >  opensm/opensm/osm_ucast_mgr.c      |   14 ++-
> >  4 files changed, 239 insertions(+), 31 deletions(-)
> > 
> > diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> > index 2624d5f..45f4718 100644
> > --- a/opensm/include/opensm/osm_switch.h
> > +++ b/opensm/include/opensm/osm_switch.h
> > @@ -997,7 +997,8 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  			  IN const boolean_t dor,
> >  			  IN OUT osm_switch_guid_count_t * remote_guids,
> >  			  IN OUT uint16_t * p_num_remote_guids,
> > -			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used);
> > +			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used,
> > +			  IN uint16_t port_offsetting_lids_per_port);
> >  /*
> >  * PARAMETERS
> >  *	p_sw
> > @@ -1031,6 +1032,10 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  *		[in out] The specific osm_switch_guid_count_t used
> >  *		in switch recommendations.
> >  *
> > +*	port_offsetting_lids_per_port
> > +*		[in] If > 0, indicates lids_per_port to use with
> > +*		port_offsetting option.
> > +*
> >  * RETURN VALUE
> >  *	Returns the recommended port on which to route this LID.
> >  *
> > diff --git a/opensm/opensm/osm_dump.c b/opensm/opensm/osm_dump.c
> > index 2bac75a..37842ac 100644
> > --- a/opensm/opensm/osm_dump.c
> > +++ b/opensm/opensm/osm_dump.c
> > @@ -219,7 +219,8 @@ static void dump_ucast_routes(cl_map_item_t *p_map_item, FILE *file, void *cxt)
> >  			/* No LMC Optimization */
> >  			best_port = osm_switch_recommend_path(p_sw, p_port,
> >  							      lid_ho, TRUE, dor,
> > -							      NULL, NULL, NULL);
> > +							      NULL, NULL,
> > +							      NULL, 0);
> >  			fprintf(file, "No %u hop path possible via port %u!",
> >  				best_hops, best_port);
> >  		}
> > diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
> > index f346b25..8e3342b 100644
> > --- a/opensm/opensm/osm_switch.c
> > +++ b/opensm/opensm/osm_switch.c
> > @@ -55,6 +55,15 @@
> >  #include <iba/ib_types.h>
> >  #include <opensm/osm_switch.h>
> >  
> > +/* Local structs */
> > +struct osm_switch_remote_dest {
> > +	uint32_t total_paths;	
> > +	uint32_t min_paths;
> > +	uint8_t ports[256]; /* 256 b/c max_ports is a uint8_t */
> > +	unsigned int ports_count;
> > +	osm_switch_guid_count_t *p_remote_guid;
> > +};
> > +
> >  /**********************************************************************
> >   **********************************************************************/
> >  cl_status_t
> > @@ -304,9 +313,32 @@ osm_switch_find_guid_count(IN const osm_switch_t * const p_sw,
> >  					   1);
> >  }
> >  
> > +/**********************************************************************
> > + **********************************************************************/
> > +
> > +/* greatest common divisor */
> > +static unsigned int
> > +_gcd(unsigned int a, unsigned int b)
> > +{
> > +	unsigned int t;
> > +	while (b != 0) {
> > +		t = b;
> > +		b = a % b;
> > +		a = t;
> > +	}
> > +	return a;
> > +}
> > +
> > +/* least common multiple */
> > +static unsigned int
> > +_lcm(unsigned int a, unsigned int b)
> > +{
> > +	return ((a*b) / _gcd(a,b));
> > +}
> >  
> >  /**********************************************************************
> >   **********************************************************************/
> > +
> >  uint8_t
> >  osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  			  IN osm_port_t * p_port,
> > @@ -315,7 +347,8 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  			  IN const boolean_t dor,
> >  			  IN OUT osm_switch_guid_count_t * remote_guids,
> >  			  IN OUT uint16_t * p_num_remote_guids,
> > -			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used)
> > +			  IN OUT osm_switch_guid_count_t ** p_remote_guid_count_used,
> > +			  IN uint16_t port_offsetting_lids_per_port)
> >  {
> >  	/*
> >  	   We support an enhanced LMC aware routing mode:
> > @@ -356,6 +389,20 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  	osm_node_t *p_rem_node;
> >  	osm_node_t *p_rem_node_first = NULL;
> >  	osm_switch_guid_count_t *p_remote_guid = NULL;
> > +	/*
> > +	   These vars track information for port offsetting.
> > +	 */
> > +	boolean_t port_offsetting = remote_guids && p_num_remote_guids
> > +		&& p_remote_guid_count_used && port_offsetting_lids_per_port;
> > +	struct osm_switch_remote_dest * remote_dests = NULL;
> > +	struct osm_switch_remote_dest * p_remote_dest = NULL;
> > +	uint32_t num_remote_dests = 0;
> > +	uint32_t total_paths_count = 0;
> > +	uint32_t num_potential_ports = 0;
> > +	uint32_t lcm = 0;
> > +	uint32_t indx = 0;
> > +	boolean_t dest_found = FALSE;
> > +	unsigned int i;
> >  
> >  	CL_ASSERT(lid_ho > 0);
> >  
> > @@ -378,9 +425,22 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  
> >  	num_ports = p_sw->num_ports;
> >  
> > +	if (port_offsetting) {
> > +		remote_dests = malloc(sizeof(struct osm_switch_remote_dest) * num_ports);
> > +		if (remote_dests == NULL) {
> > +			osm_log(p_sw->p_log, OSM_LOG_ERROR,
> > +				"osm_switch_recommend_path: "
> > +				"Cannot allocate array. Insufficient memory: "
> > +				"Disabling port_offsetting\n");
> > +			port_offsetting = 0;
> > +		}
> > +	}
> > +
> >  	least_hops = osm_switch_get_least_hops(p_sw, base_lid);
> > -	if (least_hops == OSM_NO_PATH)
> > -		return (OSM_NO_PATH);
> > +	if (least_hops == OSM_NO_PATH) {
> > +		best_port = OSM_NO_PATH;
> > +		goto Exit;
> > +	}
> >  
> >  	/*
> >  	   First, inquire with the forwarding table for an existing
> > @@ -417,8 +477,10 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  				   in the forwarding tables that he wants to be overridden by the
> >  				   minimum hop function.
> >  				 */
> > -				if (hops == least_hops)
> > -					return (port_num);
> > +				if (hops == least_hops) {
> > +					best_port = port_num;
> > +					goto Exit;
> > +				}
> >  			}
> >  		}
> >  	}
> 
> Maybe to move remote_dests allocation to be here and so minimize needed
> 'goto Exit' changes?
> 
> > @@ -475,7 +537,8 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  								       port_num);
> >  
> >  			/* If not update the least hops for this case */
> > -			if (!p_remote_guid) {
> > +			if (!p_remote_guid
> > +			    || !p_remote_guid->forwarded_to) {
> >  				if (check_count < least_paths_other_sys) {
> >  					least_paths_other_sys = check_count;
> >  					best_port_other_sys = port_num;
> > @@ -489,7 +552,8 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  										port_num);
> >  
> >  				/* If not update the least hops for this case */
> > -				if (!p_remote_guid
> > +				if ((!p_remote_guid
> > +				     || !p_remote_guid->forwarded_to)
> >  				    && check_count < least_paths_other_nodes) {
> >  					least_paths_other_nodes = check_count;
> >  					best_port_other_node = port_num;
> > @@ -498,6 +562,50 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  				/* else prior sys and node guid already used */
> >  
> >  			}	/* same sys found */
> > +
> > +			/* Store the new sys/node guid that we haven't seen yet */
> > +			if (!p_remote_guid) {
> > +				p_rem_physp = osm_physp_get_remote(p_physp);
> > +				p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
> > +				memcpy(&(remote_guids[*p_num_remote_guids].sys_guid),
> > +				       &(p_rem_node->node_info.sys_guid),
> > +				       sizeof(uint64_t));
> > +				memcpy(&(remote_guids[*p_num_remote_guids].node_guid),
> > +				       &(p_rem_node->node_info.node_guid),
> > +				       sizeof(uint64_t));
> > +				remote_guids[*p_num_remote_guids].forwarded_to = 0;
> > +				p_remote_guid = &remote_guids[*p_num_remote_guids];
> > +				(*p_num_remote_guids)++;
> > +			}
> > +		}
> > +
> > +		if (port_offsetting) {
> > +			/* Keep track of the destinations we've seen so far */
> > +			p_remote_dest = NULL;
> > +			for (i = 0; i < num_remote_dests; i++) {
> > +				if (!memcmp(p_remote_guid,
> > +					    (&remote_dests[i])->p_remote_guid,
> > +					    sizeof(struct osm_switch_remote_dest))) {
> 
> Should there be sizeof(osm_switch_guid_count_t)?
> 
> Sasha
> 
> > +					p_remote_dest = &remote_dests[i];
> > +					break;
> > +				}
> > +			}
> > +			if (!p_remote_dest) {
> > +				p_remote_dest = &remote_dests[num_remote_dests];
> > +				p_remote_dest->p_remote_guid = p_remote_guid;
> > +				p_remote_dest->total_paths = 0;
> > +				p_remote_dest->min_paths = 0xFFFFFFFF;
> > +				p_remote_dest->ports_count = 0;
> > +				num_remote_dests++;
> > +			}
> > +			p_remote_dest->total_paths += check_count;
> > +			if (check_count < p_remote_dest->min_paths)
> > +				p_remote_dest->min_paths = check_count;
> > +			p_remote_dest->ports[p_remote_dest->ports_count] = port_num;
> > +			p_remote_dest->ports_count++;
> > +
> > +			total_paths_count += check_count;
> > +			num_potential_ports++;
> >  		}
> >  
> >  		/* routing for LMC mode */
> > @@ -523,21 +631,23 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  			best_port = port_num;
> >  			least_paths = check_count;
> >  			if (routing_for_lmc
> > -			    && p_remote_guid
> > +			    && p_remote_guid->forwarded_to
> >  			    && p_remote_guid->forwarded_to < least_forwarded_to)
> >  				least_forwarded_to = p_remote_guid->forwarded_to;
> >  		}
> >  		else if (routing_for_lmc
> > -			 && p_remote_guid
> >  			 && check_count == least_paths
> > +			 && p_remote_guid->forwarded_to
> >  			 && p_remote_guid->forwarded_to < least_forwarded_to) {
> >  			least_forwarded_to = p_remote_guid->forwarded_to;
> >  			best_port = port_num;
> >  		}
> >  	}
> >  
> > -	if (port_found == FALSE)
> > -		return (OSM_NO_PATH);
> > +	if (port_found == FALSE) {
> > +		best_port = OSM_NO_PATH;
> > +		goto Exit;
> > +	}
> >  
> >  	/*
> >  	   if we are in enhanced routing mode and the best port is not
> > @@ -555,24 +665,110 @@ osm_switch_recommend_path(IN const osm_switch_t * const p_sw,
> >  							   remote_guids,
> >  							   p_num_remote_guids,
> >  							   best_port);
> > +		/* Must be stored */
> > +		CL_ASSERT(p_remote_guid);
> > +		*p_remote_guid_count_used = p_remote_guid;
> > +	}
> >  
> > -		if (!p_remote_guid) {
> > -			/* track the remote node and system of the port used. */
> > -			p_physp = osm_node_get_physp_ptr(p_sw->p_node, best_port);
> > -			p_rem_physp = osm_physp_get_remote(p_physp);
> > -			p_rem_node = osm_physp_get_node_ptr(p_rem_physp);
> > -			memcpy(&(remote_guids[*p_num_remote_guids].sys_guid),
> > -			       &(p_rem_node->node_info.sys_guid),
> > -			       sizeof(uint64_t));
> > -			memcpy(&(remote_guids[*p_num_remote_guids].node_guid),
> > -				       &(p_rem_node->node_info.node_guid),
> > -			       sizeof(uint64_t));
> > -			remote_guids[*p_num_remote_guids].forwarded_to = 0;
> > -			(*p_num_remote_guids)++;
> > +	/*
> > +	 * As an example of what we're trying to do with port
> > +	 * offsetting, assume LMC = 2 and we are trying to route
> > +	 * the lids of 4 ports. The lids of these 4 ports are:
> > +	 *
> > +	 * (1,2,3,4)
> > +	 * (5,6,7,8)
> > +	 * (9,10,11,12)
> > +	 * (13,14,15,16)
> > +	 *
> > +	 * Suppose forwarding to all these lids goes through
> > +	 * 4 specific switch ports.  If we just cycle through
> > +	 * ports and lids in a normal iterative fashion, we would
> > +	 * normally forward out ports in this manner.
> > +	 *
> > +	 * switch port 1: 1, 5, 9, 13
> > +	 * switch port 2: 2, 6, 10, 14
> > +	 * switch port 3: 3, 7, 11, 15
> > +	 * switch port 4: 4, 8, 12, 1
> > +	 *
> > +	 * Note that the base lid of each port (lids 1, 5, 9, 13)
> > +	 * are all routed out of switch port 1.  Thus, if the user
> > +	 * only uses the base lid of each port, they will get pretty
> > +	 * bad performance.  We will try to get this layout instead.
> > +	 *
> > +	 * switch port 1: 1, 8, 11, 14
> > +	 * switch port 2: 2, 5, 12, 15
> > +	 * switch port 3: 3, 6, 9,  16
> > +	 * switch port 4: 4, 7, 10, 13
> > +	 *
> > +	 * where switch ports are distributed in a more even manner.
> > +	 * The base lid of each port is now distributed evenly
> > +	 * across all 4 switch ports.  The remaining lids are still
> > +	 * distributed evenly across all the remaining switch ports.
> > +	 *
> > +	 * In order to accomplish this, we (effectively) will iterate
> > +	 * through all ports like before, but instead of iterating from
> > +	 * 0 to N-1 all the time, we will select the starting index
> > +	 * based on the number of paths we have routed thus far.
> > +	 */
> > +
> > +	/* We will not do port offsetting if num_potential_ports == 1
> > +	 * b/c there is no offsetting that can be done.
> > +	 */
> > +	if (port_offsetting
> > +	    && best_port
> > +	    && num_potential_ports > 1) {
> > +		/* Select which destination we want to forward to with our
> > +		 * offsetting loop.
> > +		 */
> > +		lcm = _lcm(port_offsetting_lids_per_port,
> > +			   num_remote_dests);
> > +		indx = (total_paths_count / lcm) % num_remote_dests;
> > +		for (i = 0; i < num_remote_dests; i++) {
> > +			p_remote_dest = &remote_dests[indx];
> > +			p_remote_guid = p_remote_dest->p_remote_guid;
> > +
> > +			if (p_remote_guid->forwarded_to == least_forwarded_to
> > +			    && p_remote_dest->min_paths == least_paths) {
> > +				dest_found = TRUE;
> > +				break;
> > +			}
> > +
> > +			indx++;
> > +			if (indx >= num_remote_dests)
> > +				indx = 0;
> > +		}
> > +
> > +		/* Then we "offset" within the potential ports we could
> > +		 * forward out of for this specific destination.
> > +		 */
> > +		if (dest_found && p_remote_dest) {
> > +			lcm = _lcm(port_offsetting_lids_per_port,
> > +				   p_remote_dest->ports_count);
> > +			indx = (p_remote_dest->total_paths / lcm) % p_remote_dest->ports_count;
> > +			for (i = 0; i < p_remote_dest->ports_count; i++) {
> > +				port_num = p_remote_dest->ports[indx];
> > +				check_count =
> > +					osm_port_prof_path_count_get(&p_sw->p_prof[port_num]);
> > +				if (check_count == least_paths) {
> > +					if (best_port != port_num)
> > +						osm_log(p_sw->p_log, OSM_LOG_DEBUG,
> > +							"osm_switch_recommend_path: "
> > +							"best port offsetted: %d to %d\n",
> > +							best_port, port_num);
> > +					best_port = port_num;
> > +					*p_remote_guid_count_used = p_remote_dest->p_remote_guid;
> > +					break;
> > +				}
> > +				indx++;
> > +				if (indx >= p_remote_dest->ports_count)
> > +					indx = 0;
> > +			}
> >  		}
> > -		*p_remote_guid_count_used = p_remote_guid;
> >  	}
> >  
> > +Exit:
> > +	if (remote_dests)
> > +		free(remote_dests);
> >  	return (best_port);
> >  }
> >  
> > diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
> > index 938db84..501c2c7 100644
> > --- a/opensm/opensm/osm_ucast_mgr.c
> > +++ b/opensm/opensm/osm_ucast_mgr.c
> > @@ -212,18 +212,23 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
> >  	osm_switch_guid_count_t *remote_guids = NULL;
> >  	uint16_t num_used_guids = 0;
> >  	osm_switch_guid_count_t *p_remote_guid_used = NULL;
> > +	uint16_t port_offsetting_lids_per_port = 0;
> >  
> >  	OSM_LOG_ENTER(p_mgr->p_log);
> >  
> >  	if (lids_per_port > 1) {
> > -		remote_guids = malloc(sizeof(osm_switch_guid_count_t) * lids_per_port);
> > +		uint8_t num_ports = p_sw->num_ports;
> > +		remote_guids = malloc(sizeof(osm_switch_guid_count_t) * num_ports);
> >  		if (remote_guids == NULL) {
> >  			osm_log(p_mgr->p_log, OSM_LOG_ERROR,
> >  				"__osm_ucast_mgr_process_port: ERR 3A09: "
> >  				"Cannot allocate array. Insufficient memory\n");
> >  			goto Exit;
> >  		}
> > -		memset(remote_guids, 0, sizeof(osm_switch_guid_count_t) * lids_per_port);
> > +		memset(remote_guids, 0, sizeof(osm_switch_guid_count_t) * num_ports);
> > +
> > +		if (p_mgr->p_subn->opt.port_offsetting)
> > +			port_offsetting_lids_per_port = lids_per_port;
> >  	}
> >  
> >  	osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
> > @@ -270,14 +275,15 @@ __osm_ucast_mgr_process_port(IN osm_ucast_mgr_t * const p_mgr,
> >  							 p_mgr->is_dor,
> >  							 remote_guids,
> >  							 &num_used_guids,
> > -							 &p_remote_guid_used);
> > +							 &p_remote_guid_used,
> > +							 port_offsetting_lids_per_port);
> >  		}
> >  		else
> >  			port = osm_switch_recommend_path(p_sw, p_port, lid_ho,
> >  							 p_mgr->p_subn->
> >  							 ignore_existing_lfts,
> >  							 p_mgr->is_dor,
> > -							 NULL, NULL, NULL);
> > +							 NULL, NULL, NULL, 0);
> >  
> >  		/*
> >  		   There might be no path to the target
> > -- 
> > 1.5.1
> > 
> 
-- 
Albert Chu
chu11 at llnl.gov
925-422-5311
Computer Scientist
High Performance Systems Division
Lawrence Livermore National Laboratory
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-move-remote_dests-array-allocation-down-to-remove-go.patch
Type: text/x-patch
Size: 2091 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20080603/7aa31eb8/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0005-fix-incorrect-memcmp-length.patch
Type: text/x-patch
Size: 870 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20080603/7aa31eb8/attachment-0001.bin>


More information about the general mailing list