[ofa-general] [PATCH 1/3] Enhanced Fat-Tree algorithm: Added balanced minhop within Fat-Tree Added qsort for choosing secondary routes order instead of simple index

Nicolas Morey Chaisemartin nicolas.morey-chaisemartin at ext.bull.net
Thu Mar 5 06:01:13 PST 2009


Signed-off-by: Nicolas Morey-Chaisemartin <nicolas.morey-chaisemartin at ext.bull.net>
---
 opensm/opensm/osm_ucast_ftree.c |  169 ++++++++++++++++++++++++---------------
 1 files changed, 106 insertions(+), 63 deletions(-)

diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c
index d92265b..da72ade 100644
--- a/opensm/opensm/osm_ucast_ftree.c
+++ b/opensm/opensm/osm_ucast_ftree.c
@@ -152,6 +152,7 @@ typedef struct ftree_port_group_t_ {
 	boolean_t is_cn;	/* whether this port is a compute node */
 	boolean_t is_io;	/* whether this port is an I/O node */
 	uint32_t counter_down;	/* number of allocated routs downwards */
+	uint32_t counter_up;	/* number of allocated routs downwards */
 } ftree_port_group_t;
 
 /***************************************************
@@ -1903,6 +1904,27 @@ static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
 				    p_sw->p_osm_sw);
 }
 
+
+
+static int
+__osm_ftree_port_group_compare_load_down(const void* p1,const void* p2){
+	int temp = (*((ftree_port_group_t**)p1))->counter_down -(*((ftree_port_group_t**)p2))->counter_down ;
+	if(temp > 0)
+		return 1;
+	if(temp < 0)
+		return -1;
+	return 0;
+}
+
+static int
+__osm_ftree_port_group_compare_load_up(const void* p1,const void* p2){
+	int temp = (*((ftree_port_group_t**)p1))->counter_up -(*((ftree_port_group_t**)p2))->counter_up ;
+	if(temp > 0)
+		return 1;
+	if(temp < 0)
+		return -1;
+	return 0;
+}
 /***************************************************
  ***************************************************/
 
@@ -1935,10 +1957,10 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 	ftree_port_group_t *p_group;
 	ftree_port_t *p_port;
 	ftree_port_t *p_min_port;
-	uint16_t i;
 	uint16_t j;
 	uint16_t k;
 	boolean_t created_route = FALSE;
+	boolean_t routed=0;
 
 	/* we shouldn't enter here if both real_lid and main_path are false */
 	CL_ASSERT(is_real_lid || is_main_path);
@@ -1948,11 +1970,13 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 		return FALSE;
 
 	/* foreach down-going port group (in indexing order) */
-	i = p_sw->down_port_groups_idx;
+
+	qsort(p_sw->down_port_groups,p_sw->down_port_groups_num,sizeof(*(p_sw->down_port_groups)), __osm_ftree_port_group_compare_load_up);
+
+
 	for (k = 0; k < p_sw->down_port_groups_num; k++) {
 
-		p_group = p_sw->down_port_groups[i];
-		i = (i + 1) % p_sw->down_port_groups_num;
+		p_group = p_sw->down_port_groups[k];
 
 		/* If this port group doesn't point to a switch, mark
 		   that the route was created and skip to the next group */
@@ -2002,6 +2026,10 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 				cl_ntoh16(p_group->base_lid),
 				__osm_ftree_tuple_to_str(p_sw->tuple),
 				cl_ntoh16(p_group->remote_base_lid));
+			/* We skip only if we have come through a longer path */
+			if(((target_rank - highest_rank_in_route) +
+			    (p_remote_sw->rank - highest_rank_in_route) + 2*reverse_hops) >=
+			    osm_switch_get_least_hops(p_remote_sw->p_osm_sw, cl_ntoh16(target_lid)))
 			continue;
 		}
 
@@ -2034,10 +2062,6 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 
 		/* second case: skip the port group if the remote (lower)
 		   switch has been already configured for this target LID */
-		if (is_real_lid && !is_main_path &&
-		    p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] !=
-		    OSM_NO_PATH)
-			continue;
 
 		/* setting fwd tbl port only if this is real LID */
 		if (is_real_lid) {
@@ -2073,19 +2097,26 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 		   the upper side of the link (on switch with lower rank).
 		   Counter is promoted only if we're routing LID on the main
 		   path (whether it's a real LID or a dummy one). */
-		if (is_main_path)
-			p_min_port->counter_up++;
+		/*		if (is_main_path)
+		  p_min_port->counter_up++;*/
 
 		/* Recursion step:
 		   Assign upgoing ports by stepping down, starting on REMOTE switch */
-		created_route |= __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw,	/* remote switch - used as a route-upgoing alg. start point */
-										NULL,	/* prev. position - NULL to mark that we went down and not up */
-										target_lid,	/* LID that we're routing to */
-										target_rank,	/* rank of the LID that we're routing to */
-										is_real_lid,	/* whether the target LID is real or dummy */
-										is_main_path,	/* whether this is path to HCA that should by tracked by counters */
-										highest_rank_in_route, reverse_hops);	/* highest visited point in the tree before going down */
-	}
+		routed =
+		    __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree,
+								   p_remote_sw,	/* remote switch - used as a route-upgoing alg. start point */
+								   NULL,	/* prev. position - NULL to mark that we went down and not up */
+								   target_lid,	/* LID that we're routing to */
+								   target_rank,	/* rank of the LID that we're routing to */
+								   is_real_lid,	/* whether the target LID is real or dummy */
+								   is_main_path,	/* whether this is path to HCA that should by tracked by counters */
+								   highest_rank_in_route, reverse_hops);	/* highest visited point in the tree before going down */
+		created_route|=routed;
+		if(routed)
+			p_min_port->counter_up++;
+			p_group->counter_up++;
+		}
+
 	/* done scanning all the down-going port groups */
 
 	/* if the route was created, promote the index that
@@ -2099,6 +2130,8 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
 	return created_route;
 }				/* __osm_ftree_fabric_route_upgoing_by_going_down() */
 
+
+
 /***************************************************/
 
 /*
@@ -2112,7 +2145,7 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
  *    assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
  */
 
-static void
+static boolean_t
 __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 					       IN ftree_sw_t * p_sw,
 					       IN ftree_sw_t * p_prev_sw,
@@ -2131,12 +2164,14 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 	ftree_port_t *p_min_port;
 	uint16_t i;
 	uint16_t j;
+	boolean_t created_route = FALSE;
+	boolean_t routed = FALSE;
 
 	/* we shouldn't enter here if both real_lid and main_path are false */
 	CL_ASSERT(is_real_lid || is_main_path);
 
 	/* Assign upgoing ports by stepping down, starting on THIS switch */
-	__osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw,	/* local switch - used as a route-upgoing alg. start point */
+	created_route = __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw,	/* local switch - used as a route-upgoing alg. start point */
 						       p_prev_sw,	/* switch that we went up from (NULL means that we went down) */
 						       target_lid,	/* LID that we're routing to */
 						       target_rank,	/* rank of the LID that we're routing to */
@@ -2151,8 +2186,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 			/* We go up by going down as we have some reverse_hop_credit left */
 			/* We use the index to scatter a bit the reverse up routes */
 			p_sw->down_port_groups_idx =
-			    (p_sw->down_port_groups_idx +
-			     1) % p_sw->down_port_groups_num;
+				(p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num;
 			i = p_sw->down_port_groups_idx;
 			for (j = 0; j < p_sw->down_port_groups_num; j++) {
 
@@ -2160,39 +2194,30 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 				i = (i + 1) % p_sw->down_port_groups_num;
 
 				/* Skip this port group unless it points to a switch */
-				if (p_group->remote_node_type !=
-				    IB_NODE_TYPE_SWITCH)
+				if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
 					continue;
 				p_remote_sw = p_group->remote_hca_or_sw.p_sw;
 
-				__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
-									       p_sw,	/* this switch - prev. position switch for the function */
-									       target_lid,	/* LID that we're routing to */
-									       target_rank,	/* rank of the LID that we're routing to */
-									       is_real_lid,	/* whether this target LID is real or dummy */
-									       is_main_path,	/* whether this is path to HCA that should by tracked by counters */
-									       reverse_hop_credit - 1,	/* Remaining reverse_hops allowed */
-									       reverse_hops + 1);	/* Number of reverse_hops done up to this point */
+				created_route |=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
+													p_sw,	/* this switch - prev. position switch for the function */
+													target_lid,	/* LID that we're routing to */
+													target_rank,	/* rank of the LID that we're routing to */
+													is_real_lid,	/* whether this target LID is real or dummy */
+													is_main_path,	/* whether this is path to HCA that should by tracked by counters */
+													reverse_hop_credit - 1,	/* Remaining reverse_hops allowed */
+													reverse_hops + 1);	/* Number of reverse_hops done up to this point */
 			}
 
 		}
-		return;
+		return created_route;
 	}
 
-	/* Find the least loaded upgoing port group */
-	p_min_group = NULL;
-	for (i = 0; i < p_sw->up_port_groups_num; i++) {
-		p_group = p_sw->up_port_groups[i];
-		if (!p_min_group) {
-			/* first group that we're checking - use
-			   it as a group with the lowest load */
-			p_min_group = p_group;
-		} else if (p_group->counter_down < p_min_group->counter_down) {
-			/* this group is less loaded - use it as min */
-			p_min_group = p_group;
-		}
-	}
+	/* We should generate a list of port sorted by load so we can find easily the least
+	 * going port and explore the other pots on secondary routes more easily (and quickly) */
+	qsort(p_sw->up_port_groups,p_sw->up_port_groups_num,sizeof(*(p_sw->up_port_groups)), __osm_ftree_port_group_compare_load_down);
+
 
+	p_min_group = p_sw->up_port_groups[0];
 	/* Find the least loaded upgoing port in the selected group */
 	p_min_port = NULL;
 	ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
@@ -2297,7 +2322,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 
 		/* Recursion step:
 		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
-		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
+		created_route|=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
 							       p_sw,	/* this switch - prev. position switch for the function */
 							       target_lid,	/* LID that we're routing to */
 							       target_rank,	/* rank of the LID that we're routing to */
@@ -2309,7 +2334,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 
 	/* we're done for the third case */
 	if (!is_real_lid)
-		return;
+		return created_route;
 
 	/* What's left to do at this point:
 	 *
@@ -2343,14 +2368,15 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 	 *         - go UP(TRUE,FALSE) to the remote switch
 	 */
 
-	for (i = 0; i < p_sw->up_port_groups_num; i++) {
+	for (i = 1; i < p_sw->up_port_groups_num; i++) {
 		p_group = p_sw->up_port_groups[i];
 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
 
-		/* skip if target lid has been already set on remote switch fwd tbl */
-		if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] !=
-		    OSM_NO_PATH)
-			continue;
+		/* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count)*/
+		if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
+			if((target_rank -p_remote_sw->rank + 2*reverse_hops) >=
+			   osm_switch_get_least_hops(p_remote_sw->p_osm_sw, cl_ntoh16(target_lid)))
+				continue;
 
 		if (p_sw->is_leaf) {
 			OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
@@ -2365,8 +2391,23 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 		   We can safely assume that switch will initiate very
 		   few traffic, so there's no point waisting runtime on
 		   trying to balance these routes - always pick port 0. */
+		/* GET MIN PORT HERE */
+		p_min_port = NULL;
+		ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
+		for (j = 0; j < ports_num; j++) {
+			cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
+			if (!p_min_port) {
+				/* first port that we're checking - use
+				   it as a port with the lowest load */
+				p_min_port = p_port;
+			} else if (p_port->counter_down < p_min_port->counter_down) {
+				/* this port is less loaded - use it as min */
+				p_min_port = p_port;
+			}
+		}
 
-		cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
+		p_port = p_min_port;
+		//cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
 		p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
 		    p_port->remote_port_num;
 
@@ -2387,7 +2428,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 
 		/* Recursion step:
 		   Assign downgoing ports by stepping up, starting on REMOTE switch. */
-		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
+		routed = __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
 							       p_sw,	/* this switch - prev. position switch for the function */
 							       target_lid,	/* LID that we're routing to */
 							       target_rank,	/* rank of the LID that we're routing to */
@@ -2395,16 +2436,17 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 							       FALSE,	/* whether this is path to HCA that should by tracked by counters */
 							       reverse_hop_credit,	/* Remaining reverse_hops allowed */
 							       reverse_hops);	/* Number of reverse_hops done up to this point */
+                created_route|=routed;
 	}
 
-	/* If we don't have any reverse hop credits, we are done */
-	if (reverse_hop_credit == 0)
-		return;
+       /* If we don't have any reverse hop credits, we are done */
+       if(reverse_hop_credit==0)
+              return created_route;
 
-	/* We explore all the down group ports */
-	/* We try to reverse jump for each of them */
-	/* They already have a route to us from the upgoing_by_going_down started earlier */
-	/* This is only so it'll continue exploring up, after this step backwards */
+       /* We explore all the down group ports */
+       /* We try to reverse jump for each of them */
+       /* They already have a route to us from the upgoing_by_going_down started earlier */
+       /* This is only so it'll continue exploring up, after this step backwards*/
 	for (i = 0; i < p_sw->down_port_groups_num; i++) {
 		p_group = p_sw->down_port_groups[i];
 		p_remote_sw = p_group->remote_hca_or_sw.p_sw;
@@ -2415,7 +2457,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 
 		/* Recursion step:
 		   Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
-		__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
+		created_route |=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw,	/* remote switch - used as a route-downgoing alg. next step point */
 							       p_sw,	/* this switch - prev. position switch for the function */
 							       target_lid,	/* LID that we're routing to */
 							       target_rank,	/* rank of the LID that we're routing to */
@@ -2424,6 +2466,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
 							       reverse_hop_credit - 1,	/* Remaining reverse_hops allowed */
 							       reverse_hops + 1);	/* Number of reverse_hops done up to this point */
 	}
+	return created_route;
 
 }				/* ftree_fabric_route_downgoing_by_going_up() */
 
-- 
1.6.2-rc2.GIT





More information about the general mailing list