[ofa-general] [PATCH 1/3] Enhanced Fat-Tree algorithm: Added balanced minhop within Fat-Tree Added qsort for choosing secondary routes order instead of simple index
Nicolas Morey Chaisemartin
nicolas.morey-chaisemartin at ext.bull.net
Thu Mar 5 06:01:13 PST 2009
Signed-off-by: Nicolas Morey-Chaisemartin <nicolas.morey-chaisemartin at ext.bull.net>
---
opensm/opensm/osm_ucast_ftree.c | 169 ++++++++++++++++++++++++---------------
1 files changed, 106 insertions(+), 63 deletions(-)
diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c
index d92265b..da72ade 100644
--- a/opensm/opensm/osm_ucast_ftree.c
+++ b/opensm/opensm/osm_ucast_ftree.c
@@ -152,6 +152,7 @@ typedef struct ftree_port_group_t_ {
boolean_t is_cn; /* whether this port is a compute node */
boolean_t is_io; /* whether this port is an I/O node */
uint32_t counter_down; /* number of allocated routs downwards */
+ uint32_t counter_up; /* number of allocated routs downwards */
} ftree_port_group_t;
/***************************************************
@@ -1903,6 +1904,27 @@ static void __osm_ftree_set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
p_sw->p_osm_sw);
}
+
+
+static int
+__osm_ftree_port_group_compare_load_down(const void* p1,const void* p2){
+ int temp = (*((ftree_port_group_t**)p1))->counter_down -(*((ftree_port_group_t**)p2))->counter_down ;
+ if(temp > 0)
+ return 1;
+ if(temp < 0)
+ return -1;
+ return 0;
+}
+
+static int
+__osm_ftree_port_group_compare_load_up(const void* p1,const void* p2){
+ int temp = (*((ftree_port_group_t**)p1))->counter_up -(*((ftree_port_group_t**)p2))->counter_up ;
+ if(temp > 0)
+ return 1;
+ if(temp < 0)
+ return -1;
+ return 0;
+}
/***************************************************
***************************************************/
@@ -1935,10 +1957,10 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
ftree_port_group_t *p_group;
ftree_port_t *p_port;
ftree_port_t *p_min_port;
- uint16_t i;
uint16_t j;
uint16_t k;
boolean_t created_route = FALSE;
+ boolean_t routed=0;
/* we shouldn't enter here if both real_lid and main_path are false */
CL_ASSERT(is_real_lid || is_main_path);
@@ -1948,11 +1970,13 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
return FALSE;
/* foreach down-going port group (in indexing order) */
- i = p_sw->down_port_groups_idx;
+
+ qsort(p_sw->down_port_groups,p_sw->down_port_groups_num,sizeof(*(p_sw->down_port_groups)), __osm_ftree_port_group_compare_load_up);
+
+
for (k = 0; k < p_sw->down_port_groups_num; k++) {
- p_group = p_sw->down_port_groups[i];
- i = (i + 1) % p_sw->down_port_groups_num;
+ p_group = p_sw->down_port_groups[k];
/* If this port group doesn't point to a switch, mark
that the route was created and skip to the next group */
@@ -2002,6 +2026,10 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
cl_ntoh16(p_group->base_lid),
__osm_ftree_tuple_to_str(p_sw->tuple),
cl_ntoh16(p_group->remote_base_lid));
+ /* We skip only if we have come through a longer path */
+ if(((target_rank - highest_rank_in_route) +
+ (p_remote_sw->rank - highest_rank_in_route) + 2*reverse_hops) >=
+ osm_switch_get_least_hops(p_remote_sw->p_osm_sw, cl_ntoh16(target_lid)))
continue;
}
@@ -2034,10 +2062,6 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
/* second case: skip the port group if the remote (lower)
switch has been already configured for this target LID */
- if (is_real_lid && !is_main_path &&
- p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] !=
- OSM_NO_PATH)
- continue;
/* setting fwd tbl port only if this is real LID */
if (is_real_lid) {
@@ -2073,19 +2097,26 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
the upper side of the link (on switch with lower rank).
Counter is promoted only if we're routing LID on the main
path (whether it's a real LID or a dummy one). */
- if (is_main_path)
- p_min_port->counter_up++;
+ /* if (is_main_path)
+ p_min_port->counter_up++;*/
/* Recursion step:
Assign upgoing ports by stepping down, starting on REMOTE switch */
- created_route |= __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */
- NULL, /* prev. position - NULL to mark that we went down and not up */
- target_lid, /* LID that we're routing to */
- target_rank, /* rank of the LID that we're routing to */
- is_real_lid, /* whether the target LID is real or dummy */
- is_main_path, /* whether this is path to HCA that should by tracked by counters */
- highest_rank_in_route, reverse_hops); /* highest visited point in the tree before going down */
- }
+ routed =
+ __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree,
+ p_remote_sw, /* remote switch - used as a route-upgoing alg. start point */
+ NULL, /* prev. position - NULL to mark that we went down and not up */
+ target_lid, /* LID that we're routing to */
+ target_rank, /* rank of the LID that we're routing to */
+ is_real_lid, /* whether the target LID is real or dummy */
+ is_main_path, /* whether this is path to HCA that should by tracked by counters */
+ highest_rank_in_route, reverse_hops); /* highest visited point in the tree before going down */
+ created_route|=routed;
+ if(routed)
+ p_min_port->counter_up++;
+ p_group->counter_up++;
+ }
+
/* done scanning all the down-going port groups */
/* if the route was created, promote the index that
@@ -2099,6 +2130,8 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
return created_route;
} /* __osm_ftree_fabric_route_upgoing_by_going_down() */
+
+
/***************************************************/
/*
@@ -2112,7 +2145,7 @@ __osm_ftree_fabric_route_upgoing_by_going_down(IN ftree_fabric_t * p_ftree,
* assign-down-going-port-by-ascending-up on REMOTE switch (recursion)
*/
-static void
+static boolean_t
__osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
IN ftree_sw_t * p_sw,
IN ftree_sw_t * p_prev_sw,
@@ -2131,12 +2164,14 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
ftree_port_t *p_min_port;
uint16_t i;
uint16_t j;
+ boolean_t created_route = FALSE;
+ boolean_t routed = FALSE;
/* we shouldn't enter here if both real_lid and main_path are false */
CL_ASSERT(is_real_lid || is_main_path);
/* Assign upgoing ports by stepping down, starting on THIS switch */
- __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */
+ created_route = __osm_ftree_fabric_route_upgoing_by_going_down(p_ftree, p_sw, /* local switch - used as a route-upgoing alg. start point */
p_prev_sw, /* switch that we went up from (NULL means that we went down) */
target_lid, /* LID that we're routing to */
target_rank, /* rank of the LID that we're routing to */
@@ -2151,8 +2186,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
/* We go up by going down as we have some reverse_hop_credit left */
/* We use the index to scatter a bit the reverse up routes */
p_sw->down_port_groups_idx =
- (p_sw->down_port_groups_idx +
- 1) % p_sw->down_port_groups_num;
+ (p_sw->down_port_groups_idx + 1) % p_sw->down_port_groups_num;
i = p_sw->down_port_groups_idx;
for (j = 0; j < p_sw->down_port_groups_num; j++) {
@@ -2160,39 +2194,30 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
i = (i + 1) % p_sw->down_port_groups_num;
/* Skip this port group unless it points to a switch */
- if (p_group->remote_node_type !=
- IB_NODE_TYPE_SWITCH)
+ if (p_group->remote_node_type != IB_NODE_TYPE_SWITCH)
continue;
p_remote_sw = p_group->remote_hca_or_sw.p_sw;
- __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
- p_sw, /* this switch - prev. position switch for the function */
- target_lid, /* LID that we're routing to */
- target_rank, /* rank of the LID that we're routing to */
- is_real_lid, /* whether this target LID is real or dummy */
- is_main_path, /* whether this is path to HCA that should by tracked by counters */
- reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
- reverse_hops + 1); /* Number of reverse_hops done up to this point */
+ created_route |=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
+ p_sw, /* this switch - prev. position switch for the function */
+ target_lid, /* LID that we're routing to */
+ target_rank, /* rank of the LID that we're routing to */
+ is_real_lid, /* whether this target LID is real or dummy */
+ is_main_path, /* whether this is path to HCA that should by tracked by counters */
+ reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
+ reverse_hops + 1); /* Number of reverse_hops done up to this point */
}
}
- return;
+ return created_route;
}
- /* Find the least loaded upgoing port group */
- p_min_group = NULL;
- for (i = 0; i < p_sw->up_port_groups_num; i++) {
- p_group = p_sw->up_port_groups[i];
- if (!p_min_group) {
- /* first group that we're checking - use
- it as a group with the lowest load */
- p_min_group = p_group;
- } else if (p_group->counter_down < p_min_group->counter_down) {
- /* this group is less loaded - use it as min */
- p_min_group = p_group;
- }
- }
+ /* We should generate a list of port sorted by load so we can find easily the least
+ * going port and explore the other pots on secondary routes more easily (and quickly) */
+ qsort(p_sw->up_port_groups,p_sw->up_port_groups_num,sizeof(*(p_sw->up_port_groups)), __osm_ftree_port_group_compare_load_down);
+
+ p_min_group = p_sw->up_port_groups[0];
/* Find the least loaded upgoing port in the selected group */
p_min_port = NULL;
ports_num = (uint16_t) cl_ptr_vector_get_size(&p_min_group->ports);
@@ -2297,7 +2322,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
/* Recursion step:
Assign downgoing ports by stepping up, starting on REMOTE switch. */
- __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
+ created_route|=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
p_sw, /* this switch - prev. position switch for the function */
target_lid, /* LID that we're routing to */
target_rank, /* rank of the LID that we're routing to */
@@ -2309,7 +2334,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
/* we're done for the third case */
if (!is_real_lid)
- return;
+ return created_route;
/* What's left to do at this point:
*
@@ -2343,14 +2368,15 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
* - go UP(TRUE,FALSE) to the remote switch
*/
- for (i = 0; i < p_sw->up_port_groups_num; i++) {
+ for (i = 1; i < p_sw->up_port_groups_num; i++) {
p_group = p_sw->up_port_groups[i];
p_remote_sw = p_group->remote_hca_or_sw.p_sw;
- /* skip if target lid has been already set on remote switch fwd tbl */
- if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] !=
- OSM_NO_PATH)
- continue;
+ /* skip if target lid has been already set on remote switch fwd tbl (with a bigger hop count)*/
+ if (p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] != OSM_NO_PATH)
+ if((target_rank -p_remote_sw->rank + 2*reverse_hops) >=
+ osm_switch_get_least_hops(p_remote_sw->p_osm_sw, cl_ntoh16(target_lid)))
+ continue;
if (p_sw->is_leaf) {
OSM_LOG(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
@@ -2365,8 +2391,23 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
We can safely assume that switch will initiate very
few traffic, so there's no point waisting runtime on
trying to balance these routes - always pick port 0. */
+ /* GET MIN PORT HERE */
+ p_min_port = NULL;
+ ports_num = (uint16_t) cl_ptr_vector_get_size(&p_group->ports);
+ for (j = 0; j < ports_num; j++) {
+ cl_ptr_vector_at(&p_group->ports, j, (void *)&p_port);
+ if (!p_min_port) {
+ /* first port that we're checking - use
+ it as a port with the lowest load */
+ p_min_port = p_port;
+ } else if (p_port->counter_down < p_min_port->counter_down) {
+ /* this port is less loaded - use it as min */
+ p_min_port = p_port;
+ }
+ }
- cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
+ p_port = p_min_port;
+ //cl_ptr_vector_at(&p_group->ports, 0, (void *)&p_port);
p_remote_sw->p_osm_sw->new_lft[cl_ntoh16(target_lid)] =
p_port->remote_port_num;
@@ -2387,7 +2428,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
/* Recursion step:
Assign downgoing ports by stepping up, starting on REMOTE switch. */
- __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
+ routed = __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
p_sw, /* this switch - prev. position switch for the function */
target_lid, /* LID that we're routing to */
target_rank, /* rank of the LID that we're routing to */
@@ -2395,16 +2436,17 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
FALSE, /* whether this is path to HCA that should by tracked by counters */
reverse_hop_credit, /* Remaining reverse_hops allowed */
reverse_hops); /* Number of reverse_hops done up to this point */
+ created_route|=routed;
}
- /* If we don't have any reverse hop credits, we are done */
- if (reverse_hop_credit == 0)
- return;
+ /* If we don't have any reverse hop credits, we are done */
+ if(reverse_hop_credit==0)
+ return created_route;
- /* We explore all the down group ports */
- /* We try to reverse jump for each of them */
- /* They already have a route to us from the upgoing_by_going_down started earlier */
- /* This is only so it'll continue exploring up, after this step backwards */
+ /* We explore all the down group ports */
+ /* We try to reverse jump for each of them */
+ /* They already have a route to us from the upgoing_by_going_down started earlier */
+ /* This is only so it'll continue exploring up, after this step backwards*/
for (i = 0; i < p_sw->down_port_groups_num; i++) {
p_group = p_sw->down_port_groups[i];
p_remote_sw = p_group->remote_hca_or_sw.p_sw;
@@ -2415,7 +2457,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
/* Recursion step:
Assign downgoing ports by stepping up, fter doing one step down starting on REMOTE switch. */
- __osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
+ created_route |=__osm_ftree_fabric_route_downgoing_by_going_up(p_ftree, p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
p_sw, /* this switch - prev. position switch for the function */
target_lid, /* LID that we're routing to */
target_rank, /* rank of the LID that we're routing to */
@@ -2424,6 +2466,7 @@ __osm_ftree_fabric_route_downgoing_by_going_up(IN ftree_fabric_t * p_ftree,
reverse_hop_credit - 1, /* Remaining reverse_hops allowed */
reverse_hops + 1); /* Number of reverse_hops done up to this point */
}
+ return created_route;
} /* ftree_fabric_route_downgoing_by_going_up() */
--
1.6.2-rc2.GIT
More information about the general
mailing list