[ofa-general] [PATCH] osm: fat-tree optimization - improved ranking

Yevgeny Kliteynik kliteyn at dev.mellanox.co.il
Mon May 14 06:02:31 PDT 2007


Hi Hal,

This patch optimizes fabric ranking.
All the leaf switches are marked with rank and added to the BFS list,
and only then ranking of rest of the fabric begins.

I actually thought that this is the way I've originally
implemented it, as I mentioned in the patch that was dealing 
with 8 and 16 bit integers :)

Similar optimization may be applicable to up/dn routing - the roots
should be marked with rank 0 and only then ranking of rest of the 
switches should begin, but frankly, it practically doesn't reduce
the routing time, because ranking is only a small fraction of the 
routing runtime (I checked it on a 4K+ subnet).

In case of fat-tree I'm going to need it anyway when I enhance
the routing to consider only subset of HCAs in the routing balancing
(compute nodes vs. management nodes).

Please apply to master.

-- Yevgeny

Signed-off-by:  Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il>

>From dfa455f86d9ac48ff5cefd38a009718e5aeab1f9 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il>
Date: Mon, 14 May 2007 15:45:00 +0300
Subject: [PATCH] DELETE AFTER UPDATE: ranking

Signed-off-by: Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il>
---
 osm/opensm/osm_ucast_ftree.c |   83 +++++++++++++++++++++++++-----------------
 1 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/osm/opensm/osm_ucast_ftree.c b/osm/opensm/osm_ucast_ftree.c
index 3bad2fc..84da3d7 100644
--- a/osm/opensm/osm_ucast_ftree.c
+++ b/osm/opensm/osm_ucast_ftree.c
@@ -2406,10 +2406,24 @@ __osm_ftree_fabric_populate_nodes(
 /***************************************************
  ***************************************************/
 
+static boolean_t 
+__osm_ftree_sw_update_rank(
+   IN  ftree_sw_t  * p_sw,
+   IN  uint32_t      new_rank)
+{
+   if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank <= new_rank)
+      return FALSE;
+   p_sw->rank = new_rank;
+   return TRUE;
+
+}
+
+/***************************************************/
+
 static void
-__osm_ftree_rank_from_switch(
+__osm_ftree_rank_switches_from_leafs(
    IN  ftree_fabric_t * p_ftree, 
-   IN  ftree_sw_t *     p_starting_sw)
+   IN  cl_list_t      * p_ranking_bfs_list)
 {
    ftree_sw_t   * p_sw;
    ftree_sw_t   * p_remote_sw;
@@ -2417,19 +2431,11 @@ __osm_ftree_rank_from_switch(
    osm_node_t   * p_remote_node;
    osm_physp_t  * p_osm_port;
    uint8_t        i;
-   cl_list_t      bfs_list;
    ftree_sw_tbl_element_t * p_sw_tbl_element = NULL;
 
-   p_starting_sw->rank = 0;
-
-   /* Run BFS scan of the tree, starting from this switch */
-
-   cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
-   cl_list_insert_tail(&bfs_list, &__osm_ftree_sw_tbl_element_create(p_starting_sw)->map_item);
-
-   while (!cl_is_list_empty(&bfs_list))
+   while (!cl_is_list_empty(p_ranking_bfs_list))
    {
-      p_sw_tbl_element = (ftree_sw_tbl_element_t *)cl_list_remove_head(&bfs_list);
+      p_sw_tbl_element = (ftree_sw_tbl_element_t *) cl_list_remove_head(p_ranking_bfs_list);
       p_sw = p_sw_tbl_element->p_sw;
       __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);
 
@@ -2457,26 +2463,23 @@ __osm_ftree_rank_from_switch(
             /* remote node is not a switch */
             continue;
          }
-         if (__osm_ftree_sw_ranked(p_remote_sw) && p_remote_sw->rank <= (p_sw->rank + 1))
-            continue;
 
-         /* rank the remote switch and add it to the BFS list */
-         p_remote_sw->rank = p_sw->rank + 1;
-         cl_list_insert_tail(&bfs_list, 
-                             &__osm_ftree_sw_tbl_element_create(p_remote_sw)->map_item);
+         /* if needed, rank the remote switch and add it to the BFS list */
+         if (__osm_ftree_sw_update_rank(p_remote_sw, p_sw->rank + 1))
+            cl_list_insert_tail(p_ranking_bfs_list, 
+                                &__osm_ftree_sw_tbl_element_create(p_remote_sw)->map_item);
       }
    }
-   cl_list_destroy(&bfs_list);
-} /* __osm_ftree_rank_from_switch() */
 
+} /* __osm_ftree_rank_switches_from_leafs() */
 
-/***************************************************
- ***************************************************/
+/***************************************************/
 
 static int 
-__osm_ftree_rank_switches_from_hca(
+__osm_ftree_rank_leaf_switches(
    IN  ftree_fabric_t * p_ftree,
-   IN  ftree_hca_t    * p_hca)
+   IN  ftree_hca_t    * p_hca,
+   IN  cl_list_t      * p_ranking_bfs_list)
 {
    ftree_sw_t     * p_sw;
    osm_node_t     * p_osm_node = p_hca->p_osm_node;
@@ -2502,7 +2505,7 @@ __osm_ftree_rank_switches_from_hca(
          case IB_NODE_TYPE_CA:
             /* HCA connected directly to another HCA - not FatTree */
             osm_log(&p_ftree->p_osm->log, OSM_LOG_ERROR,
-                    "__osm_ftree_rank_switches_from_hca: ERR AB0F: "
+                    "__osm_ftree_rank_leaf_switches: ERR AB0F: "
                     "HCA conected directly to another HCA: "
                     "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
                     cl_ntoh64(osm_node_get_node_guid(p_hca->p_osm_node)),
@@ -2520,7 +2523,7 @@ __osm_ftree_rank_switches_from_hca(
 
          default:
             osm_log(&p_ftree->p_osm->log, OSM_LOG_ERROR,
-                    "__osm_ftree_rank_switches_from_hca: ERR AB10: "
+                    "__osm_ftree_rank_leaf_switches: ERR AB10: "
                     "Node GUID 0x%016" PRIx64 " - Unknown node type: %s\n",
                     cl_ntoh64(osm_node_get_node_guid(p_remote_osm_node)),
                     ib_get_node_type_str(osm_node_get_type(p_remote_osm_node)));
@@ -2535,11 +2538,12 @@ __osm_ftree_rank_switches_from_hca(
 
       CL_ASSERT(p_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl));
 
-      if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank == 0)
+      if ( !__osm_ftree_sw_update_rank(p_sw,0) )
          continue;
 
+      /* if needed, rank the remote switch and add it to the BFS list */
       osm_log(&p_ftree->p_osm->log, OSM_LOG_DEBUG,
-              "__osm_ftree_rank_switches_from_hca: "
+              "__osm_ftree_rank_leaf_switches: "
               "Marking rank of switch that is directly connected to HCA:\n"
               "                                            - HCA guid   : 0x%016" PRIx64 "\n"
               "                                            - Switch guid: 0x%016" PRIx64 "\n"
@@ -2547,13 +2551,14 @@ __osm_ftree_rank_switches_from_hca(
               cl_ntoh64(osm_node_get_node_guid(p_hca->p_osm_node)),
               cl_ntoh64(osm_node_get_node_guid(p_sw->p_osm_sw->p_node)),
               cl_ntoh16(p_sw->base_lid));
-      __osm_ftree_rank_from_switch(p_ftree, p_sw);
+      cl_list_insert_tail(p_ranking_bfs_list, 
+                          &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
    }
 
  Exit:
    OSM_LOG_EXIT(&p_ftree->p_osm->log);
    return res;
-} /* __osm_ftree_rank_switches_from_hca() */
+} /* __osm_ftree_rank_leaf_switches() */
 
 /***************************************************/
 
@@ -2789,18 +2794,21 @@ __osm_ftree_fabric_construct_sw_ports(
 /***************************************************
  ***************************************************/
 
-/* ToDo: improve ranking algorithm complexity
-   by propogating BFS from more nodes */ 
 static int
 __osm_ftree_fabric_perform_ranking(
    IN  ftree_fabric_t * p_ftree)
 {
    ftree_hca_t * p_hca;
    ftree_hca_t * p_next_hca;
+   cl_list_t     ranking_bfs_list;
    int res = 0;
 
    OSM_LOG_ENTER(&p_ftree->p_osm->log, __osm_ftree_fabric_perform_ranking);
 
+   /* Init the bfs list - the list of the switches that will be
+      initially filled with the leaf switches */
+   cl_list_init(&ranking_bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
+
    /* Mark REVERSED rank of all the switches in the subnet. 
       Start from switches that are connected to hca's, and 
       scan all the switches in the subnet. */
@@ -2809,7 +2817,7 @@ __osm_ftree_fabric_perform_ranking(
    {
       p_hca = p_next_hca;
       p_next_hca = (ftree_hca_t *)cl_qmap_next(&p_hca->map_item );
-      if (__osm_ftree_rank_switches_from_hca(p_ftree,p_hca) != 0)
+      if (__osm_ftree_rank_leaf_switches(p_ftree, p_hca, &ranking_bfs_list) != 0)
       {
          res = -1;
          osm_log(&p_ftree->p_osm->log, OSM_LOG_ERROR,
@@ -2819,7 +2827,14 @@ __osm_ftree_fabric_perform_ranking(
       }
    }
 
-   /* calculate and set FatTree rank */
+   /* Now rank rest of the switches in the fabric, while the
+      list already contains all the ranked leaf switches */
+   __osm_ftree_rank_switches_from_leafs(p_ftree, &ranking_bfs_list);
+   cl_list_destroy(&ranking_bfs_list);
+   
+   /* REVERSED ranking of all the switches completed.
+      Calculate and set FatTree rank */
+
    __osm_ftree_fabric_calculate_rank(p_ftree);
    osm_log(&p_ftree->p_osm->log, OSM_LOG_INFO,
            "__osm_ftree_fabric_perform_ranking: "
-- 
1.4.4.1.GIT




More information about the general mailing list