[openib-general] [PATCH] osm: Adding FatTree routing engine [2/2]

Yevgeny Kliteynik kliteyn at dev.mellanox.co.il
Thu Dec 14 15:27:59 PST 2006


Hi Hal

This patch (2/2) adds Fat Tree routing engine to OpenSM.

--
Yevgeny

Signed-off-by: Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il>
---
 osm/opensm/osm_ucast_ftree.c | 2936 ++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 2936 insertions(+), 0 deletions(-)

diff --git a/osm/opensm/osm_ucast_ftree.c b/osm/opensm/osm_ucast_ftree.c
new file mode 100644
index 0000000..15e4cd0
--- /dev/null
+++ b/osm/opensm/osm_ucast_ftree.c
@@ -0,0 +1,2936 @@
+/*
+ * Copyright (c) 2004-2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/*
+ * Abstract:
+ *    Implementation of OpenSM FatTree routing
+ *
+ * Environment:
+ *    Linux User Mode
+ *
+ */
+
+#if HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <iba/ib_types.h>
+#include <complib/cl_qmap.h>
+#include <complib/cl_pool.h>
+#include <complib/cl_debug.h>
+#include <opensm/osm_opensm.h>
+#include <opensm/osm_switch.h>
+
+/* This var is predefined and initialized */
+extern osm_opensm_t osm;
+
+/*
+ * FatTree rank is bounded between 2 and 8:
+ *  - Tree of rank 1 has only trivial routing pathes,
+ *    so no need to use FatTree routing.
+ *  - Why maximum rank is 8:
+ *    Each node (switch) is assigned a unique tuple.
+ *    Switches are stored in two cl_qmaps - one is 
+ *    ordered by guid, and the other by a key that is
+ *    generated from tuple. Since cl_qmap supports only
+ *    a 64-bit key, the maximal tuple lenght is 8 bytes.
+ *    which means that maximal tree rank is 8.
+ * Note that the above also implies that each switch 
+ * can have at max 255 up/down ports.
+ */
+
+#define FAT_TREE_MIN_RANK 2
+#define FAT_TREE_MAX_RANK 8
+
+typedef enum {
+   FTREE_DIRECTION_DOWN = -1,
+   FTREE_DIRECTION_SAME,
+   FTREE_DIRECTION_UP
+} ftree_direction_t;
+
+
+/***************************************************
+ **
+ **  Forward references
+ **
+ ***************************************************/
+
+struct ftree_sw_t_;
+struct ftree_hca_t_;
+struct ftree_port_t_;
+struct ftree_port_group_t_;
+struct ftree_fabric_t_;
+
+/***************************************************
+ **
+ **  ftree_tuple_t definition
+ **
+ ***************************************************/
+
+#define FTREE_TUPLE_BUFF_LEN 1024
+#define FTREE_TUPLE_LEN 8
+
+typedef uint8_t ftree_tuple_t[FTREE_TUPLE_LEN];
+typedef uint64_t ftree_tuple_key_t;
+
+/***************************************************
+ **
+ **  ftree_sw_table_element_t definition
+ **
+ ***************************************************/
+
+typedef struct {
+   cl_map_item_t map_item;
+   struct ftree_sw_t_ * p_sw;
+} ftree_sw_tbl_element_t;
+
+/***************************************************
+ **
+ **  ftree_fwd_tbl_t definition
+ **
+ ***************************************************/
+
+typedef uint8_t * ftree_fwd_tbl_t;
+#define FTREE_FWD_TBL_LEN (IB_LID_UCAST_END_HO + 1)
+
+/***************************************************
+ **
+ **  ftree_port_t definition
+ **
+ ***************************************************/
+
+typedef struct ftree_port_t_ 
+{
+   cl_map_item_t  map_item;
+   uint16_t       port_num;           /* port number on the current node */
+   uint16_t       remote_port_num;    /* port number on the remote node */
+   uint32_t       counter_up;         /* number of allocated routs upwards */
+   uint32_t       counter_down;       /* number of allocated routs downwards */
+} ftree_port_t;
+
+/***************************************************
+ **
+ **  ftree_port_group_t definition
+ **
+ ***************************************************/
+
+typedef struct ftree_port_group_t_
+{
+   cl_map_item_t  map_item;
+   ib_net16_t     base_lid;           /* base lid of the current node */
+   uint8_t        lmc;                /* LMC of the current node */
+   ib_net16_t     remote_base_lid;    /* base lid of the remote node */
+   uint8_t        remote_lmc;         /* LMC of the remote node */
+   ib_net64_t     port_guid;          /* port guid of this port */
+   ib_net64_t     remote_port_guid;   /* port guid of the remote port */
+   ib_net64_t     remote_node_guid;   /* node guid of the remote node */
+   uint8_t        remote_node_type;   /* IB_NODE_TYPE_{CA,SWITCH,ROUTER,...} */
+   union remote_hca_or_sw_
+   {
+      struct ftree_hca_t_ * remote_hca;
+      struct ftree_sw_t_  * remote_sw;
+   } remote_hca_or_sw;                /* pointer to remote hca/switch */
+   cl_ptr_vector_t ports;             /* vector of ports to the same lid */
+} ftree_port_group_t;
+
+/***************************************************
+ **
+ **  ftree_sw_t definition
+ **
+ ***************************************************/
+
+typedef struct ftree_sw_t_ 
+{
+   cl_map_item_t          map_item;
+   osm_switch_t         * p_osm_sw;
+   uint8_t                rank;
+   ftree_tuple_t          tuple;
+   ib_net16_t             base_lid;
+   uint8_t                lmc;
+   ftree_port_group_t  ** down_port_groups;
+   uint16_t               down_port_groups_num;
+   ftree_port_group_t  ** up_port_groups;
+   uint16_t               up_port_groups_num;
+   ftree_fwd_tbl_t        lft_buf;
+} ftree_sw_t;
+
+/***************************************************
+ **
+ **  ftree_hca_t definition
+ **
+ ***************************************************/
+
+typedef struct ftree_hca_t_ {
+   cl_map_item_t          map_item;
+   osm_node_t           * p_osm_node;
+   ftree_port_group_t  ** up_port_groups;
+   uint16_t               up_port_groups_num;
+} ftree_hca_t;
+
+/***************************************************
+ **
+ **  ftree_fabric_t definition
+ **
+ ***************************************************/
+
+typedef struct ftree_fabric_t_ 
+{
+   cl_qmap_t     hca_tbl;
+   cl_qmap_t     sw_tbl;
+   cl_qmap_t     sw_by_tuple_tbl;
+   uint32_t      tree_rank;
+   ftree_sw_t ** leaf_switches;
+   uint32_t      leaf_switches_num;
+   uint16_t      max_hcas_per_leaf;
+   cl_pool_t     sw_fwd_tbl_pool;
+} ftree_fabric_t;
+
+/***************************************************
+ **
+ ** comparators
+ **
+ ***************************************************/
+
+int
+__osm_ftree_compare_switches_by_index(
+   IN  const void * p1, 
+   IN  const void * p2)
+{
+   ftree_sw_t ** pp_sw1 = (ftree_sw_t **)p1; 
+   ftree_sw_t ** pp_sw2 = (ftree_sw_t **)p2; 
+
+   uint16_t i;
+   for (i = 0; i < FTREE_TUPLE_LEN; i++)
+   {
+      if ((*pp_sw1)->tuple[i] > (*pp_sw2)->tuple[i])
+         return 1;
+      if ((*pp_sw1)->tuple[i] < (*pp_sw2)->tuple[i])
+         return -1;
+   }
+   return 0;
+}
+
+/***************************************************/
+
+int
+__osm_ftree_compare_port_groups_by_remote_switch_index(
+   IN  const void * p1, 
+   IN  const void * p2)
+{
+   ftree_port_group_t ** pp_g1 = (ftree_port_group_t **)p1; 
+   ftree_port_group_t ** pp_g2 = (ftree_port_group_t **)p2; 
+
+   return __osm_ftree_compare_switches_by_index( 
+                  &((*pp_g1)->remote_hca_or_sw.remote_sw),
+                  &((*pp_g2)->remote_hca_or_sw.remote_sw) );
+}
+
+/***************************************************/
+
+boolean_t
+__osm_ftree_sw_less_by_index(
+   IN  ftree_sw_t * p_sw1,
+   IN  ftree_sw_t * p_sw2)
+{
+   if (__osm_ftree_compare_switches_by_index((void *)&p_sw1,
+                                             (void *)&p_sw2) < 0)
+      return TRUE;
+   return FALSE;
+}
+
+/***************************************************/
+
+boolean_t
+__osm_ftree_sw_greater_by_index(
+   IN  ftree_sw_t * p_sw1,
+   IN  ftree_sw_t * p_sw2)
+{
+   if (__osm_ftree_compare_switches_by_index((void *)&p_sw1,
+                                             (void *)&p_sw2) > 0)
+      return TRUE;
+   return FALSE;
+}
+
+/***************************************************
+ **
+ ** ftree_tuple_t functions
+ **
+ ***************************************************/
+
+static void 
+__osm_ftree_tuple_init(
+   IN  ftree_tuple_t tuple)
+{
+   memset(tuple, 0xFF, FTREE_TUPLE_LEN);
+}
+
+/***************************************************/
+
+static inline boolean_t
+__osm_ftree_tuple_assigned(
+   IN  ftree_tuple_t tuple)
+{
+   return (tuple[0] != 0xFF);
+}
+
+/***************************************************/
+
+#define FTREE_TUPLE_BUFFERS_NUM 6
+
+static char * 
+__osm_ftree_tuple_to_str(
+   IN  ftree_tuple_t tuple)
+{
+   static char buffer[FTREE_TUPLE_BUFFERS_NUM][FTREE_TUPLE_BUFF_LEN];
+   static uint8_t ind = 0;
+   char * ret_buffer;
+   uint32_t i;
+
+   if (!__osm_ftree_tuple_assigned(tuple))
+      return "INDEX.NOT.ASSIGNED";
+
+   buffer[ind][0] = '\0';
+
+   for(i = 0; (i < FTREE_TUPLE_LEN) && (tuple[i] != 0xFF); i++)
+   {
+      if ((strlen(buffer[ind]) + 10) > FTREE_TUPLE_BUFF_LEN)
+         return "INDEX.TOO.LONG";
+      if (i != 0)
+         strcat(buffer[ind],".");
+      sprintf(&buffer[ind][strlen(buffer[ind])], "%u", tuple[i]);
+   }
+
+   ret_buffer = buffer[ind];
+   ind = (ind + 1) % FTREE_TUPLE_BUFFERS_NUM;
+   return ret_buffer;
+} /* __osm_ftree_tuple_to_str() */
+
+/***************************************************/
+
+static inline ftree_tuple_key_t 
+__osm_ftree_tuple_to_key(
+   IN  ftree_tuple_t tuple)
+{
+   ftree_tuple_key_t key;
+   memcpy(&key, tuple, FTREE_TUPLE_LEN);
+   return key;
+}
+
+/***************************************************/
+
+static inline void 
+__osm_ftree_tuple_from_key(
+   IN  ftree_tuple_t tuple, 
+   IN  ftree_tuple_key_t key)
+{
+   memcpy(tuple, &key, FTREE_TUPLE_LEN);
+}
+
+/***************************************************
+ **
+ ** ftree_sw_tbl_element_t functions
+ **
+ ***************************************************/
+
+static ftree_sw_tbl_element_t *
+__osm_ftree_sw_tbl_element_create(
+   IN  ftree_sw_t * p_sw)
+{
+   ftree_sw_tbl_element_t * p_element = 
+      (ftree_sw_tbl_element_t *) malloc(sizeof(ftree_sw_tbl_element_t));
+   if (!p_element)
+       return NULL;
+   memset(p_element, 0,sizeof(ftree_sw_tbl_element_t));
+
+   if (p_element)
+      p_element->p_sw = p_sw;
+   return p_element;
+}
+
+/***************************************************/
+
+static void
+__osm_ftree_sw_tbl_element_destroy(
+   IN  ftree_sw_tbl_element_t * p_element)
+{
+   if (!p_element)
+      return;
+   free(p_element);
+}
+
+/***************************************************
+ **
+ ** ftree_port_t functions
+ **
+ ***************************************************/
+
+static ftree_port_t * 
+__osm_ftree_port_create( 
+   IN  uint16_t port_num,
+   IN  uint16_t remote_port_num)
+{
+   ftree_port_t * p_port = (ftree_port_t *)malloc(sizeof(ftree_port_t));
+   if (!p_port)
+      return NULL;
+   memset(p_port,0,sizeof(ftree_port_t));
+
+   p_port->port_num = port_num;
+   p_port->remote_port_num = remote_port_num;
+
+   return p_port;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_port_destroy(
+   IN  ftree_port_t * p_port)
+{
+   if(p_port)
+      free(p_port);
+}
+
+/***************************************************
+ **
+ ** ftree_port_group_t functions
+ **
+ ***************************************************/
+
+static ftree_port_group_t * 
+__osm_ftree_port_group_create( 
+   IN  ib_net16_t    base_lid,
+   IN  uint8_t       lmc,
+   IN  ib_net16_t    remote_base_lid,
+   IN  uint8_t       remote_lmc,
+   IN  ib_net64_t  * p_port_guid,
+   IN  ib_net64_t  * p_remote_port_guid,
+   IN  ib_net64_t  * p_remote_node_guid,
+   IN  uint8_t       remote_node_type,
+   IN  void        * p_remote_hca_or_sw)
+{
+   ftree_port_group_t * p_group = 
+            (ftree_port_group_t *)malloc(sizeof(ftree_port_group_t));
+   if (p_group == NULL) 
+      return NULL;
+   memset(p_group, 0, sizeof(ftree_port_group_t));
+
+   p_group->base_lid = base_lid;
+   p_group->lmc = lmc;
+   p_group->remote_base_lid = remote_base_lid;
+   p_group->remote_lmc = remote_lmc;
+   memcpy(&p_group->port_guid, p_port_guid, sizeof(ib_net64_t));
+   memcpy(&p_group->remote_port_guid, p_remote_port_guid, sizeof(ib_net64_t));
+   memcpy(&p_group->remote_node_guid, p_remote_node_guid, sizeof(ib_net64_t));
+
+   p_group->remote_node_type = remote_node_type;
+   switch (remote_node_type)
+   {
+      case IB_NODE_TYPE_CA:
+         p_group->remote_hca_or_sw.remote_hca = (ftree_hca_t *)p_remote_hca_or_sw;
+         break;
+      case IB_NODE_TYPE_SWITCH:
+         p_group->remote_hca_or_sw.remote_sw = (ftree_sw_t *)p_remote_hca_or_sw;
+         break;
+      default:
+         /* we shouldn't get here - port is created only in hca or switch */
+         CL_ASSERT(0);
+   }
+
+   cl_ptr_vector_init(&p_group->ports,
+                      0,  /* min size */
+                      8); /* grow size */
+   return p_group;
+} /* __osm_ftree_port_group_create() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_port_group_destroy(
+   IN  ftree_port_group_t * p_group)
+{
+   uint32_t i;
+   uint32_t size;
+   ftree_port_t * p_port;
+
+   if (!p_group)
+      return;
+
+   /* remove all the elements of p_group->ports vector */
+   size = cl_ptr_vector_get_size(&p_group->ports);
+   for (i = 0; i < size; i++)
+   {
+      cl_ptr_vector_at(&p_group->ports, i, (void **)&p_port);
+      __osm_ftree_port_destroy(p_port);
+   }
+   cl_ptr_vector_destroy(&p_group->ports);
+   free(p_group);
+} /* __osm_ftree_port_group_destroy() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_port_group_dump(
+   IN  ftree_port_group_t * p_group,
+   IN  ftree_direction_t direction)
+{
+   ftree_port_t * p_port;
+   uint32_t size;
+   uint32_t i;
+   char buff[10*1024];
+
+   if (!p_group)
+      return;
+
+   if (!osm_log_is_active(&osm.log,OSM_LOG_DEBUG))
+      return;
+
+   size = cl_ptr_vector_get_size(&p_group->ports);
+   buff[0] = '\0';
+
+   for (i = 0; i < size; i++)
+   {
+      cl_ptr_vector_at(&p_group->ports, i, (void **)&p_port);
+      CL_ASSERT(p_port);
+
+      if (i != 0)
+         strcat(buff,", ");
+      sprintf(buff + strlen(buff), "%u", p_port->port_num);
+   }
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,
+           "__osm_ftree_port_group_dump:"
+           "    Port Group of size %u, port(s): %s, direction: %s\n" 
+           "                  Local <--> Remote GUID (LID):"
+           "0x%016" PRIx64 " (0x%x) <--> 0x%016" PRIx64 " (0x%x)\n", 
+           size,
+           buff,
+           (direction == FTREE_DIRECTION_DOWN)? "DOWN" : "UP",
+           cl_ntoh64(p_group->port_guid),
+           cl_ntoh16(p_group->base_lid),
+           cl_ntoh64(p_group->remote_port_guid),
+           cl_ntoh16(p_group->remote_base_lid));
+
+} /* __osm_ftree_port_group_dump() */
+
+/***************************************************/
+
+static void
+__osm_ftree_port_group_add_port(
+   IN  ftree_port_group_t * p_group,
+   IN  uint16_t             port_num,
+   IN  uint16_t             remote_port_num)
+{
+   uint16_t i;
+   ftree_port_t * p_port;
+
+   for (i = 0; i < cl_ptr_vector_get_size(&p_group->ports); i++)
+   {
+      cl_ptr_vector_at(&p_group->ports, i, (void **)&p_port);
+      if (p_port->port_num == port_num)
+         return;
+   }
+
+   p_port = __osm_ftree_port_create(port_num,remote_port_num);
+   cl_ptr_vector_insert(&p_group->ports, p_port, NULL);
+}
+
+/***************************************************
+ **
+ ** ftree_sw_t functions
+ **
+ ***************************************************/
+
+static ftree_sw_t * 
+__osm_ftree_sw_create(
+   IN  ftree_fabric_t * p_ftree,
+   IN  osm_switch_t   * p_osm_sw)
+{
+   ftree_sw_t * p_sw;
+   uint8_t ports_num;
+
+   /* make sure that the switch has ports */
+   if (osm_switch_get_num_ports(p_osm_sw) == 1)
+      return NULL;
+
+   p_sw = (ftree_sw_t *)malloc(sizeof(ftree_sw_t));
+   if (p_sw == NULL) 
+      return NULL;
+   memset(p_sw, 0, sizeof(ftree_sw_t));
+
+   p_sw->p_osm_sw = p_osm_sw;
+   p_sw->rank = 0xFF;
+   __osm_ftree_tuple_init(p_sw->tuple);
+
+   p_sw->base_lid = osm_node_get_base_lid(osm_switch_get_node_ptr(p_sw->p_osm_sw),0);
+
+   ports_num = osm_node_get_num_physp(osm_switch_get_node_ptr(p_sw->p_osm_sw));
+   p_sw->down_port_groups = 
+      (ftree_port_group_t **) malloc(ports_num * sizeof(ftree_port_group_t *));
+   p_sw->up_port_groups = 
+      (ftree_port_group_t **) malloc(ports_num * sizeof(ftree_port_group_t *));
+   if (!p_sw->down_port_groups || !p_sw->up_port_groups)
+      return NULL;
+   p_sw->down_port_groups_num = 0;
+   p_sw->up_port_groups_num = 0;
+
+   /* initialize lft buffer */
+   p_sw->lft_buf = (ftree_fwd_tbl_t)cl_pool_get(&p_ftree->sw_fwd_tbl_pool);
+   memset(p_sw->lft_buf, OSM_NO_PATH, FTREE_FWD_TBL_LEN);
+
+   return p_sw;
+} /* __osm_ftree_sw_create() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_sw_destroy(
+   IN  ftree_fabric_t * p_ftree,
+   IN  ftree_sw_t     * p_sw)
+{
+   uint8_t i;
+
+   if (!p_sw)
+      return;
+
+   for (i = 0; i < p_sw->down_port_groups_num; i++)
+      __osm_ftree_port_group_destroy(p_sw->down_port_groups[i]);
+   for (i = 0; i < p_sw->up_port_groups_num; i++)
+      __osm_ftree_port_group_destroy(p_sw->up_port_groups[i]);
+   if (p_sw->down_port_groups)
+      free(p_sw->down_port_groups);
+   if (p_sw->up_port_groups)
+      free(p_sw->up_port_groups);
+
+   /* return switch fwd_tbl to pool */
+   if (p_sw->lft_buf)
+      cl_pool_put(&p_ftree->sw_fwd_tbl_pool, (void *)p_sw->lft_buf);
+
+   free(p_sw);
+} /* __osm_ftree_sw_destroy() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_sw_dump(
+   IN  ftree_sw_t * p_sw)
+{
+   uint32_t i;
+   if (!p_sw)
+      return;
+
+   if (!osm_log_is_active(&osm.log,OSM_LOG_DEBUG))
+      return;
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,
+           "__osm_ftree_sw_dump: "
+           "Switch index: %s, GUID: 0x%016" PRIx64 ", Ports: %u DOWN, %u UP\n",
+          __osm_ftree_tuple_to_str(p_sw->tuple),
+          cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))), 
+          p_sw->down_port_groups_num, 
+          p_sw->up_port_groups_num);
+
+   for( i = 0; i < p_sw->down_port_groups_num; i++ ) 
+      __osm_ftree_port_group_dump(p_sw->down_port_groups[i], FTREE_DIRECTION_DOWN);
+   for( i = 0; i < p_sw->up_port_groups_num; i++ ) 
+      __osm_ftree_port_group_dump(p_sw->up_port_groups[i], FTREE_DIRECTION_UP);
+
+} /* __osm_ftree_sw_dump() */
+
+/***************************************************/
+
+static boolean_t
+__osm_ftree_sw_ranked(
+   IN  ftree_sw_t * p_sw)
+{
+   return (p_sw->rank != 0xFF); 
+}
+
+/***************************************************/
+
+static ftree_port_group_t *
+__osm_ftree_sw_get_port_group_by_remote_lid(
+   IN  ftree_sw_t       * p_sw,
+   IN  ib_net16_t         remote_base_lid,
+   IN  ftree_direction_t  direction)
+{
+   uint32_t i;
+   uint32_t size;
+   ftree_port_group_t ** port_groups;
+
+   if (direction == FTREE_DIRECTION_UP)
+   {
+      port_groups = p_sw->up_port_groups;
+      size = p_sw->up_port_groups_num;
+   }
+   else
+   {
+      port_groups = p_sw->down_port_groups;
+      size = p_sw->down_port_groups_num;
+   }
+
+   for (i = 0; i < size; i++)
+      if (remote_base_lid == port_groups[i]->remote_base_lid)
+         return port_groups[i];
+
+   return NULL;
+} /* __osm_ftree_sw_get_port_group_by_remote_lid() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_sw_add_port(
+   IN  ftree_sw_t       * p_sw,
+   IN  uint16_t           port_num,
+   IN  uint16_t           remote_port_num,
+   IN  ib_net16_t         base_lid,
+   IN  uint8_t            lmc,
+   IN  ib_net16_t         remote_base_lid,
+   IN  uint8_t            remote_lmc,
+   IN  ib_net64_t         port_guid,
+   IN  ib_net64_t         remote_port_guid,
+   IN  ib_net64_t         remote_node_guid,
+   IN  uint8_t            remote_node_type,
+   IN  void             * p_remote_hca_or_sw,
+   IN  ftree_direction_t  direction)
+{
+   ftree_port_group_t * p_group = 
+       __osm_ftree_sw_get_port_group_by_remote_lid(p_sw,remote_base_lid,direction);
+
+   if (!p_group)
+   {
+      p_group = __osm_ftree_port_group_create(
+                     base_lid,
+                     lmc,
+                     remote_base_lid,
+                     remote_lmc,
+                     &port_guid,
+                     &remote_port_guid,
+                     &remote_node_guid,
+                     remote_node_type,
+                     p_remote_hca_or_sw);
+      CL_ASSERT(p_group);
+
+      if (direction == FTREE_DIRECTION_UP)
+         p_sw->up_port_groups[p_sw->up_port_groups_num++] = p_group;
+      else
+         p_sw->down_port_groups[p_sw->down_port_groups_num++] = p_group;
+   }
+   __osm_ftree_port_group_add_port(p_group,port_num,remote_port_num);
+} /* __osm_ftree_sw_add_port() */
+
+/***************************************************/
+
+static void
+__osm_ftree_sw_set_fwd_table_block(
+    IN  ftree_sw_t * p_sw,
+    IN  uint16_t     lid_ho, 
+    IN  uint8_t      port_num)
+{
+   p_sw->lft_buf[lid_ho] = port_num;
+}
+
+/***************************************************/
+
+static uint8_t
+__osm_ftree_sw_get_fwd_table_block(
+    IN  ftree_sw_t * p_sw,
+    IN  uint16_t     lid_ho)
+{
+   return p_sw->lft_buf[lid_ho];
+}
+
+/***************************************************
+ **
+ ** ftree_hca_t functions
+ **
+ ***************************************************/
+
+static ftree_hca_t * 
+__osm_ftree_hca_create(
+   IN  osm_node_t * p_osm_node)
+{
+   ftree_hca_t * p_hca = (ftree_hca_t *)malloc(sizeof(ftree_hca_t));
+   if (p_hca == NULL) 
+      return NULL;
+   memset(p_hca,0,sizeof(ftree_hca_t));
+
+   p_hca->p_osm_node = p_osm_node;
+   p_hca->up_port_groups = (ftree_port_group_t **) 
+        malloc(osm_node_get_num_physp(p_hca->p_osm_node) * sizeof (ftree_port_group_t *));
+   if (!p_hca->up_port_groups)
+      return NULL;
+   p_hca->up_port_groups_num = 0;
+   return p_hca;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_hca_destroy(
+   IN  ftree_hca_t * p_hca)
+{
+   uint32_t i;
+
+   if (!p_hca)
+      return;
+
+   for (i = 0; i < p_hca->up_port_groups_num; i++)
+      __osm_ftree_port_group_destroy(p_hca->up_port_groups[i]);
+
+   if (p_hca->up_port_groups)
+      free(p_hca->up_port_groups);
+
+   free(p_hca);
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_hca_dump(
+   IN  ftree_hca_t * p_hca)
+{
+   uint32_t i;
+   if (!p_hca)
+      return;
+
+   if (!osm_log_is_active(&osm.log,OSM_LOG_DEBUG))
+      return;
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,
+           "__osm_ftree_hca_dump: "
+           "HCA GUID: 0x%016" PRIx64 ", Ports: %u UP\n",
+          cl_ntoh64(osm_node_get_node_guid(p_hca->p_osm_node)), 
+          p_hca->up_port_groups_num);
+
+   for( i = 0; i < p_hca->up_port_groups_num; i++ ) 
+      __osm_ftree_port_group_dump(p_hca->up_port_groups[i],FTREE_DIRECTION_UP);
+}
+
+/***************************************************/
+
+static ftree_port_group_t *
+__osm_ftree_hca_get_port_group_by_remote_lid(
+   IN  ftree_hca_t * p_hca,
+   IN  ib_net16_t    remote_base_lid)
+{
+   uint32_t i;
+   for (i = 0; i < p_hca->up_port_groups_num; i++)
+      if (remote_base_lid == p_hca->up_port_groups[i]->remote_base_lid)
+         return p_hca->up_port_groups[i];
+
+   return NULL;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_hca_add_port(
+   IN  ftree_hca_t * p_hca,
+   IN  uint16_t      port_num,
+   IN  uint16_t      remote_port_num,
+   IN  ib_net16_t    base_lid,
+   IN  uint8_t       lmc,
+   IN  ib_net16_t    remote_base_lid,
+   IN  uint8_t       remote_lmc,
+   IN  ib_net64_t    port_guid,
+   IN  ib_net64_t    remote_port_guid,
+   IN  ib_net64_t    remote_node_guid,
+   IN  uint8_t       remote_node_type,
+   IN  void        * p_remote_hca_or_sw)
+{
+   ftree_port_group_t * p_group;
+
+   /* this function is supposed to be called only for adding ports
+      in hca's that lead to switches */ 
+   CL_ASSERT(remote_node_type == IB_NODE_TYPE_SWITCH);
+
+   p_group = __osm_ftree_hca_get_port_group_by_remote_lid(p_hca,remote_base_lid);
+
+   if (!p_group)
+   {
+      p_group = __osm_ftree_port_group_create(
+                     base_lid,
+                     lmc,
+                     remote_base_lid,
+                     remote_lmc,
+                     &port_guid,
+                     &remote_port_guid,
+                     &remote_node_guid,
+                     remote_node_type,
+                     p_remote_hca_or_sw);
+      p_hca->up_port_groups[p_hca->up_port_groups_num++] = p_group;
+   }
+   __osm_ftree_port_group_add_port(p_group, port_num, remote_port_num);
+
+} /* __osm_ftree_hca_add_port() */
+
+/***************************************************
+ **
+ ** ftree_fabric_t functions
+ **
+ ***************************************************/
+
+static ftree_fabric_t * 
+__osm_ftree_fabric_create()
+{
+   cl_status_t status;
+   ftree_fabric_t * p_ftree = (ftree_fabric_t *)malloc(sizeof(ftree_fabric_t));
+   if (p_ftree == NULL) 
+      return NULL;
+
+   memset(p_ftree,0,sizeof(ftree_fabric_t));
+
+   cl_qmap_init(&p_ftree->hca_tbl);
+   cl_qmap_init(&p_ftree->sw_tbl);
+   cl_qmap_init(&p_ftree->sw_by_tuple_tbl);
+
+   status = cl_pool_init( &p_ftree->sw_fwd_tbl_pool,
+                          8,                 /* min pool size */
+                          0,                 /* max pool size - unlimited */
+                          8,                 /* grow size */
+                          FTREE_FWD_TBL_LEN, /* object_size */
+                          NULL,              /* object initializer */
+                          NULL,              /* object destructor */
+                          NULL );            /* context */
+   if (status != CL_SUCCESS)
+      return NULL;
+
+   p_ftree->tree_rank = 1;
+   return p_ftree;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_clear(ftree_fabric_t * p_ftree)
+{
+   ftree_hca_t * p_hca;
+   ftree_hca_t * p_next_hca;
+   ftree_sw_t * p_sw;
+   ftree_sw_t * p_next_sw;
+   ftree_sw_tbl_element_t * p_element;
+   ftree_sw_tbl_element_t * p_next_element;
+
+   if (!p_ftree)
+      return;
+
+   /* remove all the elements of hca_tbl */
+
+   p_next_hca = (ftree_hca_t *)cl_qmap_head(&p_ftree->hca_tbl);
+   while( p_next_hca != (ftree_hca_t *)cl_qmap_end( &p_ftree->hca_tbl ) )
+   {
+      p_hca = p_next_hca;
+      p_next_hca = (ftree_hca_t *)cl_qmap_next(&p_hca->map_item );
+      __osm_ftree_hca_destroy(p_hca);
+   }
+   cl_qmap_remove_all(&p_ftree->hca_tbl);
+
+   /* remove all the elements of sw_tbl */
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( p_next_sw != (ftree_sw_t *)cl_qmap_end( &p_ftree->sw_tbl ) )
+   {
+      p_sw = p_next_sw;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+      __osm_ftree_sw_destroy(p_ftree,p_sw);
+   }
+   cl_qmap_remove_all(&p_ftree->sw_tbl);
+
+   /* remove all the elements of sw_by_tuple_tbl */
+
+   p_next_element = 
+      (ftree_sw_tbl_element_t *)cl_qmap_head(&p_ftree->sw_by_tuple_tbl);
+   while( p_next_element != 
+          (ftree_sw_tbl_element_t *)cl_qmap_end( &p_ftree->sw_by_tuple_tbl ) )
+   {
+      p_element = p_next_element;
+      p_next_element = 
+         (ftree_sw_tbl_element_t *)cl_qmap_next(&p_element->map_item);
+      __osm_ftree_sw_tbl_element_destroy(p_element);
+   }
+   cl_qmap_remove_all(&p_ftree->sw_by_tuple_tbl);
+
+   /* free the leaf switches array */
+   if ((p_ftree->leaf_switches_num > 0) && (p_ftree->leaf_switches))
+      free(p_ftree->leaf_switches);
+
+   p_ftree->leaf_switches_num = 0;
+   p_ftree->leaf_switches = NULL;
+
+} /* __osm_ftree_fabric_destroy() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_destroy(ftree_fabric_t * p_ftree)
+{
+   if (!p_ftree)
+      return;
+   __osm_ftree_fabric_clear(p_ftree);
+   cl_pool_destroy(&p_ftree->sw_fwd_tbl_pool);
+   free(p_ftree);
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_set_rank(ftree_fabric_t * p_ftree, uint16_t rank)
+{
+   if (rank > p_ftree->tree_rank)
+      p_ftree->tree_rank = rank;
+}
+
+/***************************************************/
+
+static uint16_t 
+__osm_ftree_fabric_get_rank(ftree_fabric_t * p_ftree)
+{
+   return p_ftree->tree_rank;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_add_hca(ftree_fabric_t * p_ftree, osm_node_t * p_osm_node)
+{
+   ftree_hca_t * p_hca = __osm_ftree_hca_create(p_osm_node);
+
+   CL_ASSERT(osm_node_get_type(p_osm_node) == IB_NODE_TYPE_CA);
+
+   cl_qmap_insert(&p_ftree->hca_tbl,
+                  p_osm_node->node_info.node_guid,
+                  &p_hca->map_item);
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_add_sw(ftree_fabric_t * p_ftree, osm_switch_t * p_osm_sw)
+{
+   ftree_sw_t * p_sw = __osm_ftree_sw_create(p_ftree,p_osm_sw);
+
+   CL_ASSERT(osm_node_get_type(p_osm_sw->p_node) == IB_NODE_TYPE_SWITCH);
+
+   cl_qmap_insert(&p_ftree->sw_tbl,
+                  p_osm_sw->p_node->node_info.node_guid,
+                  &p_sw->map_item);
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_add_sw_by_tuple(
+   IN  ftree_fabric_t * p_ftree, 
+   IN  ftree_sw_t * p_sw)
+{
+   CL_ASSERT(__osm_ftree_tuple_assigned(p_sw->tuple));
+
+   cl_qmap_insert(&p_ftree->sw_by_tuple_tbl,
+                  __osm_ftree_tuple_to_key(p_sw->tuple),
+                  &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
+}
+
+/***************************************************/
+
+static ftree_sw_t * 
+__osm_ftree_fabric_get_sw_by_tuple(
+   IN  ftree_fabric_t * p_ftree, 
+   IN  ftree_tuple_t tuple)
+{
+   ftree_sw_tbl_element_t * p_element;
+
+   CL_ASSERT(__osm_ftree_tuple_assigned(tuple));
+
+   __osm_ftree_tuple_to_key(tuple);
+
+   p_element = (ftree_sw_tbl_element_t * )cl_qmap_get(&p_ftree->sw_by_tuple_tbl,
+                                                      __osm_ftree_tuple_to_key(tuple));
+   if (p_element == (ftree_sw_tbl_element_t * )cl_qmap_end(&p_ftree->sw_by_tuple_tbl))
+      return NULL;
+
+   return p_element->p_sw;
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_dump(ftree_fabric_t * p_ftree)
+{
+   uint32_t i;
+   ftree_hca_t * p_hca;
+   ftree_sw_t * p_sw;
+
+   if (!osm_log_is_active(&osm.log,OSM_LOG_DEBUG))
+      return;
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,"__osm_ftree_fabric_dump: \n"
+           "                       |-------------------------------|\n"
+           "                       |-  Full fabric topology dump  -|\n"
+           "                       |-------------------------------|\n\n");
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,
+           "__osm_ftree_fabric_dump: -- HCAs:\n");
+
+   for ( p_hca = (ftree_hca_t *)cl_qmap_head(&p_ftree->hca_tbl);
+         p_hca != (ftree_hca_t *)cl_qmap_end(&p_ftree->hca_tbl);
+         p_hca = (ftree_hca_t *)cl_qmap_next(&p_hca->map_item) )
+   {
+      __osm_ftree_hca_dump(p_hca);
+   }
+
+   for (i = 0; i < __osm_ftree_fabric_get_rank(p_ftree); i++)
+   {
+      osm_log(&osm.log, OSM_LOG_DEBUG,
+              "__osm_ftree_fabric_dump: -- Rank %u switches\n", i);
+      for ( p_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+            p_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl);
+            p_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item) )
+      {
+         if (p_sw->rank == i)
+            __osm_ftree_sw_dump(p_sw);
+      }
+   }
+
+   osm_log(&osm.log, OSM_LOG_DEBUG,"__osm_ftree_fabric_dump: \n"
+           "                       |---------------------------------------|\n"
+           "                       |- Full fabric topology dump completed -|\n"
+           "                       |---------------------------------------|\n\n");
+} /* __osm_ftree_fabric_dump() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_dump_general_info(
+   IN  ftree_fabric_t * p_ftree)
+{
+   uint32_t i,j;
+   ftree_sw_t * p_sw;
+   char * addition_str;
+
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info:\n");
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info: "
+           "General fabric topology info\n");
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info: "
+           "============================\n");
+
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info: "
+           "  - FatTree rank (switches only): %u\n",
+          p_ftree->tree_rank);
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info: "
+           "  - Fabric has %u HCAs, %u switches\n",
+          cl_qmap_count(&p_ftree->hca_tbl),
+          cl_qmap_count(&p_ftree->sw_tbl));
+
+   for (i = 0; i < __osm_ftree_fabric_get_rank(p_ftree); i++)
+   {
+      j = 0;
+      for ( p_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+            p_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl);
+            p_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item) )
+      {
+         if (p_sw->rank == i)
+            j++;
+      }
+      if (i == 0)
+         addition_str = " (root) ";
+      else 
+         if (i == (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+            addition_str = " (leaf) ";
+         else
+            addition_str = " ";
+         osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_dump_general_info: "
+                 "  - Fabric has %u rank %u%sswitches\n",j,i,addition_str);
+   }
+
+   if (osm_log_is_active(&osm.log,OSM_LOG_VERBOSE))
+   {
+      osm_log(&osm.log, OSM_LOG_VERBOSE,
+              "__osm_ftree_fabric_dump_general_info: "
+              "  - Root switches:\n");
+      for ( p_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+            p_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl);
+            p_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item) )
+      {
+         if (p_sw->rank == 0)
+               osm_log(&osm.log, OSM_LOG_VERBOSE,
+                       "__osm_ftree_fabric_dump_general_info: "
+                       "      GUID: 0x%016" PRIx64 ", LID: 0x%x, Index %s\n",
+                       cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+                       cl_ntoh16(p_sw->base_lid),
+                       __osm_ftree_tuple_to_str(p_sw->tuple));
+      }
+
+      osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_fabric_dump_general_info: "
+              "  - Leaf switches (sorted by index):\n");
+      for (i = 0; i < p_ftree->leaf_switches_num; i++)
+      {
+            osm_log(&osm.log, OSM_LOG_VERBOSE,
+                    "__osm_ftree_fabric_dump_general_info: "
+                    "      GUID: 0x%016" PRIx64 ", LID: 0x%x, Index %s\n",
+                    cl_ntoh64(osm_node_get_node_guid(
+                                 osm_switch_get_node_ptr(p_ftree->leaf_switches[i]->p_osm_sw))),
+                    cl_ntoh16(p_ftree->leaf_switches[i]->base_lid),
+                    __osm_ftree_tuple_to_str(p_ftree->leaf_switches[i]->tuple));
+      }
+   }
+} /* __osm_ftree_fabric_dump_general_info() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_dump_hca_ordering(
+   IN  ftree_fabric_t * p_ftree)
+{  
+   ftree_hca_t        * p_hca;
+   ftree_sw_t         * p_sw;
+   ftree_port_group_t * p_group;
+   uint32_t             i;
+   uint32_t             j;
+
+   char desc[IB_NODE_DESCRIPTION_SIZE + 1];
+   char path[1024];
+   FILE * p_hca_ordering_file;
+   char * filename = "osm-ftree-ca-order.dump";
+
+   snprintf(path, sizeof(path), "%s/%s", 
+            osm.subn.opt.dump_files_dir, filename);
+   p_hca_ordering_file = fopen(path, "w");
+   if (!p_hca_ordering_file) 
+   {
+      osm_log(&osm.log, OSM_LOG_ERROR,
+              "__osm_ftree_fabric_dump_hca_ordering: ERR AB01: "
+              "cannot open file \'%s\': %s\n",
+               filename, strerror(errno));
+      OSM_LOG_EXIT(&(osm.log));
+      return;
+   }
+   
+   /* for each leaf switch (in indexing order) */
+   for(i = 0; i < p_ftree->leaf_switches_num; i++)
+   {
+      p_sw = p_ftree->leaf_switches[i];
+      /* for each real HCA connected to this switch */
+      for (j = 0; j < p_sw->down_port_groups_num; j++)
+      {
+         p_group = p_sw->down_port_groups[j];
+         p_hca = p_group->remote_hca_or_sw.remote_hca;
+         memcpy(desc,p_hca->p_osm_node->node_desc.description,IB_NODE_DESCRIPTION_SIZE);
+         desc[IB_NODE_DESCRIPTION_SIZE] = '\0';
+
+         fprintf(p_hca_ordering_file,"0x%x\t%s\n", 
+                 cl_ntoh16(p_group->remote_base_lid), desc);
+      }
+
+      /* now print dummy HCAs */
+      for (j = p_sw->down_port_groups_num; j < p_ftree->max_hcas_per_leaf; j++)
+      {
+         fprintf(p_hca_ordering_file,"0xFFFF\tDUMMY\n");
+      }
+
+   }
+   /* done going through all the leaf switches */
+
+   fclose(p_hca_ordering_file);
+} /* __osm_ftree_fabric_dump_hca_ordering() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_assign_tuple(
+   IN   ftree_fabric_t * p_ftree,
+   IN   ftree_sw_t * p_sw,
+   IN   ftree_tuple_t new_tuple)
+{
+   memcpy(p_sw->tuple, new_tuple, FTREE_TUPLE_LEN);
+   __osm_ftree_fabric_add_sw_by_tuple(p_ftree,p_sw);
+}
+
+/***************************************************/
+
+static void 
+__osm_ftree_fabric_assign_first_tuple(
+   IN   ftree_fabric_t * p_ftree,
+   IN   ftree_sw_t * p_sw)
+{
+   uint8_t i;
+   ftree_tuple_t new_tuple;
+
+   __osm_ftree_tuple_init(new_tuple);
+   new_tuple[0] = p_sw->rank;
+   for (i = 1; i <= p_sw->rank; i++)
+      new_tuple[i] = 0;
+
+   __osm_ftree_fabric_assign_tuple(p_ftree,p_sw,new_tuple);
+}
+
+/***************************************************/
+
+static void
+__osm_ftree_fabric_get_new_tuple(
+   IN   ftree_fabric_t * p_ftree,
+   OUT  ftree_tuple_t new_tuple,
+   IN   ftree_tuple_t from_tuple,
+   IN   ftree_direction_t direction)
+{
+   ftree_sw_t * p_sw;
+   ftree_tuple_t temp_tuple;
+   uint8_t var_index;
+   uint8_t i;
+
+   __osm_ftree_tuple_init(new_tuple);
+   memcpy(temp_tuple, from_tuple, FTREE_TUPLE_LEN);
+
+   if (direction == FTREE_DIRECTION_DOWN)
+   {
+      temp_tuple[0] ++;
+      var_index = from_tuple[0] + 1;
+   }
+   else
+   {
+      temp_tuple[0] --;
+      var_index = from_tuple[0];
+   }
+
+   for (i = 0; i < 0xFF; i++)
+   {
+      temp_tuple[var_index] = i;
+      p_sw = __osm_ftree_fabric_get_sw_by_tuple(p_ftree,temp_tuple);
+      if (p_sw == NULL) /* found free tuple */ 
+         break;
+   }
+
+   if (i == 0xFF)
+   {
+      /* new tuple not found - there are more than 255 ports in one direction */
+      return;
+   }
+   memcpy(new_tuple, temp_tuple, FTREE_TUPLE_LEN);
+
+} /* __osm_ftree_fabric_get_new_tuple() */
+
+/***************************************************/
+
+static void
+__osm_ftree_fabric_calculate_rank(
+   IN  ftree_fabric_t * p_ftree)
+{
+   ftree_sw_t   * p_sw;
+   ftree_sw_t   * p_next_sw;
+   uint16_t       max_rank = 0;
+
+   /* go over all the switches and find maximal switch rank */
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( p_next_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl) )
+   {
+      p_sw = p_next_sw;
+      if(p_sw->rank > max_rank)
+         max_rank = p_sw->rank;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+   }
+
+   /* set FatTree rank */
+   __osm_ftree_fabric_set_rank(p_ftree, max_rank + 1);
+}
+
+/***************************************************/
+
+static void
+__osm_ftree_fabric_make_indexing(
+   IN   ftree_fabric_t * p_ftree)
+{
+   ftree_sw_t         * p_remote_sw;
+   ftree_sw_t         * p_sw;
+   ftree_sw_t         * p_next_sw;
+   ftree_tuple_t        new_tuple;
+   uint32_t             i;
+   cl_list_t            bfs_list;
+   ftree_sw_tbl_element_t * p_sw_tbl_element;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_make_indexing);
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_fabric_make_indexing: "
+           "Starting FatTree indexing\n");
+
+   /* create array of leaf switches */
+   p_ftree->leaf_switches = (ftree_sw_t **)
+         malloc(cl_qmap_count(&p_ftree->sw_tbl) * sizeof(ftree_sw_t *));
+
+   /* Looking for a leaf switch - the one that has rank equal to (tree_rank - 1).
+      This switch will be used as a starting point for indexing algorithm. */
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( p_next_sw != (ftree_sw_t *)cl_qmap_end( &p_ftree->sw_tbl ) )
+   {
+      p_sw = p_next_sw;
+      if(p_sw->rank == (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+         break;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+   }
+
+   CL_ASSERT(p_next_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl));
+
+   /* Assign the first tuple to the switch that is used as BFS starting point.
+      The tuple will be as follows: [rank].0.0.0...
+      This fuction also adds the switch it into the switch_by_tuple table. */
+   __osm_ftree_fabric_assign_first_tuple(p_ftree,p_sw);
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_fabric_make_indexing: "
+           "Indexing starting point:\n"
+           "                                            - Switch rank  : %u\n"
+           "                                            - Switch index : %s\n"
+           "                                            - Node LID     : 0x%x\n"
+           "                                            - Node GUID    : 0x%016" PRIx64 "\n",
+           p_sw->rank,
+           __osm_ftree_tuple_to_str(p_sw->tuple),
+           cl_ntoh16(p_sw->base_lid),
+           cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))));
+
+   /* 
+    * Now run BFS and assign indexes to all switches
+    * Pseudo code of the algorithm is as follows:
+    *
+    *  * Add first switch to BFS queue
+    *  * While (BFS queue not empty)
+    *      - Pop the switch from the head of the queue
+    *      - Scan all the downward and upward ports
+    *      - For each port
+    *          + Get the remote switch
+    *          + Assign index to the remote switch
+    *          + Add remote switch to the BFS queue
+    */
+
+   cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
+   cl_list_insert_tail(&bfs_list, &__osm_ftree_sw_tbl_element_create(p_sw)->map_item);
+
+   while (!cl_is_list_empty(&bfs_list))
+   {
+      p_sw_tbl_element = (ftree_sw_tbl_element_t *)cl_list_remove_head(&bfs_list);
+      p_sw = p_sw_tbl_element->p_sw;
+      __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);
+
+      /* Discover all the nodes from ports that are pointing down */
+
+      if (p_sw->rank == (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+      {
+         /* add switch to leaf switches array */
+         p_ftree->leaf_switches[p_ftree->leaf_switches_num++] = p_sw;
+         /* update the max_hcas_per_leaf value */
+         if (p_sw->down_port_groups_num > p_ftree->max_hcas_per_leaf)
+            p_ftree->max_hcas_per_leaf = p_sw->down_port_groups_num;
+      }
+      else
+      {
+         /* This is not the leaf switch, which means that all the
+            ports that point down are taking us to another switches.
+            No need to assign indexing to HCAs */
+         for( i = 0; i < p_sw->down_port_groups_num; i++ ) 
+         {
+            p_remote_sw = p_sw->down_port_groups[i]->remote_hca_or_sw.remote_sw;
+            if (__osm_ftree_tuple_assigned(p_remote_sw->tuple))
+            {
+               /* this switch has been already indexed */
+               continue;
+            }
+            /* allocate new tuple */
+            __osm_ftree_fabric_get_new_tuple(p_ftree,
+                                             new_tuple,
+                                             p_sw->tuple,
+                                             FTREE_DIRECTION_DOWN);
+            /* Assign the new tuple to the remote switch.
+               This fuction also adds the switch into the switch_by_tuple table. */
+            __osm_ftree_fabric_assign_tuple(p_ftree,
+                                            p_remote_sw,
+                                            new_tuple);
+
+            /* add the newly discovered switch to the BFS queue */
+            cl_list_insert_tail(&bfs_list, 
+                                &__osm_ftree_sw_tbl_element_create(p_remote_sw)->map_item);
+         }
+         /* Done assigning indexes to all the remote switches 
+            that are pointed by the downgoing ports. 
+            Now sort port groups according to remote index. */
+         qsort(p_sw->down_port_groups,                      /* array */
+               p_sw->down_port_groups_num,                  /* number of elements */
+               sizeof(ftree_port_group_t *),                /* size of each element */
+               __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */
+      }
+
+      /* Done indexing switches from ports that go down.
+         Now do the same with ports that are pointing up. */
+
+      if (p_sw->rank != 0)
+      {
+         /* This is not the root switch, which means that all the ports
+            that are pointing up are taking us to another switches. */
+         for( i = 0; i < p_sw->up_port_groups_num; i++ ) 
+         {
+            p_remote_sw = p_sw->up_port_groups[i]->remote_hca_or_sw.remote_sw;
+            if (__osm_ftree_tuple_assigned(p_remote_sw->tuple))
+               continue;
+            /* allocate new tuple */
+            __osm_ftree_fabric_get_new_tuple(p_ftree,
+                                             new_tuple,
+                                             p_sw->tuple,
+                                             FTREE_DIRECTION_UP);
+            /* Assign the new tuple to the remote switch.
+               This fuction also adds the switch to the
+               switch_by_tuple table. */
+            __osm_ftree_fabric_assign_tuple(p_ftree,
+                                            p_remote_sw,
+                                            new_tuple);
+            /* add the newly discovered switch to the BFS queue */
+            cl_list_insert_tail(&bfs_list, 
+                                &__osm_ftree_sw_tbl_element_create(p_remote_sw)->map_item);
+         }
+         /* Done assigning indexes to all the remote switches 
+            that are pointed by the upgoing ports. 
+            Now sort port groups according to remote index. */
+         qsort(p_sw->up_port_groups,                        /* array */
+               p_sw->up_port_groups_num,                    /* number of elements */
+               sizeof(ftree_port_group_t *),                /* size of each element */
+               __osm_ftree_compare_port_groups_by_remote_switch_index); /* comparator */
+      }
+      /* Done assigning indexes to all the switches that are directly connected 
+         to the current switch - go to the next switch in the BFS queue */
+   }
+
+   /* sort array of leaf switches by index */
+   qsort(p_ftree->leaf_switches,     /* array */
+         p_ftree->leaf_switches_num, /* number of elements */
+         sizeof(ftree_sw_t *),       /* size of each element */
+         __osm_ftree_compare_switches_by_index); /* comparator */
+
+   OSM_LOG_EXIT(&(osm.log));
+} /* __osm_ftree_fabric_make_indexing() */
+
+/***************************************************/
+
+static boolean_t
+__osm_ftree_fabric_validate_topology(
+   IN   ftree_fabric_t * p_ftree)
+{
+   ftree_port_group_t * p_group;
+   ftree_port_group_t * p_ref_group;
+   ftree_sw_t         * p_sw;
+   ftree_sw_t         * p_next_sw;
+   ftree_sw_t        ** reference_sw_arr;
+   uint16_t             tree_rank = __osm_ftree_fabric_get_rank(p_ftree);
+   boolean_t            res = TRUE;
+   uint8_t              i;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_validate_topology);
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE, "__osm_ftree_fabric_validate_topology: "
+           "Validating fabric topology\n");
+
+   reference_sw_arr = (ftree_sw_t **)malloc(tree_rank * sizeof(ftree_sw_t *));
+   if ( reference_sw_arr == NULL )
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,"Fat-tree routing: Memory allocation failed\n");
+      return FALSE;
+   }
+   memset(reference_sw_arr, 0, tree_rank * sizeof(ftree_sw_t *));
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( res && 
+          p_next_sw != (ftree_sw_t *)cl_qmap_end( &p_ftree->sw_tbl ) )
+   {
+      p_sw = p_next_sw;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+
+      if (!reference_sw_arr[p_sw->rank])
+      {
+         /* This is the first switch in the current level that 
+            we're checking - use it as a reference */
+         reference_sw_arr[p_sw->rank] = p_sw;
+      }
+      else
+      {
+         /* compare this switch properties to the reference switch */
+
+         if ( reference_sw_arr[p_sw->rank]->up_port_groups_num != p_sw->up_port_groups_num )
+         {
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_validate_topology: "
+                    "ERR AB09: Different number of upward port groups on switches:\n"
+                    "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u groups\n"
+                    "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u groups\n",
+                    cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(reference_sw_arr[p_sw->rank]->p_osm_sw))),
+                    cl_ntoh16(reference_sw_arr[p_sw->rank]->base_lid),
+                    __osm_ftree_tuple_to_str(reference_sw_arr[p_sw->rank]->tuple),
+                    reference_sw_arr[p_sw->rank]->up_port_groups_num,
+                    cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+                    cl_ntoh16(p_sw->base_lid),
+                    __osm_ftree_tuple_to_str(p_sw->tuple),
+                    p_sw->up_port_groups_num);
+            res = FALSE;
+            break;
+         }
+
+         if ( p_sw->rank != (__osm_ftree_fabric_get_rank(p_ftree) - 1) &&
+              reference_sw_arr[p_sw->rank]->down_port_groups_num != p_sw->down_port_groups_num )
+         {
+            /* we're allowing some hca's to be missing */
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_validate_topology: "
+                    "ERR AB0A: Different number of downward port groups on switches:\n"
+                    "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u port groups\n"
+                    "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u port groups\n",
+                    cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(reference_sw_arr[p_sw->rank]->p_osm_sw))),
+                    cl_ntoh16(reference_sw_arr[p_sw->rank]->base_lid),
+                    __osm_ftree_tuple_to_str(reference_sw_arr[p_sw->rank]->tuple),
+                    reference_sw_arr[p_sw->rank]->down_port_groups_num,
+                    cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+                    cl_ntoh16(p_sw->base_lid),
+                    __osm_ftree_tuple_to_str(p_sw->tuple),
+                    p_sw->down_port_groups_num);
+            res = FALSE;
+            break;
+         }
+
+         if ( reference_sw_arr[p_sw->rank]->up_port_groups_num != 0 )
+         {
+            p_ref_group = reference_sw_arr[p_sw->rank]->up_port_groups[0];
+            for (i = 0; i < p_sw->up_port_groups_num; i++)
+            {
+                p_group = p_sw->up_port_groups[i];
+                if (cl_ptr_vector_get_size(&p_ref_group->ports) != cl_ptr_vector_get_size(&p_group->ports))
+                {
+                   osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_validate_topology: "
+                           "ERR AB0B: Different number of ports in an upward port group on switches:\n"
+                           "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u ports\n"
+                           "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u ports\n",
+                           cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(reference_sw_arr[p_sw->rank]->p_osm_sw))),
+                           cl_ntoh16(reference_sw_arr[p_sw->rank]->base_lid),
+                           __osm_ftree_tuple_to_str(reference_sw_arr[p_sw->rank]->tuple),
+                           cl_ptr_vector_get_size(&p_ref_group->ports),
+                           cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+                           cl_ntoh16(p_sw->base_lid),
+                           __osm_ftree_tuple_to_str(p_sw->tuple),
+                           cl_ptr_vector_get_size(&p_group->ports));
+                   res = FALSE;
+                   break;
+                }
+            }
+         }
+         if ( reference_sw_arr[p_sw->rank]->down_port_groups_num != 0 &&
+              p_sw->rank != (tree_rank - 1) )
+         {
+            /* we're allowing some hca's to be missing */
+            p_ref_group = reference_sw_arr[p_sw->rank]->down_port_groups[0];
+            for (i = 0; i < p_sw->down_port_groups_num; i++)
+            {
+                p_group = p_sw->down_port_groups[0];
+                if (cl_ptr_vector_get_size(&p_ref_group->ports) != cl_ptr_vector_get_size(&p_group->ports))
+                {
+                   osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_validate_topology: "
+                           "ERR AB0C: Different number of ports in an downward port group on switches:\n"
+                           "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u ports\n"
+                           "       GUID 0x%016" PRIx64 ", LID 0x%x, Index %s - %u ports\n",
+                           cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(reference_sw_arr[p_sw->rank]->p_osm_sw))),
+                           cl_ntoh16(reference_sw_arr[p_sw->rank]->base_lid),
+                           __osm_ftree_tuple_to_str(reference_sw_arr[p_sw->rank]->tuple),
+                           cl_ptr_vector_get_size(&p_ref_group->ports),
+                           cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+                           cl_ntoh16(p_sw->base_lid),
+                           __osm_ftree_tuple_to_str(p_sw->tuple),
+                           cl_ptr_vector_get_size(&p_group->ports));
+                   res = FALSE;
+                   break;
+                }
+            }
+         }
+      } /* end of else */
+   } /* end of while */
+
+   if (res == TRUE)
+      osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_fabric_validate_topology: "
+                    "Fabric topology has been identified as FatTree\n");
+   else
+      osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_validate_topology: "
+                    "ERR AB0D: Fabric topology hasn't been identified as FatTree\n");
+
+   free(reference_sw_arr);
+   OSM_LOG_EXIT(&(osm.log));
+   return res;
+} /* __osm_ftree_fabric_validate_topology() */
+
+/***************************************************
+ ***************************************************/
+
+static void
+__osm_ftree_set_sw_fwd_table(
+   IN  cl_map_item_t* const p_map_item, 
+   IN  void *context)
+{
+   ftree_sw_t * p_sw = (ftree_sw_t * const) p_map_item;
+   memcpy(osm.sm.ucast_mgr.lft_buf, p_sw->lft_buf, FTREE_FWD_TBL_LEN);
+   osm_ucast_mgr_set_fwd_table(&osm.sm.ucast_mgr,p_sw->p_osm_sw);
+}
+
+/***************************************************
+ ***************************************************/
+
+/*  
+ * Function: assign-up-going-port-by-descending-down
+ * Given   : a switch and a LID
+ * Pseudo code: 
+ *    foreach down-going-port-group (in indexing order)
+ *        skip this group if the LFT(LID) port is part of this group
+ *        find the least loaded port of the group (scan in indexing order)
+ *        r-port is the remote port connected to it
+ *        assign the remote switch node LFT(LID) to r-port
+ *        increase r-port usage counter
+ *        assign-up-going-port-by-descending-down to r-port node (recursion)
+ */
+
+static void
+__osm_ftree_fabric_route_upgoing_by_going_down(
+   IN  ftree_fabric_t * p_ftree,
+   IN  ftree_sw_t     * p_sw,
+   IN  ftree_sw_t     * p_prev_sw,
+   IN  ib_net16_t       target_lid,
+   IN  boolean_t        is_real_lid,
+   IN  boolean_t        is_main_path)
+{
+   ftree_sw_t          * p_remote_sw;
+   uint16_t              ports_num;
+   ftree_port_group_t  * p_group;
+   ftree_port_t        * p_port;
+   ftree_port_t        * p_min_port;
+   uint16_t              i;
+   uint16_t              j;
+
+   /* we shouldn't enter here if both real_lid and main_path are false */
+   CL_ASSERT(is_real_lid || is_main_path);
+
+   /* can't be here for leaf switch, */
+   CL_ASSERT(p_sw->rank != (__osm_ftree_fabric_get_rank(p_ftree) - 1));
+
+   /* if there is no down-going ports */
+   if (p_sw->down_port_groups_num == 0) 
+       return;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_route_upgoing_by_going_down);
+
+   /* foreach down-going port group (in indexing order) */
+   for (i = 0; i < p_sw->down_port_groups_num; i++)
+   {
+      p_group = p_sw->down_port_groups[i];
+
+      if ( p_prev_sw && (p_group->remote_base_lid == p_prev_sw->base_lid) ) 
+      {
+         /* This port group has a port that was used when we entered this switch,
+            which means that the current group points to the switch where we were
+            at the previous step of the algorithm (before going up).
+            Skipping this group. */
+            continue;
+      }
+
+      /* find the least loaded port of the group (in indexing order) */
+      p_min_port = NULL;
+      ports_num = cl_ptr_vector_get_size(&p_group->ports);
+      /* ToDo: no need to select a least loaded port for non-main path.
+         Think about optimization. */
+      for (j = 0; j < ports_num; j++) 
+      {
+          cl_ptr_vector_at(&p_group->ports, j, (void **)&p_port);
+          if (!p_min_port)
+          {
+             /* first port that we're checking - set as port with the lowest load */
+             p_min_port = p_port;
+          }
+          else if (p_port->counter_up < p_min_port->counter_up)
+          {
+             /* this port is less loaded - use it as min */
+             p_min_port = p_port;
+          }
+      }
+      /* At this point we have selected a port in this group with the 
+         lowest load of upgoing routes.
+         Set on the remote switch how to get to the target_lid -
+         set LFT(target_lid) on the remote switch to the remote port */
+      p_remote_sw = p_group->remote_hca_or_sw.remote_sw;
+
+      /* Four possible cases:
+       *
+       *  1. is_real_lid == TRUE && is_main_path == TRUE: 
+       *      - going DOWN(TRUE,TRUE) through ALL the groups
+       *         + promoting port counter
+       *         + setting path in remote switch fwd tbl
+       *      
+       *  2. is_real_lid == TRUE && is_main_path == FALSE: 
+       *      - going DOWN(TRUE,FALSE) through ALL the groups but only if
+       *        the remote (upper) switch hasn't been already configured 
+       *        for this target LID
+       *         + NOT promoting port counter
+       *         + setting path in remote switch fwd tbl if it hasn't been set yet
+       *
+       *  3. is_real_lid == FALSE && is_main_path == TRUE: 
+       *      - going DOWN(FALSE,TRUE) through ALL the groups
+       *         + promoting port counter
+       *         + NOT setting path in remote switch fwd tbl
+       *
+       *  4. is_real_lid == FALSE && is_main_path == FALSE: 
+       *      - illegal state - we shouldn't get here
+       */
+
+      /* second case: skip the port group if the remote (upper)
+         switch has been already configured for this target LID */
+      if ( is_real_lid && !is_main_path &&
+           __osm_ftree_sw_get_fwd_table_block(p_remote_sw,
+                                              cl_ntoh16(target_lid)) != OSM_NO_PATH )
+            continue;
+
+      /* setting fwd tbl port only if this is real LID */
+      if (is_real_lid)
+      {
+         __osm_ftree_sw_set_fwd_table_block(p_remote_sw,
+                                            cl_ntoh16(target_lid),
+                                            p_min_port->remote_port_num);
+         osm_log(&osm.log, OSM_LOG_DEBUG,
+                 "__osm_ftree_fabric_route_upgoing_by_going_down: "
+                 "Switch %s: set path to HCA LID 0x%x through port %u\n",
+                 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
+                 cl_ntoh16(target_lid),
+                 p_min_port->remote_port_num);
+      }
+   
+      /* The number of upgoing routes is tracked in the 
+         p_port->counter_up counter of the port that belongs to
+         the upper side of the link (on switch with lower rank).
+         Counter is promoted only if we're routing LID on the main
+         path (whether it's a real LID or a dummy one). */
+      if (is_main_path)
+         p_min_port->counter_up++;
+
+      /* Recursion step:
+         Assign upgoing ports by stepping down, starting on REMOTE switch.
+         Recursion stop condition - if the REMOTE switch is a leaf switch. */
+      if (p_remote_sw->rank != (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+      {
+         __osm_ftree_fabric_route_upgoing_by_going_down(
+               p_ftree,
+               p_remote_sw,   /* remote switch - used as a route-upgoing alg. start point */
+               NULL,          /* prev. position - NULL to mark that we went down and not up */
+               target_lid,    /* LID that we're routing to */
+               is_real_lid,   /* whether the target LID is real or dummy */
+               is_main_path); /* whether this is path to HCA that should by tracked by counters */
+      }
+   }
+   /* done scanning all the down-going port groups */
+
+   OSM_LOG_EXIT(&(osm.log));
+} /* __osm_ftree_fabric_route_upgoing_by_going_down() */
+
+/***************************************************/
+
+/*  
+ * Function: assign-down-going-port-by-descending-up
+ * Given   : a switch and a LID
+ * Pseudo code: 
+ *    find the least loaded port of all the upgoing groups (scan in indexing order)
+ *    assign the LFT(LID) of remote switch to that port
+ *    track that port usage
+ *    assign-up-going-port-by-descending-down on CURRENT switch
+ *    assign-down-going-port-by-descending-up on REMOTE switch (recursion)
+ */
+
+static void
+__osm_ftree_fabric_route_downgoing_by_going_up(
+   IN  ftree_fabric_t * p_ftree,
+   IN  ftree_sw_t     * p_sw,
+   IN  ftree_sw_t     * p_prev_sw,
+   IN  ib_net16_t       target_lid,
+   IN  boolean_t        is_real_lid,
+   IN  boolean_t        is_main_path)
+{
+   ftree_sw_t          * p_remote_sw;
+   uint16_t              ports_num;
+   ftree_port_group_t  * p_group;
+   ftree_port_t        * p_port;
+   ftree_port_group_t  * p_min_group;
+   ftree_port_t        * p_min_port;
+   uint16_t              i;
+   uint16_t              j;
+
+   /* we shouldn't enter here if both real_lid and main_path are false */
+   CL_ASSERT(is_real_lid || is_main_path);
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_route_downgoing_by_going_up);
+
+   /* If this switch isn't a leaf switch:
+      Assign upgoing ports by stepping down, starting on THIS switch. */
+   if (p_sw->rank != (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+   {
+      __osm_ftree_fabric_route_upgoing_by_going_down(
+         p_ftree,
+         p_sw,          /* local switch - used as a route-upgoing alg. start point */
+         p_prev_sw,     /* switch that we went up from (NULL means that we went down) */
+         target_lid,    /* LID that we're routing to */
+         is_real_lid,   /* whether this target LID is real or dummy */
+         is_main_path); /* whether this path to HCA should by tracked by counters */
+   }
+
+   /* recursion stop condition - if it's a root switch, */
+   if (p_sw->rank == 0)
+   {
+      OSM_LOG_EXIT(&(osm.log));
+      return;
+   }
+
+   /* Find the least loaded port of all the upgoing port groups
+      (in indexing order of the remote switches). */
+   p_min_group = NULL;
+   p_min_port = NULL;
+   for (i = 0; i < p_sw->up_port_groups_num; i++)
+   {
+      p_group = p_sw->up_port_groups[i];
+
+      ports_num = cl_ptr_vector_get_size(&p_group->ports);
+      for (j = 0; j < ports_num; j++)
+      {
+         cl_ptr_vector_at(&p_group->ports, j, (void **)&p_port);
+         if (!p_min_group)
+         {
+            /* first port that we're checking - use
+               it as a port with the lowest load */
+            p_min_group = p_group;
+            p_min_port = p_port;
+         }
+         else
+         { 
+            if ( p_port->counter_down < p_min_port->counter_down  )
+            {
+               /* this port is less loaded - use it as min */
+               p_min_group = p_group;
+               p_min_port = p_port;
+            }
+         }
+      }
+   }
+
+   /* At this point we have selected a group and port with the 
+      lowest load of downgoing routes.
+      Set on the remote switch how to get to the target_lid -
+      set LFT(target_lid) on the remote switch to the remote port */
+   p_remote_sw = p_min_group->remote_hca_or_sw.remote_sw;
+
+   /* Four possible cases:
+    *
+    *  1. is_real_lid == TRUE && is_main_path == TRUE: 
+    *      - going UP(TRUE,TRUE) on selected min_group and min_port
+    *         + promoting port counter
+    *         + setting path in remote switch fwd tbl
+    *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0
+    *         + NOT promoting port counter
+    *         + setting path in remote switch fwd tbl if it hasn't been set yet
+    *      
+    *  2. is_real_lid == TRUE && is_main_path == FALSE: 
+    *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
+    *        but only if the remote (upper) switch hasn't been already 
+    *        configured for this target LID
+    *         + NOT promoting port counter
+    *         + setting path in remote switch fwd tbl if it hasn't been set yet
+    *
+    *  3. is_real_lid == FALSE && is_main_path == TRUE: 
+    *      - going UP(FALSE,TRUE) ONLY on selected min_group and min_port
+    *         + promoting port counter
+    *         + NOT setting path in remote switch fwd tbl
+    *
+    *  4. is_real_lid == FALSE && is_main_path == FALSE: 
+    *      - illegal state - we shouldn't get here
+    */
+
+   /* covering first half of case 1, and case 3 */
+   if (is_main_path)
+   {
+      if (p_sw->rank == (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+      {
+         osm_log(&osm.log, OSM_LOG_DEBUG,
+                 "__osm_ftree_fabric_route_downgoing_by_going_up: "
+                 " - Routing MAIN path for %s HCA LID 0x%x: %s --> %s\n",
+                 (is_real_lid)? "real" : "DUMMY",
+                 cl_ntoh16(target_lid),
+                 __osm_ftree_tuple_to_str(p_sw->tuple),
+                 __osm_ftree_tuple_to_str(p_remote_sw->tuple));
+      }
+      /* The number of downgoing routes is tracked in the 
+         p_port->counter_down counter of the port that belongs to
+         the lower side of the link (on switch with higher rank) */
+      p_min_port->counter_down++;
+      if (is_real_lid)
+      {
+         __osm_ftree_sw_set_fwd_table_block(p_remote_sw,
+                                            cl_ntoh16(target_lid),
+                                            p_min_port->remote_port_num);
+         p_remote_sw->lft_buf[cl_ntoh16(target_lid)] = p_min_port->remote_port_num;
+         osm_log(&osm.log, OSM_LOG_DEBUG,
+                 "__osm_ftree_fabric_route_downgoing_by_going_up: "
+                 "Switch %s: set path to HCA LID 0x%x through port %u\n",
+                 __osm_ftree_tuple_to_str(p_remote_sw->tuple),
+                 cl_ntoh16(target_lid),p_min_port->remote_port_num);
+      }
+
+      /* Recursion step: 
+         Assign downgoing ports by stepping up, starting on REMOTE switch. */
+      __osm_ftree_fabric_route_downgoing_by_going_up(
+            p_ftree,
+            p_remote_sw,    /* remote switch - used as a route-downgoing alg. next step point */
+            p_sw,           /* this switch - prev. position switch for the function */
+            target_lid,     /* LID that we're routing to */
+            is_real_lid,    /* whether this target LID is real or dummy */
+            is_main_path);  /* whether this is path to HCA that should by tracked by counters */
+   }
+
+   /* we're done for the third case */
+   if (!is_real_lid)
+   {
+      OSM_LOG_EXIT(&(osm.log));
+      return;
+   }
+
+   /* What's left to do at this point:
+    *
+    *  1. is_real_lid == TRUE && is_main_path == TRUE: 
+    *      - going UP(TRUE,FALSE) on rest of the groups, each time on port 0, 
+    *        but only if the remote (upper) switch hasn't been already 
+    *        configured for this target LID
+    *         + NOT promoting port counter
+    *         + setting path in remote switch fwd tbl if it hasn't been set yet
+    *      
+    *  2. is_real_lid == TRUE && is_main_path == FALSE: 
+    *      - going UP(TRUE,FALSE) on ALL the groups, each time on port 0,
+    *        but only if the remote (upper) switch hasn't been already 
+    *        configured for this target LID
+    *         + NOT promoting port counter
+    *         + setting path in remote switch fwd tbl if it hasn't been set yet
+    *
+    *  These two rules can be rephrased this way:
+    *   - foreach UP port group
+    *      + if remote switch has been set with the target LID
+    *         - skip this port group
+    *      + else
+    *         - select port 0
+    *         - do NOT promote port counter
+    *         - set path in remote switch fwd tbl
+    *         - go UP(TRUE,FALSE) to the remote switch
+    */
+
+   for (i = 0; i < p_sw->up_port_groups_num; i++)
+   {
+      p_group = p_sw->up_port_groups[i];
+      p_remote_sw = p_group->remote_hca_or_sw.remote_sw;
+
+      /* skip if target lid has been already set on remote switch fwd tbl */
+      if (__osm_ftree_sw_get_fwd_table_block(
+                  p_remote_sw,cl_ntoh16(target_lid)) != OSM_NO_PATH)
+         continue;
+
+      if (p_sw->rank == (__osm_ftree_fabric_get_rank(p_ftree) - 1))
+      {
+         osm_log(&osm.log, OSM_LOG_DEBUG,
+                 "__osm_ftree_fabric_route_downgoing_by_going_up: "
+                 " - Routing SECONDARY path for LID 0x%x: %s --> %s\n",
+                cl_ntoh16(target_lid),
+                __osm_ftree_tuple_to_str(p_sw->tuple),
+                __osm_ftree_tuple_to_str(p_remote_sw->tuple));
+      }
+    
+      cl_ptr_vector_at(&p_group->ports, 0, (void **)&p_port);
+      __osm_ftree_sw_set_fwd_table_block(p_remote_sw,
+                                         cl_ntoh16(target_lid),
+                                         p_port->remote_port_num);
+      /* Recursion step: 
+         Assign downgoing ports by stepping up, starting on REMOTE switch. */
+      __osm_ftree_fabric_route_downgoing_by_going_up(
+            p_ftree,
+            p_remote_sw, /* remote switch - used as a route-downgoing alg. next step point */
+            p_sw,        /* this switch - prev. position switch for the function */
+            target_lid,  /* LID that we're routing to */
+            TRUE,        /* whether the target LID is real or dummy */
+            FALSE);      /* whether this is path to HCA that should by tracked by counters */
+   }
+
+   OSM_LOG_EXIT(&(osm.log));
+} /* ftree_fabric_route_downgoing_by_going_up() */
+
+/***************************************************/
+
+/*  
+ * Pseudo code: 
+ *    foreach leaf switch (in indexing order)
+ *       for each compute node (in indexing order)
+ *          obtain the LID of the compute node
+ *          set local LFT(LID) of the port connecting to compute node
+ *          call assign-down-going-port-by-descending-up(TRUE,TRUE) on CURRENT switch
+ *       for each MISSING compute node
+ *          call assign-down-going-port-by-descending-up(FALSE,TRUE) on CURRENT switch
+ */
+
+static void
+__osm_ftree_fabric_route_to_hcas(
+   IN  ftree_fabric_t * p_ftree)
+{
+   ftree_sw_t         * p_sw;
+   ftree_port_group_t * p_group;
+   ftree_port_t       * p_port;
+   uint32_t             i;
+   uint32_t             j;
+   ib_net16_t           remote_lid;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_route_to_hcas);
+
+   /* for each leaf switch (in indexing order) */
+   for(i = 0; i < p_ftree->leaf_switches_num; i++)
+   {
+      p_sw = p_ftree->leaf_switches[i];
+
+      /* for each HCA connected to this switch */
+      for (j = 0; j < p_sw->down_port_groups_num; j++)
+      {
+         /* obtain the LID of HCA port */
+         p_group = p_sw->down_port_groups[j];
+         remote_lid = p_group->remote_base_lid;
+
+         /* set local LFT(LID) to the port that is connected to HCA */
+         cl_ptr_vector_at(&p_group->ports, 0, (void **)&p_port);
+         __osm_ftree_sw_set_fwd_table_block(p_sw,
+                                            cl_ntoh16(remote_lid),
+                                            p_port->port_num);
+         osm_log(&osm.log, OSM_LOG_DEBUG,
+                 "__osm_ftree_fabric_route_to_hcas: "
+                 "Switch %s: set path to HCA LID 0x%x through port %u\n",
+                 __osm_ftree_tuple_to_str(p_sw->tuple),
+                 cl_ntoh16(remote_lid),
+                 p_port->port_num);
+
+         /* assign downgoing ports by stepping up */
+         __osm_ftree_fabric_route_downgoing_by_going_up(
+               p_ftree,
+               p_sw,       /* local switch - used as a route-downgoing alg. start point */
+               NULL,       /* prev. position switch */
+               remote_lid, /* LID that we're routing to */
+               TRUE,       /* whether this HCA LID is real or dummy */
+               TRUE);      /* whether this path to HCA should by tracked by counters */
+      }
+
+      /* We're done with the real HCAs. Now route the dummy HCAs that are missing.*/
+
+      if (p_ftree->max_hcas_per_leaf > p_sw->down_port_groups_num)
+      {
+         osm_log(&osm.log, OSM_LOG_DEBUG,"__osm_ftree_fabric_route_to_hcas: "
+                 "Routing %u dummy HCAs\n",
+                 p_ftree->max_hcas_per_leaf - p_sw->down_port_groups_num);
+         for (j = 0; j < (p_ftree->max_hcas_per_leaf - p_sw->down_port_groups_num); j++)
+         {
+            /* assign downgoing ports by stepping up */
+            __osm_ftree_fabric_route_downgoing_by_going_up(
+                  p_ftree,
+                  p_sw,    /* local switch - used as a route-downgoing alg. start point */
+                  NULL,    /* prev. position switch */
+                  0,       /* LID that we're routing to - ignored for dummy HCA */
+                  FALSE,   /* whether this HCA LID is real or dummy */
+                  TRUE);   /* whether this path to HCA should by tracked by counters */
+         }
+      }
+   }
+   /* done going through all the leaf switches */
+   OSM_LOG_EXIT(&(osm.log));
+} /* __osm_ftree_fabric_route_to_hcas() */
+
+/***************************************************/
+
+/*  
+ * Pseudo code: 
+ *    foreach switch in fabric
+ *       obtain its LID
+ *       set local LFT(LID) to port 0
+ *       call assign-down-going-port-by-descending-up(TRUE,FALSE) on CURRENT switch
+ *
+ * Routing to switch is similar to routing a REAL hca lid on SECONDARY path:
+ *   - we should set fwd tables
+ *   - we should NOT update port counters
+ */
+
+static void
+__osm_ftree_fabric_route_to_switches(
+   IN  ftree_fabric_t * p_ftree)
+{
+   ftree_sw_t         * p_sw;
+   ftree_sw_t         * p_next_sw;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_route_to_switches);
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( p_next_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl) )
+   {
+      p_sw = p_next_sw;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+
+      /* set local LFT(LID) to 0 (route to itself) */
+      __osm_ftree_sw_set_fwd_table_block(p_sw,
+                                         cl_ntoh16(p_sw->base_lid),
+                                         0);
+
+      osm_log(&osm.log, OSM_LOG_DEBUG,"__osm_ftree_fabric_route_to_switches: "
+              "Switch %s (LID 0x%x): routing switch-to-switch pathes\n",
+              __osm_ftree_tuple_to_str(p_sw->tuple),
+              cl_ntoh16(p_sw->base_lid));
+
+      __osm_ftree_fabric_route_downgoing_by_going_up(
+            p_ftree,
+            p_sw,           /* local switch - used as a route-downgoing alg. start point */
+            NULL,           /* prev. position switch */
+            p_sw->base_lid, /* LID that we're routing to */
+            TRUE,           /* whether the target LID is a real or dummy */
+            FALSE);         /* whether this path should by tracked by counters */
+   }
+
+   OSM_LOG_EXIT(&(osm.log));
+} /* __osm_ftree_fabric_route_to_switches() */
+
+/***************************************************
+ ***************************************************/
+
+static int 
+__osm_ftree_fabric_populate_switches(
+   IN  ftree_fabric_t * p_ftree)
+{
+   osm_switch_t * p_osm_sw;
+   osm_switch_t * p_next_osm_sw;
+   osm_opensm_t * p_osm = &osm;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_populate_switches);
+
+   p_next_osm_sw = (osm_switch_t *)cl_qmap_head(&p_osm->subn.sw_guid_tbl);
+   while( p_next_osm_sw != (osm_switch_t *)cl_qmap_end(&p_osm->subn.sw_guid_tbl) )
+   {
+      p_osm_sw = p_next_osm_sw;
+      p_next_osm_sw = (osm_switch_t *)cl_qmap_next(&p_osm_sw->map_item );
+      __osm_ftree_fabric_add_sw(p_ftree,p_osm_sw);
+   }
+   OSM_LOG_EXIT(&(osm.log));
+   return 0;
+} /* __osm_ftree_fabric_populate_switches() */
+
+/***************************************************
+ ***************************************************/
+
+static int 
+__osm_ftree_fabric_populate_hcas(
+   IN  ftree_fabric_t * p_ftree)
+{
+   osm_node_t   * p_osm_node;
+   osm_node_t   * p_next_osm_node;
+   osm_opensm_t * p_osm = &osm;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_populate_hcas);
+
+   p_next_osm_node = (osm_node_t *)cl_qmap_head(&p_osm->subn.node_guid_tbl);
+   while( p_next_osm_node != (osm_node_t *)cl_qmap_end(&p_osm->subn.node_guid_tbl) )
+   {
+      p_osm_node = p_next_osm_node;
+      p_next_osm_node = (osm_node_t *)cl_qmap_next(&p_osm_node->map_item);
+      switch (osm_node_get_type(p_osm_node))
+      {
+         case IB_NODE_TYPE_CA:
+            __osm_ftree_fabric_add_hca(p_ftree,p_osm_node);
+            break;
+         case IB_NODE_TYPE_ROUTER:
+            break;
+         case IB_NODE_TYPE_SWITCH:
+            /* all the switches added separately */
+            break;
+         default:
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_populate_hcas: ERR AB0E: "
+                    "Node GUID 0x%016" PRIx64 " - Unknown node type: %s\n",
+                    cl_ntoh64(osm_node_get_node_guid(p_osm_node)),
+                    ib_get_node_type_str(osm_node_get_type(p_osm_node)));
+            OSM_LOG_EXIT(&(osm.log));
+            return -1;
+      }
+   }
+
+   OSM_LOG_EXIT(&(osm.log));
+   return 0;
+} /* __osm_ftree_fabric_populate_hcas() */
+
+/***************************************************
+ ***************************************************/
+
+static void
+__osm_ftree_rank_from_switch(
+   IN  ftree_fabric_t * p_ftree, 
+   IN  ftree_sw_t *     p_starting_sw)
+{
+   ftree_sw_t   * p_sw;
+   ftree_sw_t   * p_remote_sw;
+   osm_node_t   * p_node;
+   osm_node_t   * p_remote_node;
+   osm_physp_t  * p_osm_port;
+   uint16_t       i;
+   cl_list_t      bfs_list;
+   ftree_sw_tbl_element_t * p_sw_tbl_element = NULL;
+
+   p_starting_sw->rank = 0;
+
+   /* Run BFS scan of the tree, starting from this switch */
+
+   cl_list_init(&bfs_list, cl_qmap_count(&p_ftree->sw_tbl));
+   cl_list_insert_tail(&bfs_list, &__osm_ftree_sw_tbl_element_create(p_starting_sw)->map_item);
+
+   while (!cl_is_list_empty(&bfs_list))
+   {
+      p_sw_tbl_element = (ftree_sw_tbl_element_t *)cl_list_remove_head(&bfs_list);
+      p_sw = p_sw_tbl_element->p_sw;
+      __osm_ftree_sw_tbl_element_destroy(p_sw_tbl_element);
+
+      p_node = osm_switch_get_node_ptr(p_sw->p_osm_sw);
+
+      /* note: skipping port 0 on switches */
+      for (i = 1; i < osm_node_get_num_physp(p_node); i++)
+      {
+         p_osm_port = osm_node_get_physp_ptr(p_node,i);
+         if (!osm_physp_is_valid(p_osm_port)) 
+            continue;
+         if (!osm_link_is_healthy(p_osm_port)) 
+            continue;
+
+         p_remote_node = osm_node_get_remote_node(p_node,i,NULL);
+         if (!p_remote_node)
+            continue;
+         if (osm_node_get_type(p_remote_node) != IB_NODE_TYPE_SWITCH)
+            continue;
+
+         p_remote_sw = (ftree_sw_t *)cl_qmap_get(&p_ftree->sw_tbl,
+                                                 osm_node_get_node_guid(p_remote_node));
+         if (p_remote_sw == (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl))
+         {
+            /* remote node is not a switch */
+            continue;
+         }
+         if (__osm_ftree_sw_ranked(p_remote_sw) && p_remote_sw->rank <= (p_sw->rank + 1))
+            continue;
+
+         /* rank the remote switch and add it to the BFS list */
+         p_remote_sw->rank = p_sw->rank + 1;
+         cl_list_insert_tail(&bfs_list, 
+                             &__osm_ftree_sw_tbl_element_create(p_remote_sw)->map_item);
+      }
+   }
+} /* __osm_ftree_rank_from_switch() */
+
+
+/***************************************************
+ ***************************************************/
+
+static int 
+__osm_ftree_rank_switches_from_hca(
+   IN  ftree_fabric_t * p_ftree,
+   IN  ftree_hca_t    * p_hca)
+{
+   ftree_sw_t     * p_sw;
+   osm_node_t     * p_osm_node = p_hca->p_osm_node;
+   osm_node_t     * p_remote_osm_node;
+   osm_physp_t    * p_osm_port;
+   static uint16_t i = 0;
+   int res = 0;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_rank_switches_from_hca);
+
+   for (i = 0; i < osm_node_get_num_physp(p_osm_node); i++)
+   {
+      p_osm_port = osm_node_get_physp_ptr(p_osm_node,i);
+      if (!osm_physp_is_valid(p_osm_port)) 
+         continue;
+      if (!osm_link_is_healthy(p_osm_port)) 
+         continue;
+
+      p_remote_osm_node = osm_node_get_remote_node(p_osm_node,i,NULL);
+
+      switch (osm_node_get_type(p_remote_osm_node))
+      {
+         case IB_NODE_TYPE_CA:
+            /* HCA connected directly to another HCA - not FatTree */
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_rank_switches_from_hca: ERR AB0F: "
+                    "HCA conected directly to another HCA: "
+                    "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
+                    cl_ntoh64(osm_node_get_node_guid(p_hca->p_osm_node)),
+                    cl_ntoh64(osm_node_get_node_guid(p_remote_osm_node)));
+            res = -1;
+            goto Exit;
+
+         case IB_NODE_TYPE_ROUTER:
+            /* leaving this port - proceeding to the next one */
+            continue;
+
+         case IB_NODE_TYPE_SWITCH:
+            /* continue with this port */
+            break;
+
+         default:
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_rank_switches_from_hca: ERR AB10: "
+                    "Node GUID 0x%016" PRIx64 " - Unknown node type: %s\n",
+                    cl_ntoh64(osm_node_get_node_guid(p_remote_osm_node)),
+                    ib_get_node_type_str(osm_node_get_type(p_remote_osm_node)));
+            res = -1;
+            goto Exit;
+      }
+
+      /* remote node is switch */
+
+      p_sw = (ftree_sw_t *)cl_qmap_get(&p_ftree->sw_tbl,
+                                       p_osm_port->p_remote_physp->p_node->node_info.node_guid);
+
+      CL_ASSERT(p_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl));
+
+      if (__osm_ftree_sw_ranked(p_sw) && p_sw->rank == 0)
+         continue;
+
+      osm_log(&osm.log, OSM_LOG_DEBUG,"__osm_ftree_rank_switches_from_hca: "
+              "Marking rank of switch that is directly connected to HCA:\n"
+              "                                            - HCA guid   : 0x%016" PRIx64 "\n"
+              "                                            - Switch guid: 0x%016" PRIx64 "\n"
+              "                                            - Switch LID : 0x%x\n",
+              cl_ntoh64(osm_node_get_node_guid(p_hca->p_osm_node)),
+              cl_ntoh64(osm_node_get_node_guid(osm_switch_get_node_ptr(p_sw->p_osm_sw))),
+              cl_ntoh16(p_sw->base_lid));
+      __osm_ftree_rank_from_switch(p_ftree, p_sw);
+   }
+
+ Exit:
+   OSM_LOG_EXIT(&(osm.log));
+   return res;
+} /* __osm_ftree_rank_switches_from_hca() */
+
+/***************************************************/
+
+static void 
+__osm_ftree_sw_reverse_rank(
+   IN  cl_map_item_t* const p_map_item, 
+   IN  void *context)
+{
+   ftree_fabric_t * p_ftree = (ftree_fabric_t *)context;
+   ftree_sw_t     * p_sw = (ftree_sw_t * const) p_map_item;
+   p_sw->rank = __osm_ftree_fabric_get_rank(p_ftree) - p_sw->rank - 1;
+}
+
+/***************************************************
+ ***************************************************/
+
+static int
+__osm_ftree_fabric_construct_hca_ports(
+   IN  ftree_fabric_t  * p_ftree, 
+   IN  ftree_hca_t     * p_hca)
+{
+   ftree_sw_t      * p_remote_sw;
+   osm_node_t      * p_node = p_hca->p_osm_node;
+   osm_node_t      * p_remote_node;
+   uint8_t           remote_node_type;
+   ib_net64_t        remote_node_guid;
+   osm_physp_t     * p_remote_osm_port;
+   uint16_t          i;
+   uint8_t           remote_port_num;
+   int res = 0;
+
+   for (i = 0; i < osm_node_get_num_physp(p_node); i++)
+   {
+      osm_physp_t * p_osm_port = osm_node_get_physp_ptr(p_node,i);
+
+      if (!osm_physp_is_valid(p_osm_port)) 
+         continue;
+      if (!osm_link_is_healthy(p_osm_port)) 
+         continue;
+
+      p_remote_osm_port = osm_physp_get_remote(p_osm_port);
+      p_remote_node = osm_node_get_remote_node(p_node,i,&remote_port_num);
+
+      if (!p_remote_osm_port)
+         continue;
+
+      remote_node_type = osm_node_get_type(p_remote_node);
+      remote_node_guid = osm_node_get_node_guid(p_remote_node);
+
+      switch (remote_node_type)
+      {
+         case IB_NODE_TYPE_ROUTER:
+            /* leaving this port - proceeding to the next one */
+            continue;
+
+         case IB_NODE_TYPE_CA:
+            /* HCA connected directly to another HCA - not FatTree */
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_construct_hca_ports: ERR AB11: "
+                    "HCA conected directly to another HCA: "
+                    "0x%016" PRIx64 " <---> 0x%016" PRIx64 "\n",
+                    cl_ntoh64(osm_node_get_node_guid(p_node)),
+                    cl_ntoh64(remote_node_guid));
+            res = -1;
+            goto Exit;
+
+         case IB_NODE_TYPE_SWITCH:
+            /* continue with this port */
+            break;
+
+         default:
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_construct_hca_ports: ERR AB12: "
+                    "Node GUID 0x%016" PRIx64 " - Unknown node type: %s\n",
+                    cl_ntoh64(remote_node_guid),
+                    ib_get_node_type_str(remote_node_type));
+            res = -1;
+            goto Exit;
+      }
+
+      /* remote node is switch */
+
+      p_remote_sw = (ftree_sw_t *)cl_qmap_get(&p_ftree->sw_tbl,remote_node_guid);
+      CL_ASSERT( p_remote_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl) );
+      CL_ASSERT( (p_remote_sw->rank + 1) == __osm_ftree_fabric_get_rank(p_ftree) );
+
+      __osm_ftree_hca_add_port(
+            p_hca,                                     /* local ftree_hca object */
+            i,                                         /* local port number */
+            remote_port_num,                           /* remote port number */
+            osm_node_get_base_lid(p_node, i),          /* local lid */
+            osm_node_get_lmc(p_node, i),               /* local lmc */
+            osm_node_get_base_lid(p_remote_node, 0),   /* remote lid */
+            osm_node_get_lmc(p_remote_node, 0),        /* remote lmc */
+            osm_physp_get_port_guid(p_osm_port),       /* local port guid */
+            osm_physp_get_port_guid(p_remote_osm_port),/* remote port guid */
+            remote_node_guid,                          /* remote node guid */
+            remote_node_type,                          /* remote node type */
+            (void *) p_remote_sw);                     /* remote ftree_hca/sw object */
+   }
+
+ Exit:
+   return res;
+} /* __osm_ftree_fabric_construct_hca_ports() */
+
+/***************************************************
+ ***************************************************/
+
+static int 
+__osm_ftree_fabric_construct_sw_ports(
+   IN  ftree_fabric_t  * p_ftree, 
+   IN  ftree_sw_t      * p_sw)
+{
+   ftree_hca_t       * p_remote_hca;
+   ftree_sw_t        * p_remote_sw;
+   osm_node_t        * p_node = osm_switch_get_node_ptr(p_sw->p_osm_sw);
+   osm_node_t        * p_remote_node;
+   ib_net16_t          remote_base_lid;
+   uint8_t             remote_lmc;
+   uint8_t             remote_node_type;
+   ib_net64_t          remote_node_guid;
+   osm_physp_t       * p_remote_osm_port;
+   ftree_direction_t   direction;
+   void              * p_remote_hca_or_sw;
+   uint16_t            i;
+   uint8_t             remote_port_num;
+   int res = 0;
+
+   CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
+
+   for (i = 0; i < osm_node_get_num_physp(p_node); i++)
+   {
+      osm_physp_t * p_osm_port = osm_node_get_physp_ptr(p_node,i);
+
+      if (!osm_physp_is_valid(p_osm_port)) 
+         continue;
+      if (!osm_link_is_healthy(p_osm_port)) 
+         continue;
+
+      p_remote_osm_port = osm_physp_get_remote(p_osm_port);
+      p_remote_node = osm_node_get_remote_node(p_node,i,&remote_port_num);
+
+      if (!p_remote_osm_port)
+         continue;
+
+      remote_node_type = osm_node_get_type(p_remote_node);
+      remote_node_guid = osm_node_get_node_guid(p_remote_node);
+
+      switch (remote_node_type)
+      {
+         case IB_NODE_TYPE_ROUTER:
+            /* leaving this port - proceeding to the next one */
+            continue;
+
+         case IB_NODE_TYPE_CA:
+            /* switch connected to hca */
+
+            CL_ASSERT((p_sw->rank + 1) == __osm_ftree_fabric_get_rank(p_ftree));
+
+            p_remote_hca = (ftree_hca_t *)cl_qmap_get(&p_ftree->hca_tbl,remote_node_guid);
+            CL_ASSERT(p_remote_hca != (ftree_hca_t *)cl_qmap_end(&p_ftree->hca_tbl));
+
+            p_remote_hca_or_sw = (void *)p_remote_hca;
+            direction = FTREE_DIRECTION_DOWN;
+
+            remote_base_lid = osm_physp_get_base_lid(p_remote_osm_port);
+            remote_lmc = osm_physp_get_lmc(p_remote_osm_port);
+            break;
+
+         case IB_NODE_TYPE_SWITCH:
+            /* switch connected to another switch */
+
+            p_remote_sw = (ftree_sw_t *)cl_qmap_get(&p_ftree->sw_tbl,remote_node_guid);
+            CL_ASSERT(p_remote_sw != (ftree_sw_t *)cl_qmap_end(&p_ftree->sw_tbl));
+            CL_ASSERT(abs(p_sw->rank - p_remote_sw->rank) == 1);
+            p_remote_hca_or_sw = (void *)p_remote_sw;
+
+            if (p_sw->rank > p_remote_sw->rank)
+               direction = FTREE_DIRECTION_UP;
+            else
+               direction = FTREE_DIRECTION_DOWN;
+
+            /* switch LID is only in port 0 port_info structure */
+            remote_base_lid = osm_node_get_base_lid(p_remote_node, 0);
+            remote_lmc = osm_node_get_lmc(p_remote_node, 0);
+
+            break;
+
+         default:
+            osm_log(&osm.log, OSM_LOG_ERROR,"__osm_ftree_fabric_construct_sw_ports: ERR AB13: "
+                    "Node GUID 0x%016" PRIx64 " - Unknown node type: %s\n",
+                    cl_ntoh64(remote_node_guid),
+                    ib_get_node_type_str(remote_node_type));
+            res = -1;
+            goto Exit;
+      }
+      __osm_ftree_sw_add_port(
+            p_sw,                                       /* local ftree_sw object */     
+            i,                                          /* local port number */          
+            remote_port_num,                            /* remote port number */         
+            p_sw->base_lid,                             /* local lid */                  
+            p_sw->lmc,                                  /* local lmc */                  
+            remote_base_lid,                            /* remote lid */                 
+            remote_lmc,                                 /* remote lmc */                 
+            osm_physp_get_port_guid(p_osm_port),        /* local port guid */            
+            osm_physp_get_port_guid(p_remote_osm_port), /* remote port guid */           
+            remote_node_guid,                           /* remote node guid */           
+            remote_node_type,                           /* remote node type */           
+            p_remote_hca_or_sw,                         /* remote ftree_hca/sw object */ 
+            direction);                                 /* port direction (up or down) */
+   }
+
+ Exit:
+   return res;
+} /* __osm_ftree_fabric_construct_sw_ports() */
+
+/***************************************************
+ ***************************************************/
+
+/* ToDo: improve ranking algorithm complexity
+   by propogating BFS from more nodes */ 
+static int
+__osm_ftree_fabric_perform_ranking(
+   IN  ftree_fabric_t * p_ftree)
+{
+   ftree_hca_t * p_hca;
+   ftree_hca_t * p_next_hca;
+   int res = 0;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_perform_ranking);
+
+   /* Mark REVERSED rank of all the switches in the subnet. 
+      Start from switches that are connected to hca's, and 
+      scan all the switches in the subnet. */
+   p_next_hca = (ftree_hca_t *)cl_qmap_head(&p_ftree->hca_tbl);
+   while( p_next_hca != (ftree_hca_t *)cl_qmap_end( &p_ftree->hca_tbl ) )
+   {
+      p_hca = p_next_hca;
+      p_next_hca = (ftree_hca_t *)cl_qmap_next(&p_hca->map_item );
+      if (__osm_ftree_rank_switches_from_hca(p_ftree,p_hca) != 0)
+      {
+         res = -1;
+         osm_log(&osm.log, OSM_LOG_ERROR, "__osm_ftree_fabric_perform_ranking: ERR AB14: "
+                 "Subnet ranking failed - subnet is not FatTree");
+         goto Exit;
+      }
+   }
+
+   /* calculate and set FatTree rank */
+   __osm_ftree_fabric_calculate_rank(p_ftree);
+   osm_log(&osm.log, OSM_LOG_INFO,"__osm_ftree_fabric_perform_ranking: "
+           "FatTree rank is %u\n", __osm_ftree_fabric_get_rank(p_ftree));
+   
+   /* fix ranking of the switches by reversing the ranking direction */
+   cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_sw_reverse_rank, (void *)p_ftree);
+
+   if ( __osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK ||
+        __osm_ftree_fabric_get_rank(p_ftree) < FAT_TREE_MIN_RANK )
+   {
+      osm_log(&osm.log, OSM_LOG_ERROR, "__osm_ftree_fabric_perform_ranking: ERR AB15: "
+              "Tree rank is %u (should be between %u and %u)\n",
+              __osm_ftree_fabric_get_rank(p_ftree),
+              FAT_TREE_MIN_RANK,
+              FAT_TREE_MAX_RANK);
+      res = -1;
+      goto Exit;
+   }
+
+  Exit:
+   OSM_LOG_EXIT(&(osm.log));
+   return res;
+} /* __osm_ftree_fabric_perform_ranking() */
+
+/***************************************************
+ ***************************************************/
+
+static int
+__osm_ftree_fabric_populate_ports(
+   IN  ftree_fabric_t * p_ftree)
+{
+   ftree_hca_t * p_hca;
+   ftree_hca_t * p_next_hca;
+   ftree_sw_t * p_sw;
+   ftree_sw_t * p_next_sw;
+   int res = 0;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_fabric_populate_ports);
+
+   p_next_hca = (ftree_hca_t *)cl_qmap_head(&p_ftree->hca_tbl);
+   while( p_next_hca != (ftree_hca_t *)cl_qmap_end( &p_ftree->hca_tbl ) )
+   {
+      p_hca = p_next_hca;
+      p_next_hca = (ftree_hca_t *)cl_qmap_next(&p_hca->map_item );
+      if (__osm_ftree_fabric_construct_hca_ports(p_ftree,p_hca) != 0)
+      {
+         res = -1;
+         goto Exit;
+      }
+   }
+
+   p_next_sw = (ftree_sw_t *)cl_qmap_head(&p_ftree->sw_tbl);
+   while( p_next_sw != (ftree_sw_t *)cl_qmap_end( &p_ftree->sw_tbl ) )
+   {
+      p_sw = p_next_sw;
+      p_next_sw = (ftree_sw_t *)cl_qmap_next(&p_sw->map_item );
+      if (__osm_ftree_fabric_construct_sw_ports(p_ftree,p_sw) != 0)
+      {
+         res = -1;
+         goto Exit;
+      }
+   }
+ Exit:
+   OSM_LOG_EXIT(&(osm.log));
+   return res;
+} /* __osm_ftree_fabric_populate_ports() */
+
+/***************************************************
+ ***************************************************/
+
+static int 
+__osm_ftree_do_routing(void *context)
+{
+   ftree_fabric_t * p_ftree = context;
+   int status = 0;
+
+   OSM_LOG_ENTER(&(osm.log), __osm_ftree_do_routing);
+
+   if ( cl_qmap_count(&osm.subn.sw_guid_tbl) < 2 )
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric has %u switches - topology is not fat-tree.\n"
+              "Falling back to default routing.\n",
+              cl_qmap_count(&osm.subn.sw_guid_tbl));
+      status = -1;
+      goto Exit;
+   }
+
+   if ( (cl_qmap_count(&osm.subn.node_guid_tbl) - 
+         cl_qmap_count(&osm.subn.sw_guid_tbl)) < 2)
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric has %u nodes (%u switches) - topology is not fat-tree.\n"
+              "Falling back to default routing.\n",
+              cl_qmap_count(&osm.subn.node_guid_tbl),
+              cl_qmap_count(&osm.subn.sw_guid_tbl));
+      status = -1;
+      goto Exit;
+   }
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: \n"
+           "                       |------------------------------|\n"
+           "                       |-  Starting FatTree Routing  -|\n"
+           "                       |------------------------------|\n\n");
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Populating FatTree switch table\n");
+   /* ToDo: now that the pointer from node to switch exists,  
+      no need to fill the switch table in a separate loop */
+   if (__osm_ftree_fabric_populate_switches(p_ftree) != 0)
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric topology is not fat-tree - "
+              "falling back to default routing\n");
+      status = -1;
+      goto Exit;
+   }
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Populating FatTree HCA table\n");
+   if (__osm_ftree_fabric_populate_hcas(p_ftree) != 0)
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric topology is not fat-tree - "
+              "falling back to default routing\n");
+      status = -1;
+      goto Exit;
+   }
+
+   if (cl_qmap_count(&p_ftree->hca_tbl) < 2)
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric has %u HCAa - topology is not fat-tree.\n"
+              "Falling back to default routing.\n",
+              cl_qmap_count(&p_ftree->hca_tbl));
+      status = -1;
+      goto Exit;
+   }
+
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Ranking FatTree\n");
+   if (__osm_ftree_fabric_perform_ranking(p_ftree) != 0)
+   {
+      if (__osm_ftree_fabric_get_rank(p_ftree) > FAT_TREE_MAX_RANK)
+         osm_log(&osm.log, OSM_LOG_SYS,
+                 "Fabric rank is %u (>%u) - "
+                 "fat-tree routing falls back to default routing\n",
+                 __osm_ftree_fabric_get_rank(p_ftree), FAT_TREE_MAX_RANK);
+      status = -1;
+      goto Exit;
+   }
+
+   /* For each hca and switch, construct array of ports.
+      This is done after the whole FatTree data structure is ready, because
+      we want the ports to have pointers to ftree_{sw,hca}_t objects.*/
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Populating HCA & switch ports\n");
+   if (__osm_ftree_fabric_populate_ports(p_ftree) != 0)
+   {
+      osm_log(&osm.log, OSM_LOG_SYS,
+              "Fabric topology is not a fat-tree - "
+              "routing falls back to default routing\n");
+      status = -1;
+      goto Exit;
+   }
+
+   /* Assign index to all the switches and hca's in the fabric.
+      This function also sorts all the port arrays of the switches
+      by the remote switch index, creates a leaf switch array
+      sorted by the switch index, and tracks the maximal number of
+      hcas per leaf switch. */
+   __osm_ftree_fabric_make_indexing(p_ftree);
+
+   /* print general info about fabric topology */
+   __osm_ftree_fabric_dump_general_info(p_ftree);
+
+   /* dump full tree topology */
+   if (osm_log_is_active(&osm.log, OSM_LOG_DEBUG))
+       __osm_ftree_fabric_dump(p_ftree);
+
+   if (! __osm_ftree_fabric_validate_topology(p_ftree))
+   {
+      status = -1;
+      goto Exit;
+   }
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Filling switch forwarding tables for routes to HCAs\n");
+   __osm_ftree_fabric_route_to_hcas(p_ftree);
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Filling switch forwarding tables for switch-to-switch pathes\n");
+   __osm_ftree_fabric_route_to_switches(p_ftree);
+
+   /* for each switch, set its fwd table */
+   cl_qmap_apply_func(&p_ftree->sw_tbl, __osm_ftree_set_sw_fwd_table, NULL);
+
+   /* write out hca ordering file */
+   __osm_ftree_fabric_dump_hca_ordering(p_ftree);
+
+ Exit:
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: "
+           "Clearing FatTree Fabric data structures\n");
+   __osm_ftree_fabric_clear(p_ftree);
+
+   osm_log(&osm.log, OSM_LOG_VERBOSE,"__osm_ftree_do_routing: \n"
+           "                       |---------------------------------------|\n"
+           "                       |-  Done FatTree Routing (status = %d)  -|\n"
+           "                       |---------------------------------------|\n\n", status);
+
+   OSM_LOG_EXIT(&(osm.log));
+   return status;
+}
+
+/***************************************************
+ ***************************************************/
+
+static void 
+__osm_ftree_delete(void * context)
+{
+   ftree_fabric_t * p_ftree = (ftree_fabric_t *)context;
+   if (!p_ftree)
+      return;
+
+   __osm_ftree_fabric_destroy(p_ftree);
+
+}
+
+/***************************************************
+ ***************************************************/
+
+int osm_ucast_ftree_setup(osm_opensm_t * p_osm)
+{
+   ftree_fabric_t * p_ftree = __osm_ftree_fabric_create();
+   if (!p_ftree)
+      return -1;
+
+   p_osm->routing_engine.context = (void *)p_ftree;
+   p_osm->routing_engine.ucast_build_fwd_tables = __osm_ftree_do_routing;
+   p_osm->routing_engine.delete = __osm_ftree_delete;
+   /* ToDo: fat-tree routing doesn't use min_hop tables, so we
+      shouldn't fill them (p_osm->routing_engine.build_lid_matrices) */
+   return 0;
+}
+
+/***************************************************
+ ***************************************************/
+
-- 
1.4.4.1.GIT





More information about the general mailing list