[ofa-general] [PATCHv3] opensm: Parallelize (Stripe) LFT sets across switches

Hal Rosenstock hnrose at comcast.net
Fri Aug 7 04:08:11 PDT 2009


Currently, MADs are pipelined to a single switch at a time, which
effectively serializes these requests due to processing at the SMA.
This patch pipelines (stripes) them across the switches first before
proceeding with successive blocks. As a result of this striping,
multiple switches can process the set and respond concurrently
which results in an improvement to the subnet initialization time.

All unicast routing protocols are updated for this.

A similar subsequent change will do this for MFTs.

Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il> wrote:

The test used a small cluster of 17 IS4 switches and 11 HCAs.
To artificially increase the size of the cluster, an LMC of 7 was
used, including EnhancedSwitchPort 0 LMC.

With the new code, LFT configuration is more than twice as
fast as with the old code :)
The current ucast manager ran for ~250 msec on average; with the
new code it ran for 110-120 msec.

The routing calculation phase of the ucast manager took ~1200 usec;
the rest of the time was spent sending the blocks and waiting until
there were no more pending transactions.

Here are some detailed results of different executions (the
number on the left is timer value in usec):

Current ucast manager (w/o the optimization):

000000 [LFT]: osm_ucast_mgr_process() - START
001131 [LFT]: ucast_mgr_process_tbl() - START
032251 [LFT]: ucast_mgr_process_tbl() - END
032263 [LFT]: osm_ucast_mgr_process() - END
253416 [LFT]: Done wait_for_pending_transactions()

New algorithm:

001417 [LFT]: osm_ucast_mgr_process() - START
002690 [LFT]: ucast_mgr_process_tbl() - START
032946 [LFT]: ucast_mgr_process_tbl() - END
032948 [LFT]: osm_ucast_pipeline_tbl() - START
033846 [LFT]: osm_ucast_pipeline_tbl() - END
033858 [LFT]: osm_ucast_mgr_process() - END
108203 [LFT]: Done wait_for_pending_transactions()

With IS3-based Qlogic switches, which do not handle DR packet forwarding
in HW, in a fabric of ~1100 HCAs and ~280 switches:

The current OSM configures LFTs in ~2 seconds.
The new algorithm does the same job in 1.4-1.6 seconds (a 20%-30% speedup).

Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
---
Changes since v2:
Eliminated max_smps_per_node
Moved the pushing of LFTs up from the individual routing engines to the ucast_mgr_route level

Changes since v1:
Added Yevgeny's performance data
No change to actual patch

diff --git a/opensm/include/opensm/osm_ucast_mgr.h b/opensm/include/opensm/osm_ucast_mgr.h
index a040476..4ef045c 100644
--- a/opensm/include/opensm/osm_ucast_mgr.h
+++ b/opensm/include/opensm/osm_ucast_mgr.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -242,16 +242,12 @@ osm_ucast_mgr_init(IN osm_ucast_mgr_t * const p_mgr, IN struct osm_sm * sm);
 *
 * SYNOPSIS
 */
-int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * const p_mgr,
-				IN osm_switch_t * const p_sw);
+void osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * const p_mgr);
 /*
 * PARAMETERS
 *	p_mgr
 *		[in] Pointer to an osm_ucast_mgr_t object.
 *
-*	p_mgr
-*		[in] Pointer to an osm_switch_t object.
-*
 * SEE ALSO
 *	Unicast Manager
 *********/
diff --git a/opensm/opensm/osm_ucast_cache.c b/opensm/opensm/osm_ucast_cache.c
index 216b496..30a3c1d 100644
--- a/opensm/opensm/osm_ucast_cache.c
+++ b/opensm/opensm/osm_ucast_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008      Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2008,2009 Mellanox Technologies LTD. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -1085,9 +1085,10 @@ int osm_ucast_cache_process(osm_ucast_mgr_t * p_mgr)
 			memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
 		}
 
-		osm_ucast_mgr_set_fwd_table(p_mgr, p_sw);
 	}
 
+	osm_ucast_mgr_set_fwd_table(p_mgr);
+
 	return 0;
 }
 
diff --git a/opensm/opensm/osm_ucast_file.c b/opensm/opensm/osm_ucast_file.c
index 2505c46..5b73ca5 100644
--- a/opensm/opensm/osm_ucast_file.c
+++ b/opensm/opensm/osm_ucast_file.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2006,2007 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2008      Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2008,2009 Mellanox Technologies LTD. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -167,9 +167,6 @@ static int do_ucast_file_load(void *context)
 				"skipping parsing. Using default "
 				"routing algorithm\n");
 		} else if (!strncmp(p, "Unicast lids", 12)) {
-			if (p_sw)
-				osm_ucast_mgr_set_fwd_table(&p_osm->sm.
-							    ucast_mgr, p_sw);
 			q = strstr(p, " guid 0x");
 			if (!q) {
 				OSM_LOG(&p_osm->log, OSM_LOG_ERROR,
@@ -220,7 +217,7 @@ static int do_ucast_file_load(void *context)
 				return -1;
 			}
 			p = q;
-			/* additionally try to exract guid */
+			/* additionally try to extract guid */
 			q = strstr(p, " portguid 0x");
 			if (!q) {
 				OSM_LOG(&p_osm->log, OSM_LOG_VERBOSE,
@@ -246,9 +243,6 @@ static int do_ucast_file_load(void *context)
 		}
 	}
 
-	if (p_sw)
-		osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw);
-
 	fclose(file);
 	return 0;
 }
diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c
index bde6dbd..6ec6bc7 100644
--- a/opensm/opensm/osm_ucast_ftree.c
+++ b/opensm/opensm/osm_ucast_ftree.c
@@ -2,7 +2,7 @@
  * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -1905,8 +1905,6 @@ static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
 	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
 
 	p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
-	osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr,
-				    p_sw->p_osm_sw);
 }
 
 /***************************************************
diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c
index b3107f0..0a567b3 100644
--- a/opensm/opensm/osm_ucast_lash.c
+++ b/opensm/opensm/osm_ucast_lash.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2007      Simula Research Laboratory. All rights reserved.
  * Copyright (c) 2007      Silicon Graphics Inc. All rights reserved.
@@ -990,7 +990,6 @@ static void populate_fwd_tbls(lash_t * p_lash)
 {
 	osm_log_t *p_log = &p_lash->p_osm->log;
 	osm_subn_t *p_subn = &p_lash->p_osm->subn;
-	osm_opensm_t *p_osm = p_lash->p_osm;
 	osm_switch_t *p_sw, *p_next_sw, *p_dst_sw;
 	osm_port_t *port;
 	uint16_t max_lid_ho, lid;
@@ -1054,7 +1053,6 @@ static void populate_fwd_tbls(lash_t * p_lash)
 					physical_egress_port);
 			}
 		}		/* for */
-		osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw);
 	}
 	OSM_LOG_EXIT(p_log);
 }
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index 78a7031..e28752a 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -315,16 +315,13 @@ Exit:
 
 /**********************************************************************
  **********************************************************************/
-int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr,
-				IN osm_switch_t * p_sw)
+static int set_fwd_tbl_top(IN osm_ucast_mgr_t * p_mgr, IN osm_switch_t * p_sw)
 {
 	osm_node_t *p_node;
 	osm_dr_path_t *p_path;
 	osm_madw_context_t context;
 	ib_api_status_t status;
 	ib_switch_info_t si;
-	uint16_t block_id_ho = 0;
-	uint8_t block[IB_SMP_DATA_SIZE];
 	boolean_t set_swinfo_require = FALSE;
 	uint16_t lin_top;
 	uint8_t life_state;
@@ -382,48 +379,6 @@ int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr,
 				ib_get_err_str(status));
 	}
 
-	/*
-	   Send linear forwarding table blocks to the switch
-	   as long as the switch indicates it has blocks needing
-	   configuration.
-	 */
-
-	context.lft_context.node_guid = osm_node_get_node_guid(p_node);
-	context.lft_context.set_method = TRUE;
-
-	if (!p_sw->new_lft) {
-		/* any routing should provide the new_lft */
-		CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
-			  p_mgr->cache_valid && !p_sw->need_update);
-		goto Exit;
-	}
-
-	for (block_id_ho = 0;
-	     osm_switch_get_lft_block(p_sw, block_id_ho, block);
-	     block_id_ho++) {
-		if (!p_sw->need_update && !p_mgr->p_subn->need_update &&
-		    !memcmp(block,
-			    p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
-			    IB_SMP_DATA_SIZE))
-			continue;
-
-		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
-			"Writing FT block %u\n", block_id_ho);
-
-		status = osm_req_set(p_mgr->sm, p_path,
-				     p_sw->new_lft +
-				     block_id_ho * IB_SMP_DATA_SIZE,
-				     sizeof(block), IB_MAD_ATTR_LIN_FWD_TBL,
-				     cl_hton32(block_id_ho), CL_DISP_MSGID_NONE,
-				     &context);
-
-		if (status != IB_SUCCESS)
-			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: "
-				"Sending linear fwd. tbl. block failed (%s)\n",
-				ib_get_err_str(status));
-	}
-
-Exit:
 	OSM_LOG_EXIT(p_mgr->p_log);
 	return 0;
 }
@@ -508,7 +463,7 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
 		}
 	}
 
-	osm_ucast_mgr_set_fwd_table(p_mgr, p_sw);
+	set_fwd_tbl_top(p_mgr, p_sw);
 
 	if (p_mgr->p_subn->opt.lmc)
 		free_ports_priv(p_mgr);
@@ -516,6 +471,101 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
 	OSM_LOG_EXIT(p_mgr->p_log);
 }
 
+static void ucast_mgr_process_top(IN cl_map_item_t * p_map_item,
+				  IN void *context)
+{
+	osm_ucast_mgr_t *p_mgr = context;
+	osm_switch_t *const p_sw = (osm_switch_t *) p_map_item;
+
+	set_fwd_tbl_top(p_mgr, p_sw);
+}
+
+static boolean_t set_next_lft_block(IN osm_switch_t * p_sw, IN osm_sm_t * p_sm,
+				    IN uint8_t * p_block,
+				    IN osm_dr_path_t * p_path,
+				    IN uint16_t block_id_ho,
+				    IN osm_madw_context_t * p_context)
+{
+	ib_api_status_t status;
+	boolean_t sts;
+
+	OSM_LOG_ENTER(p_sm->p_log);
+
+	for (;
+	     (sts = osm_switch_get_lft_block(p_sw, block_id_ho, p_block));
+	     block_id_ho++) {
+		if (!p_sw->need_update && !p_sm->p_subn->need_update &&
+		    !memcmp(p_block,
+			    p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
+			    IB_SMP_DATA_SIZE))
+			continue;
+
+		OSM_LOG(p_sm->p_log, OSM_LOG_DEBUG,
+			"Writing FT block %u to switch 0x%" PRIx64 "\n",
+			block_id_ho,
+			cl_ntoh64(p_context->lft_context.node_guid));
+
+		status = osm_req_set(p_sm, p_path,
+				     p_sw->new_lft +
+				     block_id_ho * IB_SMP_DATA_SIZE,
+				     IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL,
+				     cl_hton32(block_id_ho),
+				     CL_DISP_MSGID_NONE, p_context);
+
+		if (status != IB_SUCCESS)
+			OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR 3A05: "
+				"Sending linear fwd. tbl. block failed (%s)\n",
+				ib_get_err_str(status));
+		break;
+	}
+
+	OSM_LOG_EXIT(p_sm->p_log);
+	return sts;
+}
+
+static boolean_t pipeline_next_lft_block(IN osm_switch_t *p_sw,
+					 IN osm_ucast_mgr_t *p_mgr,
+					 IN uint16_t block_id_ho)
+{
+	osm_dr_path_t *p_path;
+	osm_madw_context_t context;
+	uint8_t block[IB_SMP_DATA_SIZE];
+	boolean_t status;
+
+	OSM_LOG_ENTER(p_mgr->p_log);
+
+	CL_ASSERT(p_sw && p_sw->p_node);
+
+	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
+		"Processing switch 0x%" PRIx64 "\n",
+		cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
+
+	/*
+	   Send linear forwarding table blocks to the switch
+	   as long as the switch indicates it has blocks needing
+	   configuration.
+	 */
+	if (!p_sw->new_lft) {
+		/* any routing should provide the new_lft */
+		CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
+			  p_mgr->cache_valid && !p_sw->need_update);
+		status = FALSE;
+		goto Exit;
+	}
+
+	p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_sw->p_node, 0));
+
+	context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node);
+	context.lft_context.set_method = TRUE;
+
+	status = set_next_lft_block(p_sw, p_mgr->sm, &block[0], p_path,
+				    block_id_ho, &context);
+
+Exit:
+	OSM_LOG_EXIT(p_mgr->p_log);
+	return status;
+}
+
 /**********************************************************************
  **********************************************************************/
 static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item,
@@ -731,7 +781,6 @@ static int ucast_mgr_setup_all_switches(osm_subn_t * p_subn)
 
 /**********************************************************************
  **********************************************************************/
-
 static int add_guid_to_order_list(void *ctx, uint64_t guid, char *p)
 {
 	osm_ucast_mgr_t *m = ctx;
@@ -870,6 +919,30 @@ static void sort_ports_by_switch_load(osm_ucast_mgr_t * m)
 		add_sw_endports_to_order_list(s[i], m);
 }
 
+static void ucast_mgr_pipeline_fwd_tbl(osm_ucast_mgr_t * p_mgr)
+{
+	cl_qmap_t *p_sw_tbl;
+	osm_switch_t *p_sw;
+	uint16_t block_id_ho = 0;
+	int sws_notdone;
+	boolean_t sts;
+
+	p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
+	while (1) {
+		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
+		sws_notdone = 0;		
+		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
+			sts = pipeline_next_lft_block(p_sw, p_mgr, block_id_ho);
+			if (sts)
+				sws_notdone++;
+			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
+		}
+		if (!sws_notdone)
+			break;
+		block_id_ho++;
+	}
+}
+
 static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
 {
 	cl_qlist_init(&p_mgr->port_order_list);
@@ -904,6 +977,8 @@ static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
 	cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl,
 			   p_mgr);
 
+	ucast_mgr_pipeline_fwd_tbl(p_mgr);
+
 	cl_qlist_remove_all(&p_mgr->port_order_list);
 
 	return 0;
@@ -911,6 +986,16 @@ static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
 
 /**********************************************************************
  **********************************************************************/
+void osm_ucast_mgr_set_fwd_table(osm_ucast_mgr_t * p_mgr)
+{
+	cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl,
+			   ucast_mgr_process_top, p_mgr);
+
+	ucast_mgr_pipeline_fwd_tbl(p_mgr);
+}
+
+/**********************************************************************
+ **********************************************************************/
 static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm)
 {
 	int ret;
@@ -940,6 +1025,9 @@ static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm)
 
 	osm->routing_engine_used = osm_routing_engine_type(r->name);
 
+	if (r->ucast_build_fwd_tables)
+		osm_ucast_mgr_set_fwd_table(&osm->sm.ucast_mgr);
+
 	return 0;
 }
 



More information about the general mailing list