[ofa-general] [PATCHv2] opensm: Parallelize (Stripe) LFT sets across switches

Hal Rosenstock hnrose at comcast.net
Thu Jul 30 16:28:48 PDT 2009


Currently, MADs are pipelined to a single switch at a time which
effectively serializes these requests due to processing at the SMA.
This patch pipelines (stripes) them across the switches first before
proceeding with successive blocks. As a result of this striping,
multiple switches can process the set and respond concurrently
which results in an improvement to the subnet initialization time.

This patch also introduces a new config option (max_smps_per_node)
which indicates how deep the per node pipeline is (current default is 4).
This also has the effect of limiting the number of times that the switch
list is traversed. Maybe this embellishment is unnecessary.

All unicast routing protocols are updated for this, with the exception
of the file-based one (osm_ucast_file.c).

A similar subsequent change will do this for MFTs.

Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il> wrote:

On a small cluster of 17 IS4 switches and 11 HCAs, an LMC of 7
(including EnhancedSwitchPort 0 LMC) was used to artificially
increase the cluster size.

With the new code, LFT configuration is more than twice as
fast as with the old code :)
The current ucast manager ran for ~250 msec on average; with the
new code it ran for 110-120 msec.

Routing calculation phase of the ucast manager took ~1200 usec,
the rest was sending the blocks and waiting for no more pending
transactions.

No noticeable difference between various max_smps_per_node values
was observed.

Here are some detailed results of different executions (the
number on the left is timer value in usec):

Current ucast manager (w/o the optimization):

000000 [LFT]: osm_ucast_mgr_process() - START
001131 [LFT]: ucast_mgr_process_tbl() - START
032251 [LFT]: ucast_mgr_process_tbl() - END
032263 [LFT]: osm_ucast_mgr_process() - END
253416 [LFT]: Done wait_for_pending_transactions()

New code, max_smps_per_node=0:

001417 [LFT]: osm_ucast_mgr_process() - START (0 max_smps_per_node)
002690 [LFT]: ucast_mgr_process_tbl() - START
032946 [LFT]: ucast_mgr_process_tbl() - END
032948 [LFT]: osm_ucast_pipeline_tbl() - START
033846 [LFT]: osm_ucast_pipeline_tbl() - END
033858 [LFT]: osm_ucast_mgr_process() - END
108203 [LFT]: Done wait_for_pending_transactions()

New code, max_smps_per_node=1:

007474 [LFT]: osm_ucast_mgr_process() - START (1 max_smps_per_node)
008735 [LFT]: ucast_mgr_process_tbl() - START
040071 [LFT]: ucast_mgr_process_tbl() - END
040074 [LFT]: osm_ucast_pipeline_tbl() - START
040103 [LFT]: osm_ucast_pipeline_tbl() - END
040114 [LFT]: osm_ucast_mgr_process() - END
120097 [LFT]: Done wait_for_pending_transactions()

New code, max_smps_per_node=4:

004137 [LFT]: osm_ucast_mgr_process() - START (4 max_smps_per_node)
005380 [LFT]: ucast_mgr_process_tbl() - START
037436 [LFT]: ucast_mgr_process_tbl() - END
037439 [LFT]: osm_ucast_pipeline_tbl() - START
037495 [LFT]: osm_ucast_pipeline_tbl() - END
037506 [LFT]: osm_ucast_mgr_process() - END
114983 [LFT]: Done wait_for_pending_transactions()


With IS3-based Qlogic switches, which do not handle DR packet forwarding
in HW, on a fabric of ~1100 HCAs and ~280 switches:

Current OSM configures LFTs in ~2 seconds.
New algorithm does the same job in 1.4-1.6 seconds (30%-20% speed up),
depending on the max_smps_per_node value.

As in case of IS4 switches, the shortest config time was obtained with
max_smps_per_node=0, which is unlimited pipeline.


Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
---
Changes since v1:
Added Yevgeny's performance data to patch description above
No change to actual patch

diff --git a/opensm/include/opensm/osm_base.h b/opensm/include/opensm/osm_base.h
index 0537002..617e8a9 100644
--- a/opensm/include/opensm/osm_base.h
+++ b/opensm/include/opensm/osm_base.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
  *
@@ -449,6 +449,18 @@ BEGIN_C_DECLS
 */
 #define OSM_DEFAULT_SMP_MAX_ON_WIRE 4
 /***********/
+/****d* OpenSM: Base/OSM_DEFAULT_SMP_MAX_PER_NODE
+* NAME
+*	OSM_DEFAULT_SMP_MAX_PER_NODE
+*
+* DESCRIPTION
+*	Specifies the default number of VL15 SMP MADs allowed
+*	per node for certain attributes.
+*
+* SYNOPSIS
+*/
+#define OSM_DEFAULT_SMP_MAX_PER_NODE 4
+/***********/
 /****d* OpenSM: Base/OSM_SM_DEFAULT_QP0_RCV_SIZE
 * NAME
 *	OSM_SM_DEFAULT_QP0_RCV_SIZE
diff --git a/opensm/include/opensm/osm_sm.h b/opensm/include/opensm/osm_sm.h
index cc8321d..1776380 100644
--- a/opensm/include/opensm/osm_sm.h
+++ b/opensm/include/opensm/osm_sm.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -130,6 +130,7 @@ typedef struct osm_sm {
 	osm_sm_mad_ctrl_t mad_ctrl;
 	osm_lid_mgr_t lid_mgr;
 	osm_ucast_mgr_t ucast_mgr;
+	boolean_t lfts_updated;
 	cl_disp_reg_handle_t sweep_fail_disp_h;
 	cl_disp_reg_handle_t ni_disp_h;
 	cl_disp_reg_handle_t pi_disp_h;
@@ -524,6 +525,45 @@ osm_resp_send(IN osm_sm_t * sm,
 *
 *********/
 
+/****f* OpenSM: SM/osm_sm_set_next_lft_block
+* NAME
+*	osm_sm_set_next_lft_block
+*
+* DESCRIPTION
+*	Set the next LFT (LinearForwardingTable) block in the indicated switch.
+*
+* SYNOPSIS
+*/
+void
+osm_sm_set_next_lft_block(IN osm_sm_t *p_sm, IN osm_switch_t *p_sw,
+			  IN uint8_t *p_block, IN osm_dr_path_t *p_path,
+			  IN osm_madw_context_t *p_context);
+/*
+* PARAMETERS
+*	p_sm
+*		[in] Pointer to an osm_sm_t object.
+*
+*	p_sw
+*		[in] Pointer to the switch object.
+*
+*	p_block
+*		[in] Pointer to the forwarding table block.
+*
+*	p_path
+*		[in] Pointer to a directed route path object.
+*
+*	p_context
+*		[in] Mad wrapper context structure to be copied into the wrapper
+*		context, and thus visible to the recipient of the response.
+*
+* RETURN VALUES
+*	None
+*
+* NOTES
+*
+* SEE ALSO
+*********/
+
 /****f* OpenSM: SM/osm_sm_mcgrp_join
 * NAME
 *	osm_sm_mcgrp_join
diff --git a/opensm/include/opensm/osm_subnet.h b/opensm/include/opensm/osm_subnet.h
index 59a32ad..f12afae 100644
--- a/opensm/include/opensm/osm_subnet.h
+++ b/opensm/include/opensm/osm_subnet.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
  *
@@ -147,6 +147,7 @@ typedef struct osm_subn_opt {
 	uint32_t sweep_interval;
 	uint32_t max_wire_smps;
 	uint32_t transaction_timeout;
+	uint32_t max_smps_per_node;
 	uint8_t sm_priority;
 	uint8_t lmc;
 	boolean_t lmc_esp0;
diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index 7ce28c5..e12113f 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -102,6 +102,7 @@ typedef struct osm_switch {
 	osm_port_profile_t *p_prof;
 	uint8_t *lft;
 	uint8_t *new_lft;
+	uint16_t lft_block_id_ho;
 	osm_mcast_tbl_t mcast_tbl;
 	unsigned endport_links;
 	unsigned need_update;
diff --git a/opensm/include/opensm/osm_ucast_mgr.h b/opensm/include/opensm/osm_ucast_mgr.h
index a040476..fdea49a 100644
--- a/opensm/include/opensm/osm_ucast_mgr.h
+++ b/opensm/include/opensm/osm_ucast_mgr.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -233,17 +233,42 @@ osm_ucast_mgr_init(IN osm_ucast_mgr_t * const p_mgr, IN struct osm_sm * sm);
 *	osm_ucast_mgr_destroy
 *********/
 
-/****f* OpenSM: Unicast Manager/osm_ucast_mgr_set_fwd_table
+/****f* OpenSM: Unicast Manager/osm_ucast_pipeline_tbl
 * NAME
-*	osm_ucast_mgr_set_fwd_table
+*	osm_ucast_pipeline_tbl
 *
 * DESCRIPTION
-*	Setup forwarding table for the switch (from prepared new_lft).
+*	The osm_ucast_pipeline_tbl function pipelines the LFT
+*	(LinearForwardingTable) sets across the switches
+*	(from prepared new_lft).
 *
 * SYNOPSIS
 */
-int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * const p_mgr,
-				IN osm_switch_t * const p_sw);
+void osm_ucast_pipeline_tbl(IN osm_ucast_mgr_t * p_mgr);
+/*
+* PARAMETERS
+*	p_mgr
+*		[in] Pointer to an osm_ucast_mgr_t object.
+*
+* RETURN VALUES
+*	None.
+*
+* NOTES
+*
+* SEE ALSO
+*********/
+
+/****f* OpenSM: Unicast Manager/osm_ucast_mgr_set_fwd_tbl_top
+* NAME
+*	osm_ucast_mgr_set_fwd_tbl_top
+*
+* DESCRIPTION
+*	Setup LinearFDBTop for the switch.
+*
+* SYNOPSIS
+*/
+int osm_ucast_mgr_set_fwd_tbl_top(IN osm_ucast_mgr_t * const p_mgr,
+				  IN osm_switch_t * const p_sw);
 /*
 * PARAMETERS
 *	p_mgr
diff --git a/opensm/opensm/osm_lin_fwd_rcv.c b/opensm/opensm/osm_lin_fwd_rcv.c
index 2edb8d3..cb131b4 100644
--- a/opensm/opensm/osm_lin_fwd_rcv.c
+++ b/opensm/opensm/osm_lin_fwd_rcv.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -36,7 +36,7 @@
 /*
  * Abstract:
  *    Implementation of osm_lft_rcv_t.
- * This object represents the NodeDescription Receiver object.
+ * This object represents the Linear Forwarding Table Receiver object.
  * This object is part of the opensm family of objects.
  */
 
@@ -55,6 +55,7 @@ void osm_lft_rcv_process(IN void *context, IN void *data)
 {
 	osm_sm_t *sm = context;
 	osm_madw_t *p_madw = data;
+	osm_dr_path_t *p_path;
 	ib_smp_t *p_smp;
 	uint32_t block_num;
 	osm_switch_t *p_sw;
@@ -62,6 +63,8 @@ void osm_lft_rcv_process(IN void *context, IN void *data)
 	uint8_t *p_block;
 	ib_net64_t node_guid;
 	ib_api_status_t status;
+	uint8_t block[IB_SMP_DATA_SIZE];
+	osm_madw_context_t mad_context;
 
 	CL_ASSERT(sm);
 
@@ -94,6 +97,16 @@ void osm_lft_rcv_process(IN void *context, IN void *data)
 				"\n\t\t\t\tSwitch 0x%" PRIx64 "\n",
 				ib_get_err_str(status), cl_ntoh64(node_guid));
 		}
+
+		p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_sw->p_node, 0));
+
+		mad_context.lft_context.node_guid = node_guid;
+		mad_context.lft_context.set_method = TRUE;
+
+		osm_sm_set_next_lft_block(sm, p_sw, &block[0], p_path,
+					  &mad_context);
+
+		p_sw->lft_block_id_ho++;
 	}
 
 	CL_PLOCK_RELEASE(sm->p_lock);
diff --git a/opensm/opensm/osm_sm.c b/opensm/opensm/osm_sm.c
index daa60ff..4e0fd2a 100644
--- a/opensm/opensm/osm_sm.c
+++ b/opensm/opensm/osm_sm.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2005 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
  *
@@ -441,6 +441,45 @@ Exit:
 
 /**********************************************************************
  **********************************************************************/
+void osm_sm_set_next_lft_block(IN osm_sm_t *p_sm, IN osm_switch_t *p_sw,
+			       IN uint8_t *p_block, IN osm_dr_path_t *p_path,
+			       IN osm_madw_context_t *context)
+{
+	ib_api_status_t status;
+
+	for (;
+	     osm_switch_get_lft_block(p_sw, p_sw->lft_block_id_ho, p_block);
+	     p_sw->lft_block_id_ho++) {
+		if (!p_sw->need_update && !p_sm->p_subn->need_update &&
+		    !memcmp(p_block,
+			    p_sw->new_lft + p_sw->lft_block_id_ho * IB_SMP_DATA_SIZE,
+			    IB_SMP_DATA_SIZE))
+			continue;
+
+		p_sm->lfts_updated = 1;
+
+		OSM_LOG(p_sm->p_log, OSM_LOG_DEBUG,
+			"Writing FT block %u to switch 0x%" PRIx64 "\n",
+			p_sw->lft_block_id_ho,
+			cl_ntoh64(context->lft_context.node_guid));
+
+		status = osm_req_set(p_sm, p_path,
+				     p_sw->new_lft +
+				     p_sw->lft_block_id_ho * IB_SMP_DATA_SIZE,
+				     IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL,
+				     cl_hton32(p_sw->lft_block_id_ho),
+				     CL_DISP_MSGID_NONE, context);
+
+		if (status != IB_SUCCESS)
+			OSM_LOG(p_sm->p_log, OSM_LOG_ERROR, "ERR 2E11: "
+				"Sending linear fwd. tbl. block failed (%s)\n",
+				ib_get_err_str(status));
+		break;
+	}
+}
+
+/**********************************************************************
+ **********************************************************************/
 static ib_api_status_t sm_mgrp_process(IN osm_sm_t * p_sm,
 				       IN osm_mgrp_t * p_mgrp)
 {
diff --git a/opensm/opensm/osm_subnet.c b/opensm/opensm/osm_subnet.c
index ec15f8a..1964b7f 100644
--- a/opensm/opensm/osm_subnet.c
+++ b/opensm/opensm/osm_subnet.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
  *
@@ -295,6 +295,7 @@ static const opt_rec_t opt_tbl[] = {
 	{ "m_key_lease_period", OPT_OFFSET(m_key_lease_period), opts_parse_net16, NULL, 1 },
 	{ "sweep_interval", OPT_OFFSET(sweep_interval), opts_parse_uint32, NULL, 1 },
 	{ "max_wire_smps", OPT_OFFSET(max_wire_smps), opts_parse_uint32, NULL, 1 },
+	{ "max_smps_per_node", OPT_OFFSET(max_smps_per_node), opts_parse_uint32, NULL, 1 },
 	{ "console", OPT_OFFSET(console), opts_parse_charp, NULL, 0 },
 	{ "console_port", OPT_OFFSET(console_port), opts_parse_uint16, NULL, 0 },
 	{ "transaction_timeout", OPT_OFFSET(transaction_timeout), opts_parse_uint32, NULL, 1 },
@@ -671,6 +672,7 @@ void osm_subn_set_default_opt(IN osm_subn_opt_t * const p_opt)
 	p_opt->m_key_lease_period = 0;
 	p_opt->sweep_interval = OSM_DEFAULT_SWEEP_INTERVAL_SECS;
 	p_opt->max_wire_smps = OSM_DEFAULT_SMP_MAX_ON_WIRE;
+	p_opt->max_smps_per_node = OSM_DEFAULT_SMP_MAX_PER_NODE;
 	p_opt->console = strdup(OSM_DEFAULT_CONSOLE);
 	p_opt->console_port = OSM_DEFAULT_CONSOLE_PORT;
 	p_opt->transaction_timeout = OSM_DEFAULT_TRANS_TIMEOUT_MILLISEC;
@@ -1461,6 +1463,10 @@ int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t *const p_opts)
 		"max_wire_smps %u\n\n"
 		"# The maximum time in [msec] allowed for a transaction to complete\n"
 		"transaction_timeout %u\n\n"
+		"# Maximum number of SMPs per node sent in parallel\n"
+		"# (0 means unlimited)\n"
+		"# Only applies to certain attributes\n"
+		"max_smps_per_node %u\n\n"
 		"# Maximal time in [msec] a message can stay in the incoming message queue.\n"
 		"# If there is more than one message in the queue and the last message\n"
 		"# stayed in the queue more than this value, any SA request will be\n"
@@ -1470,6 +1476,7 @@ int osm_subn_output_conf(FILE *out, IN osm_subn_opt_t *const p_opts)
 		"single_thread %s\n\n",
 		p_opts->max_wire_smps,
 		p_opts->transaction_timeout,
+		p_opts->max_smps_per_node,
 		p_opts->max_msg_fifo_timeout,
 		p_opts->single_thread ? "TRUE" : "FALSE");
 
diff --git a/opensm/opensm/osm_ucast_cache.c b/opensm/opensm/osm_ucast_cache.c
index 216b496..31c930b 100644
--- a/opensm/opensm/osm_ucast_cache.c
+++ b/opensm/opensm/osm_ucast_cache.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008      Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2008,2009 Mellanox Technologies LTD. All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
@@ -1085,9 +1085,11 @@ int osm_ucast_cache_process(osm_ucast_mgr_t * p_mgr)
 			memset(p_sw->lft, OSM_NO_PATH, IB_LID_UCAST_END_HO + 1);
 		}
 
-		osm_ucast_mgr_set_fwd_table(p_mgr, p_sw);
+		osm_ucast_mgr_set_fwd_tbl_top(p_mgr, p_sw);
 	}
 
+	osm_ucast_pipeline_tbl(p_mgr);
+
 	return 0;
 }
 
diff --git a/opensm/opensm/osm_ucast_file.c b/opensm/opensm/osm_ucast_file.c
index 2505c46..099e8ba 100644
--- a/opensm/opensm/osm_ucast_file.c
+++ b/opensm/opensm/osm_ucast_file.c
@@ -168,8 +168,8 @@ static int do_ucast_file_load(void *context)
 				"routing algorithm\n");
 		} else if (!strncmp(p, "Unicast lids", 12)) {
 			if (p_sw)
-				osm_ucast_mgr_set_fwd_table(&p_osm->sm.
-							    ucast_mgr, p_sw);
+				osm_ucast_mgr_set_fwd_tbl_top(&p_osm->sm.
+							      ucast_mgr, p_sw);
 			q = strstr(p, " guid 0x");
 			if (!q) {
 				OSM_LOG(&p_osm->log, OSM_LOG_ERROR,
@@ -247,7 +247,7 @@ static int do_ucast_file_load(void *context)
 	}
 
 	if (p_sw)
-		osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw);
+		osm_ucast_mgr_set_fwd_tbl_top(&p_osm->sm.ucast_mgr, p_sw);
 
 	fclose(file);
 	return 0;
diff --git a/opensm/opensm/osm_ucast_ftree.c b/opensm/opensm/osm_ucast_ftree.c
index bde6dbd..d65c685 100644
--- a/opensm/opensm/osm_ucast_ftree.c
+++ b/opensm/opensm/osm_ucast_ftree.c
@@ -2,7 +2,7 @@
  * Copyright (c) 2009 Simula Research Laboratory. All rights reserved.
  * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved.
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2007 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -1905,8 +1905,8 @@ static void set_sw_fwd_table(IN cl_map_item_t * const p_map_item,
 	ftree_fabric_t *p_ftree = (ftree_fabric_t *) context;
 
 	p_sw->p_osm_sw->max_lid_ho = p_ftree->lft_max_lid;
-	osm_ucast_mgr_set_fwd_table(&p_ftree->p_osm->sm.ucast_mgr,
-				    p_sw->p_osm_sw);
+	osm_ucast_mgr_set_fwd_tbl_top(&p_ftree->p_osm->sm.ucast_mgr,
+				      p_sw->p_osm_sw);
 }
 
 /***************************************************
@@ -4005,6 +4005,8 @@ static int do_routing(IN void *context)
 	/* for each switch, set its fwd table */
 	cl_qmap_apply_func(&p_ftree->sw_tbl, set_sw_fwd_table, (void *)p_ftree);
 
+	osm_ucast_pipeline_tbl(&p_ftree->p_osm->sm.ucast_mgr);
+
 	/* write out hca ordering file */
 	fabric_dump_hca_ordering(p_ftree);
 
diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c
index 12b5e34..adf5f6c 100644
--- a/opensm/opensm/osm_ucast_lash.c
+++ b/opensm/opensm/osm_ucast_lash.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2007      Simula Research Laboratory. All rights reserved.
  * Copyright (c) 2007      Silicon Graphics Inc. All rights reserved.
@@ -1045,8 +1045,11 @@ static void populate_fwd_tbls(lash_t * p_lash)
 					physical_egress_port);
 			}
 		}		/* for */
-		osm_ucast_mgr_set_fwd_table(&p_osm->sm.ucast_mgr, p_sw);
+		osm_ucast_mgr_set_fwd_tbl_top(&p_osm->sm.ucast_mgr, p_sw);
 	}
+
+	osm_ucast_pipeline_tbl(&p_osm->sm.ucast_mgr);
+
 	OSM_LOG_EXIT(p_log);
 }
 
diff --git a/opensm/opensm/osm_ucast_mgr.c b/opensm/opensm/osm_ucast_mgr.c
index 78a7031..86d1c98 100644
--- a/opensm/opensm/osm_ucast_mgr.c
+++ b/opensm/opensm/osm_ucast_mgr.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -315,16 +315,14 @@ Exit:
 
 /**********************************************************************
  **********************************************************************/
-int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr,
-				IN osm_switch_t * p_sw)
+int osm_ucast_mgr_set_fwd_tbl_top(IN osm_ucast_mgr_t * p_mgr,
+				  IN osm_switch_t * p_sw)
 {
 	osm_node_t *p_node;
 	osm_dr_path_t *p_path;
 	osm_madw_context_t context;
 	ib_api_status_t status;
 	ib_switch_info_t si;
-	uint16_t block_id_ho = 0;
-	uint8_t block[IB_SMP_DATA_SIZE];
 	boolean_t set_swinfo_require = FALSE;
 	uint16_t lin_top;
 	uint8_t life_state;
@@ -382,48 +380,8 @@ int osm_ucast_mgr_set_fwd_table(IN osm_ucast_mgr_t * p_mgr,
 				ib_get_err_str(status));
 	}
 
-	/*
-	   Send linear forwarding table blocks to the switch
-	   as long as the switch indicates it has blocks needing
-	   configuration.
-	 */
-
-	context.lft_context.node_guid = osm_node_get_node_guid(p_node);
-	context.lft_context.set_method = TRUE;
-
-	if (!p_sw->new_lft) {
-		/* any routing should provide the new_lft */
-		CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
-			  p_mgr->cache_valid && !p_sw->need_update);
-		goto Exit;
-	}
-
-	for (block_id_ho = 0;
-	     osm_switch_get_lft_block(p_sw, block_id_ho, block);
-	     block_id_ho++) {
-		if (!p_sw->need_update && !p_mgr->p_subn->need_update &&
-		    !memcmp(block,
-			    p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
-			    IB_SMP_DATA_SIZE))
-			continue;
-
-		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
-			"Writing FT block %u\n", block_id_ho);
-
-		status = osm_req_set(p_mgr->sm, p_path,
-				     p_sw->new_lft +
-				     block_id_ho * IB_SMP_DATA_SIZE,
-				     sizeof(block), IB_MAD_ATTR_LIN_FWD_TBL,
-				     cl_hton32(block_id_ho), CL_DISP_MSGID_NONE,
-				     &context);
+	p_sw->lft_block_id_ho = 0;
 
-		if (status != IB_SUCCESS)
-			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: "
-				"Sending linear fwd. tbl. block failed (%s)\n",
-				ib_get_err_str(status));
-	}
-
-Exit:
 	OSM_LOG_EXIT(p_mgr->p_log);
 	return 0;
 }
@@ -508,7 +466,7 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
 		}
 	}
 
-	osm_ucast_mgr_set_fwd_table(p_mgr, p_sw);
+	osm_ucast_mgr_set_fwd_tbl_top(p_mgr, p_sw);
 
 	if (p_mgr->p_subn->opt.lmc)
 		free_ports_priv(p_mgr);
@@ -516,6 +474,47 @@ static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
 	OSM_LOG_EXIT(p_mgr->p_log);
 }
 
+static void ucast_mgr_pipeline_tbl(IN osm_switch_t *p_sw,
+				   IN osm_ucast_mgr_t *p_mgr)
+{
+	osm_dr_path_t *p_path;
+	osm_madw_context_t mad_context;
+	uint8_t block[IB_SMP_DATA_SIZE];
+
+	OSM_LOG_ENTER(p_mgr->p_log);
+
+	CL_ASSERT(p_sw && p_sw->p_node);
+
+	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
+		"Processing switch 0x%" PRIx64 "\n",
+		cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
+
+	/*
+	   Send linear forwarding table blocks to the switch
+	   as long as the switch indicates it has blocks needing
+	   configuration.
+	 */
+	if (!p_sw->new_lft) {
+		/* any routing should provide the new_lft */
+		CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
+			  p_mgr->cache_valid && !p_sw->need_update);
+		goto Exit;
+	}
+
+	p_path = osm_physp_get_dr_path_ptr(osm_node_get_physp_ptr(p_sw->p_node, 0));
+
+	mad_context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node);
+	mad_context.lft_context.set_method = TRUE;
+
+	osm_sm_set_next_lft_block(p_mgr->sm, p_sw, &block[0], p_path,
+				  &mad_context);
+
+	p_sw->lft_block_id_ho++;
+
+Exit:
+	OSM_LOG_EXIT(p_mgr->p_log);
+}
+
 /**********************************************************************
  **********************************************************************/
 static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item,
@@ -870,6 +869,28 @@ static void sort_ports_by_switch_load(osm_ucast_mgr_t * m)
 		add_sw_endports_to_order_list(s[i], m);
 }
 
+void osm_ucast_pipeline_tbl(osm_ucast_mgr_t * p_mgr)
+{
+	cl_qmap_t *p_sw_tbl;
+	osm_switch_t *p_sw;
+	int i;
+
+	for (i = 0;
+	     !p_mgr->p_subn->opt.max_smps_per_node ||
+	     i < p_mgr->p_subn->opt.max_smps_per_node;
+	     i++) {
+		p_mgr->sm->lfts_updated = 0;
+		p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
+		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
+		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
+			ucast_mgr_pipeline_tbl(p_sw, p_mgr);
+			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
+		}
+		if (!p_mgr->sm->lfts_updated)
+			break;
+	}
+}
+
 static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
 {
 	cl_qlist_init(&p_mgr->port_order_list);
@@ -904,6 +925,8 @@ static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
 	cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl,
 			   p_mgr);
 
+	osm_ucast_pipeline_tbl(p_mgr);
+
 	cl_qlist_remove_all(&p_mgr->port_order_list);
 
 	return 0;



More information about the general mailing list