[ofa-general] [PATCH] opensm: Parallelize (Stripe) MFT sets across switches

Hal Rosenstock hnrose at comcast.net
Fri Aug 7 09:41:27 PDT 2009


Similar to previous patch to "Parallelize (Stripe) LFT sets across switches".
Currently, MADs are pipelined to a single switch first which effectively
serializes these requests. This patch pipelines the MFT set MADs across
switches first (before cycling to the next MFT block) so that multiple
switches can be responding concurrently. Speedup is dependent on number
of MFT blocks in use (number of MLIDs) which is dependent on the number
of multicast groups.

Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
---
diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
index 7ce28c5..e281842 100644
--- a/opensm/include/opensm/osm_switch.h
+++ b/opensm/include/opensm/osm_switch.h
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  *
  * This software is available to you under a choice of one of two
@@ -103,6 +103,8 @@ typedef struct osm_switch {
 	uint8_t *lft;
 	uint8_t *new_lft;
 	osm_mcast_tbl_t mcast_tbl;
+	uint32_t mft_block_num;
+	uint32_t mft_position;
 	unsigned endport_links;
 	unsigned need_update;
 	void *priv;
diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
index 4dbbaa0..f91c6b6 100644
--- a/opensm/opensm/osm_mcast_mgr.c
+++ b/opensm/opensm/osm_mcast_mgr.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
- * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
+ * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
  * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
  * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
  *
@@ -325,15 +325,12 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
 {
 	osm_node_t *p_node;
 	osm_dr_path_t *p_path;
-	osm_madw_context_t mad_context;
+	osm_madw_context_t context;
 	ib_api_status_t status;
-	uint32_t block_id_ho = 0;
-	int16_t block_num = 0;
-	uint32_t position = 0;
-	uint32_t max_position;
+	uint32_t block_id_ho;
 	osm_mcast_tbl_t *p_tbl;
 	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
-	int ret = 0;
+	int ret = -1;
 
 	CL_ASSERT(sm);
 
@@ -353,36 +350,34 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
 	   configuration.
 	 */
 
-	mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node);
-	mad_context.mft_context.set_method = TRUE;
+	context.mft_context.node_guid = osm_node_get_node_guid(p_node);
+	context.mft_context.set_method = TRUE;
 
 	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
-	max_position = p_tbl->max_position;
 
-	while (osm_mcast_tbl_get_block(p_tbl, block_num,
-				       (uint8_t) position, block)) {
-		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
-			"Writing MFT block 0x%X\n", block_id_ho);
+	if (p_sw->mft_position <= p_tbl->max_position &&
+	    osm_mcast_tbl_get_block(p_tbl, p_sw->mft_block_num,
+				    (uint8_t) p_sw->mft_position, block)) {
+
+		block_id_ho = p_sw->mft_block_num + (p_sw->mft_position << 28);
 
-		block_id_ho = block_num + (position << 28);
+		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
+			"Writing MFT block %u position %u to switch 0x%" PRIx64 "\n",
+			p_sw->mft_block_num, p_sw->mft_position,
+			cl_ntoh64(context.lft_context.node_guid));
 
 		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
 				     IB_MAD_ATTR_MCAST_FWD_TBL,
 				     cl_hton32(block_id_ho), CL_DISP_MSGID_NONE,
-				     &mad_context);
+				     &context);
 
-		if (status != IB_SUCCESS) {
+		if (status != IB_SUCCESS)
 			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
-				"Sending multicast fwd. tbl. block failed (%s)\n",
+				"Sending MFT block failed (%s)\n",
 				ib_get_err_str(status));
-			ret = -1;
-		}
 
-		if (++position > max_position) {
-			position = 0;
-			block_num++;
-		}
-	}
+	} else
+		ret = 0;
 
 	OSM_LOG_EXIT(sm->p_log);
 	return ret;
@@ -1077,7 +1072,8 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
 	cl_qmap_t *p_sw_tbl;
 	cl_qlist_t *p_list = &sm->mgrp_list;
 	osm_mgrp_t *p_mgrp;
-	int i, ret = 0;
+	osm_mcast_tbl_t *p_tbl;
+	int sws_notdone, i, ret = 0;
 
 	OSM_LOG_ENTER(sm->p_log);
 
@@ -1114,11 +1110,30 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
 	 */
 	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
 	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
-		if (mcast_mgr_set_tbl(sm, p_sw))
-			ret = -1;
+		p_sw->mft_block_num = 0;
+		p_sw->mft_position = 0;
 		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
 	}
 
+	while (1) {
+		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
+		sws_notdone = 0;
+		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
+			if (mcast_mgr_set_tbl(sm, p_sw))
+				sws_notdone++;
+			p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
+			if (++p_sw->mft_position > p_tbl->max_position) {
+				p_sw->mft_position = 0;
+				p_sw->mft_block_num++;
+			}
+			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
+		}
+		if (!sws_notdone) {
+			ret = -1;
+			break;
+		}
+	}
+
 	while (!cl_is_qlist_empty(p_list)) {
 		cl_list_item_t *p = cl_qlist_remove_head(p_list);
 		free(p);
@@ -1142,9 +1157,10 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
 	osm_switch_t *p_sw;
 	cl_qmap_t *p_sw_tbl;
 	osm_mgrp_t *p_mgrp;
+	osm_mcast_tbl_t *p_tbl;
 	ib_net16_t mlid;
 	osm_mcast_mgr_ctxt_t *ctx;
-	int ret = 0;
+	int sws_notdone, ret = 0;
 
 	OSM_LOG_ENTER(sm->p_log);
 
@@ -1195,11 +1211,30 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
 	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
 	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
 	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
-		if (mcast_mgr_set_tbl(sm, p_sw))
-			ret = -1;
+		p_sw->mft_block_num = 0;
+		p_sw->mft_position = 0;
 		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
 	}
 
+	while (1) {
+		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
+		sws_notdone = 0;
+		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
+			if (mcast_mgr_set_tbl(sm, p_sw))
+				sws_notdone++;
+			p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
+			if (++p_sw->mft_position > p_tbl->max_position) {
+				p_sw->mft_position = 0;
+				p_sw->mft_block_num++;
+			}
+			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
+		}
+		if (!sws_notdone) {
+			ret = -1;
+			break;
+		}
+	}
+
 	osm_dump_mcast_routes(sm->p_subn->p_osm);
 
 exit:



More information about the general mailing list