[ofa-general] Re: [PATCH] opensm: Parallelize (Stripe) MFT sets across switches

Sasha Khapyorsky sashak at voltaire.com
Sun Aug 30 04:26:24 PDT 2009


Hi Hal,

On 12:41 Fri 07 Aug     , Hal Rosenstock wrote:
> 
> Similar to previous patch to "Parallelize (Stripe) LFT sets across switches".
> Currently, MADs are pipelined to a single switch first which effectively
> serializes these requests. This patch pipelines the MFT set MADs across
> switches first (before cycling to the next MFT block) so that multiple
> switches can be responding concurrently. Speedup is dependent on number
> of MFT blocks in use (number of MLIDs) which is dependent on the number
> of multicast groups.
> 
> Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
> ---
> diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> index 7ce28c5..e281842 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   *
>   * This software is available to you under a choice of one of two
> @@ -103,6 +103,8 @@ typedef struct osm_switch {
>  	uint8_t *lft;
>  	uint8_t *new_lft;
>  	osm_mcast_tbl_t mcast_tbl;
> +	uint32_t mft_block_num;
> +	uint32_t mft_position;
>  	unsigned endport_links;
>  	unsigned need_update;
>  	void *priv;
> diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> index 4dbbaa0..f91c6b6 100644
> --- a/opensm/opensm/osm_mcast_mgr.c
> +++ b/opensm/opensm/osm_mcast_mgr.c
> @@ -1,6 +1,6 @@
>  /*
>   * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
>   * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
>   * Copyright (c) 2008 Xsigo Systems Inc.  All rights reserved.
>   *
> @@ -325,15 +325,12 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
>  {
>  	osm_node_t *p_node;
>  	osm_dr_path_t *p_path;
> -	osm_madw_context_t mad_context;
> +	osm_madw_context_t context;
>  	ib_api_status_t status;
> -	uint32_t block_id_ho = 0;
> -	int16_t block_num = 0;
> -	uint32_t position = 0;
> -	uint32_t max_position;
> +	uint32_t block_id_ho;
>  	osm_mcast_tbl_t *p_tbl;
>  	ib_net16_t block[IB_MCAST_BLOCK_SIZE];
> -	int ret = 0;
> +	int ret = -1;
>  
>  	CL_ASSERT(sm);
>  
> @@ -353,36 +350,34 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
>  	   configuration.
>  	 */
>  
> -	mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node);
> -	mad_context.mft_context.set_method = TRUE;
> +	context.mft_context.node_guid = osm_node_get_node_guid(p_node);
> +	context.mft_context.set_method = TRUE;
>  
>  	p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> -	max_position = p_tbl->max_position;
>  
> -	while (osm_mcast_tbl_get_block(p_tbl, block_num,
> -				       (uint8_t) position, block)) {
> -		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> -			"Writing MFT block 0x%X\n", block_id_ho);
> +	if (p_sw->mft_position <= p_tbl->max_position &&
> +	    osm_mcast_tbl_get_block(p_tbl, p_sw->mft_block_num,
> +				    (uint8_t) p_sw->mft_position, block)) {
> +
> +		block_id_ho = p_sw->mft_block_num + (p_sw->mft_position << 28);
>  
> -		block_id_ho = block_num + (position << 28);
> +		OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> +			"Writing MFT block %u position %u to switch 0x%" PRIx64 "\n",
> +			p_sw->mft_block_num, p_sw->mft_position,
> +			cl_ntoh64(context.lft_context.node_guid));
>  
>  		status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
>  				     IB_MAD_ATTR_MCAST_FWD_TBL,
>  				     cl_hton32(block_id_ho), CL_DISP_MSGID_NONE,
> -				     &mad_context);
> +				     &context);
>  
> -		if (status != IB_SUCCESS) {
> +		if (status != IB_SUCCESS)
>  			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
> -				"Sending multicast fwd. tbl. block failed (%s)\n",
> +				"Sending MFT block failed (%s)\n",
>  				ib_get_err_str(status));
> -			ret = -1;
> -		}
>  
> -		if (++position > max_position) {
> -			position = 0;
> -			block_num++;
> -		}
> -	}
> +	} else
> +		ret = 0;
>  
>  	OSM_LOG_EXIT(sm->p_log);
>  	return ret;
> @@ -1077,7 +1072,8 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
>  	cl_qmap_t *p_sw_tbl;
>  	cl_qlist_t *p_list = &sm->mgrp_list;
>  	osm_mgrp_t *p_mgrp;
> -	int i, ret = 0;
> +	osm_mcast_tbl_t *p_tbl;
> +	int sws_notdone, i, ret = 0;
>  
>  	OSM_LOG_ENTER(sm->p_log);
>  
> @@ -1114,11 +1110,30 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
>  	 */
>  	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
>  	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> -		if (mcast_mgr_set_tbl(sm, p_sw))
> -			ret = -1;
> +		p_sw->mft_block_num = 0;
> +		p_sw->mft_position = 0;
>  		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
>  	}
>  
> +	while (1) {
> +		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> +		sws_notdone = 0;
> +		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> +			if (mcast_mgr_set_tbl(sm, p_sw))
> +				sws_notdone++;
> +			p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> +			if (++p_sw->mft_position > p_tbl->max_position) {
> +				p_sw->mft_position = 0;
> +				p_sw->mft_block_num++;
> +			}
> +			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> +		}
> +		if (!sws_notdone) {
> +			ret = -1;
> +			break;
> +		}

So osm_mcast_mgr_process() will always return -1 value? Why?

> +	}
> +
>  	while (!cl_is_qlist_empty(p_list)) {
>  		cl_list_item_t *p = cl_qlist_remove_head(p_list);
>  		free(p);
> @@ -1142,9 +1157,10 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
>  	osm_switch_t *p_sw;
>  	cl_qmap_t *p_sw_tbl;
>  	osm_mgrp_t *p_mgrp;
> +	osm_mcast_tbl_t *p_tbl;
>  	ib_net16_t mlid;
>  	osm_mcast_mgr_ctxt_t *ctx;
> -	int ret = 0;
> +	int sws_notdone, ret = 0;
>  
>  	OSM_LOG_ENTER(sm->p_log);
>  
> @@ -1195,11 +1211,30 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
>  	p_sw_tbl = &sm->p_subn->sw_guid_tbl;
>  	p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
>  	while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> -		if (mcast_mgr_set_tbl(sm, p_sw))
> -			ret = -1;
> +		p_sw->mft_block_num = 0;
> +		p_sw->mft_position = 0;
>  		p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
>  	}
>  
> +	while (1) {
> +		p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> +		sws_notdone = 0;
> +		while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> +			if (mcast_mgr_set_tbl(sm, p_sw))
> +				sws_notdone++;
> +			p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> +			if (++p_sw->mft_position > p_tbl->max_position) {
> +				p_sw->mft_position = 0;
> +				p_sw->mft_block_num++;
> +			}
> +			p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> +		}
> +		if (!sws_notdone) {
> +			ret = -1;
> +			break;

Ditto.

> +		}
> +	}
> +

Could you consolidate this code which is equivalent with one in
osm_mcast_mgr_process() in single function say
mcast_mgr_set_mftables()?

Also similar to LFTs case it would be nice to simplify this tables setup
loop.

Sasha

>  	osm_dump_mcast_routes(sm->p_subn->p_osm);
>  
>  exit:
> 



More information about the general mailing list