[ofa-general] Re: [PATCH] opensm: Parallelize (Stripe) MFT sets across switches
Sasha Khapyorsky
sashak at voltaire.com
Sun Aug 30 04:26:24 PDT 2009
Hi Hal,
On 12:41 Fri 07 Aug , Hal Rosenstock wrote:
>
> Similar to previous patch to "Parallelize (Stripe) LFT sets across switches".
> Currently, MADs are pipelined to a single switch first which effectively
> serializes these requests. This patch pipelines the MFT set MADs across
> switches first (before cycling to the next MFT block) so that multiple
> switches can be responding concurrently. Speedup is dependent on number
> of MFT blocks in use (number of MLIDs) which is dependent on the number
> of multicast groups.
>
> Signed-off-by: Hal Rosenstock <hal.rosenstock at gmail.com>
> ---
> diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> index 7ce28c5..e281842 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -1,6 +1,6 @@
> /*
> * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2008 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
> * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
> *
> * This software is available to you under a choice of one of two
> @@ -103,6 +103,8 @@ typedef struct osm_switch {
> uint8_t *lft;
> uint8_t *new_lft;
> osm_mcast_tbl_t mcast_tbl;
> + uint32_t mft_block_num;
> + uint32_t mft_position;
> unsigned endport_links;
> unsigned need_update;
> void *priv;
> diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> index 4dbbaa0..f91c6b6 100644
> --- a/opensm/opensm/osm_mcast_mgr.c
> +++ b/opensm/opensm/osm_mcast_mgr.c
> @@ -1,6 +1,6 @@
> /*
> * Copyright (c) 2004-2008 Voltaire, Inc. All rights reserved.
> - * Copyright (c) 2002-2006 Mellanox Technologies LTD. All rights reserved.
> + * Copyright (c) 2002-2009 Mellanox Technologies LTD. All rights reserved.
> * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
> * Copyright (c) 2008 Xsigo Systems Inc. All rights reserved.
> *
> @@ -325,15 +325,12 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
> {
> osm_node_t *p_node;
> osm_dr_path_t *p_path;
> - osm_madw_context_t mad_context;
> + osm_madw_context_t context;
> ib_api_status_t status;
> - uint32_t block_id_ho = 0;
> - int16_t block_num = 0;
> - uint32_t position = 0;
> - uint32_t max_position;
> + uint32_t block_id_ho;
> osm_mcast_tbl_t *p_tbl;
> ib_net16_t block[IB_MCAST_BLOCK_SIZE];
> - int ret = 0;
> + int ret = -1;
>
> CL_ASSERT(sm);
>
> @@ -353,36 +350,34 @@ static int mcast_mgr_set_tbl(osm_sm_t * sm, IN osm_switch_t * p_sw)
> configuration.
> */
>
> - mad_context.mft_context.node_guid = osm_node_get_node_guid(p_node);
> - mad_context.mft_context.set_method = TRUE;
> + context.mft_context.node_guid = osm_node_get_node_guid(p_node);
> + context.mft_context.set_method = TRUE;
>
> p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> - max_position = p_tbl->max_position;
>
> - while (osm_mcast_tbl_get_block(p_tbl, block_num,
> - (uint8_t) position, block)) {
> - OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> - "Writing MFT block 0x%X\n", block_id_ho);
> + if (p_sw->mft_position <= p_tbl->max_position &&
> + osm_mcast_tbl_get_block(p_tbl, p_sw->mft_block_num,
> + (uint8_t) p_sw->mft_position, block)) {
> +
> + block_id_ho = p_sw->mft_block_num + (p_sw->mft_position << 28);
>
> - block_id_ho = block_num + (position << 28);
> + OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
> + "Writing MFT block %u position %u to switch 0x%" PRIx64 "\n",
> + p_sw->mft_block_num, p_sw->mft_position,
> + cl_ntoh64(context.lft_context.node_guid));
>
> status = osm_req_set(sm, p_path, (void *)block, sizeof(block),
> IB_MAD_ATTR_MCAST_FWD_TBL,
> cl_hton32(block_id_ho), CL_DISP_MSGID_NONE,
> - &mad_context);
> + &context);
>
> - if (status != IB_SUCCESS) {
> + if (status != IB_SUCCESS)
> OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 0A02: "
> - "Sending multicast fwd. tbl. block failed (%s)\n",
> + "Sending MFT block failed (%s)\n",
> ib_get_err_str(status));
> - ret = -1;
> - }
>
> - if (++position > max_position) {
> - position = 0;
> - block_num++;
> - }
> - }
> + } else
> + ret = 0;
>
> OSM_LOG_EXIT(sm->p_log);
> return ret;
> @@ -1077,7 +1072,8 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
> cl_qmap_t *p_sw_tbl;
> cl_qlist_t *p_list = &sm->mgrp_list;
> osm_mgrp_t *p_mgrp;
> - int i, ret = 0;
> + osm_mcast_tbl_t *p_tbl;
> + int sws_notdone, i, ret = 0;
>
> OSM_LOG_ENTER(sm->p_log);
>
> @@ -1114,11 +1110,30 @@ int osm_mcast_mgr_process(osm_sm_t * sm)
> */
> p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> - if (mcast_mgr_set_tbl(sm, p_sw))
> - ret = -1;
> + p_sw->mft_block_num = 0;
> + p_sw->mft_position = 0;
> p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> }
>
> + while (1) {
> + p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> + sws_notdone = 0;
> + while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> + if (mcast_mgr_set_tbl(sm, p_sw))
> + sws_notdone++;
> + p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> + if (++p_sw->mft_position > p_tbl->max_position) {
> + p_sw->mft_position = 0;
> + p_sw->mft_block_num++;
> + }
> + p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> + }
> + if (!sws_notdone) {
> + ret = -1;
> + break;
> + }
So osm_mcast_mgr_process() will always return -1 value? Why?
> + }
> +
> while (!cl_is_qlist_empty(p_list)) {
> cl_list_item_t *p = cl_qlist_remove_head(p_list);
> free(p);
> @@ -1142,9 +1157,10 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
> osm_switch_t *p_sw;
> cl_qmap_t *p_sw_tbl;
> osm_mgrp_t *p_mgrp;
> + osm_mcast_tbl_t *p_tbl;
> ib_net16_t mlid;
> osm_mcast_mgr_ctxt_t *ctx;
> - int ret = 0;
> + int sws_notdone, ret = 0;
>
> OSM_LOG_ENTER(sm->p_log);
>
> @@ -1195,11 +1211,30 @@ int osm_mcast_mgr_process_mgroups(osm_sm_t * sm)
> p_sw_tbl = &sm->p_subn->sw_guid_tbl;
> p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> - if (mcast_mgr_set_tbl(sm, p_sw))
> - ret = -1;
> + p_sw->mft_block_num = 0;
> + p_sw->mft_position = 0;
> p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> }
>
> + while (1) {
> + p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> + sws_notdone = 0;
> + while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> + if (mcast_mgr_set_tbl(sm, p_sw))
> + sws_notdone++;
> + p_tbl = osm_switch_get_mcast_tbl_ptr(p_sw);
> + if (++p_sw->mft_position > p_tbl->max_position) {
> + p_sw->mft_position = 0;
> + p_sw->mft_block_num++;
> + }
> + p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> + }
> + if (!sws_notdone) {
> + ret = -1;
> + break;
Ditto.
> + }
> + }
> +
Could you consolidate this code which is equivalent with one in
osm_mcast_mgr_process() in single function say
mcast_mgr_set_mftables()?
Also similar to LFTs case it would be nice to simplify this tables setup
loop.
Sasha
> osm_dump_mcast_routes(sm->p_subn->p_osm);
>
> exit:
>
More information about the general
mailing list