[ofa-general] Re: [PATCH RFC] opensm: mcast mgr improvements
Sasha Khapyorsky
sashak at voltaire.com
Wed Jan 9 09:04:23 PST 2008
Hi Yevgeny,
On 13:44 Wed 09 Jan, Yevgeny Kliteynik wrote:
> Hi Sasha,
>
> Please see below.
>
> Sasha Khapyorsky wrote:
> > This improves handling of mcast join/leave request storms. Mcast
> > routing is now recalculated once for all mcast groups where changes
> > occurred, rather than one group at a time. To do this, mcast groups
> > are queued instead of mcast rerouting requests; this also makes the
> > state_mgr idle queue obsolete.
> > Signed-off-by: Sasha Khapyorsky <sashak at voltaire.com>
> > ---
> > Hi Yevgeny,
> > It looks to me like this should solve the original problem (the mcast
> > group list is purged in osm_mcast_mgr_process()). Could you review
> > and, ideally, test it? Thanks.
> > Sasha
> > diff --git a/opensm/opensm/osm_mcast_mgr.c b/opensm/opensm/osm_mcast_mgr.c
> > index 50b95fd..f51a45a 100644
> > --- a/opensm/opensm/osm_mcast_mgr.c
> > +++ b/opensm/opensm/osm_mcast_mgr.c
> > @@ -1254,63 +1254,55 @@ osm_mcast_mgr_process_tree(IN osm_mcast_mgr_t * const p_mgr,
> > port_guid);
> > }
> > - Exit:
> > +Exit:
> > OSM_LOG_EXIT(p_mgr->p_log);
> > return (status);
> > }
> > /**********************************************************************
> > Process the entire group.
> > -
> > NOTE : The lock should be held externally!
> > **********************************************************************/
> > -static osm_signal_t
> > -osm_mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
> > - IN osm_mgrp_t * const p_mgrp,
> > - IN osm_mcast_req_type_t req_type,
> > - IN ib_net64_t port_guid)
> > +static ib_api_status_t
> > +mcast_mgr_process_mgrp(IN osm_mcast_mgr_t * const p_mgr,
> > + IN osm_mgrp_t * const p_mgrp,
> > + IN osm_mcast_req_type_t req_type,
> > + IN ib_net64_t port_guid)
> > {
> > - osm_signal_t signal = OSM_SIGNAL_DONE;
> > ib_api_status_t status;
> > - osm_switch_t *p_sw;
> > - cl_qmap_t *p_sw_tbl;
> > - boolean_t pending_transactions = FALSE;
> > OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process_mgrp);
> > - p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
> > -
> > status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp, req_type, port_guid);
> > if (status != IB_SUCCESS) {
> > osm_log(p_mgr->p_log, OSM_LOG_ERROR,
> > - "osm_mcast_mgr_process_mgrp: ERR 0A19: "
> > + "mcast_mgr_process_mgrp: ERR 0A19: "
> > "Unable to create spanning tree (%s)\n",
> > ib_get_err_str(status));
> > -
> > goto Exit;
> > }
> > + p_mgrp->last_tree_id = p_mgrp->last_change_id;
> > - /*
> > - Walk the switches and download the tables for each.
> > + /* Remove MGRP only if osm_mcm_port_t count is 0 and
> > + * Not a well known group
> > */
> > - p_sw = (osm_switch_t *) cl_qmap_head(p_sw_tbl);
> > - while (p_sw != (osm_switch_t *) cl_qmap_end(p_sw_tbl)) {
> > - signal = __osm_mcast_mgr_set_tbl(p_mgr, p_sw);
> > - if (signal == OSM_SIGNAL_DONE_PENDING)
> > - pending_transactions = TRUE;
> > -
> > - p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> > + if (cl_qmap_count(&p_mgrp->mcm_port_tbl) == 0 && !p_mgrp->well_known) {
> > + osm_log(p_mgr->p_log, OSM_LOG_DEBUG,
> > + "mcast_mgr_process_mgrp: "
> > + "Destroying mgrp with lid:0x%X\n",
> > + cl_ntoh16(p_mgrp->mlid));
> > + /* Send a Report to any InformInfo registered for
> > + Trap 67 : MCGroup delete */
> > + osm_mgrp_send_delete_notice(p_mgr->p_subn, p_mgr->p_log,
> > + p_mgrp);
> > + cl_qmap_remove_item(&p_mgr->p_subn->mgrp_mlid_tbl,
> > + (cl_map_item_t *) p_mgrp);
> > + osm_mgrp_delete(p_mgrp);
>
> If the group is empty, p_mgrp is deleted
>
> > @@ -1321,14 +1313,13 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * const p_mgr)
> > osm_switch_t *p_sw;
> > cl_qmap_t *p_sw_tbl;
> > cl_qmap_t *p_mcast_tbl;
> > + cl_qlist_t *p_list = &p_mgr->p_subn->p_osm->sm.mgrp_list;
> > osm_mgrp_t *p_mgrp;
> > - ib_api_status_t status;
> > boolean_t pending_transactions = FALSE;
> > OSM_LOG_ENTER(p_mgr->p_log, osm_mcast_mgr_process);
> > p_sw_tbl = &p_mgr->p_subn->sw_guid_tbl;
> > -
> > p_mcast_tbl = &p_mgr->p_subn->mgrp_mlid_tbl;
> > /*
> > While holding the lock, iterate over all the established
> > @@ -1343,16 +1334,8 @@ osm_signal_t osm_mcast_mgr_process(IN osm_mcast_mgr_t * const p_mgr)
> > /* We reached here due to some change that caused a heavy sweep
> > of the subnet. Not due to a specific multicast request.
> > So the request type is subnet_change and the port guid is 0. */
> > - status = osm_mcast_mgr_process_tree(p_mgr, p_mgrp,
> > - OSM_MCAST_REQ_TYPE_SUBNET_CHANGE,
> > - 0);
> > - if (status != IB_SUCCESS) {
> > - osm_log(p_mgr->p_log, OSM_LOG_ERROR,
> > - "osm_mcast_mgr_process: ERR 0A20: "
> > - "Unable to create spanning tree (%s)\n",
> > - ib_get_err_str(status));
> > - }
> > -
> > + mcast_mgr_process_mgrp(p_mgr, p_mgrp,
> > + OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 0);
> > p_mgrp = (osm_mgrp_t *) cl_qmap_next(&p_mgrp->map_item);
>
> And here there's a call to 'next' on a p_mgrp that was already freed,
> which eventually causes osm to crash at some point with a segfault or
> a failed assert.
Nice catch! Thanks for the fix!
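To make that iteration in osm_mcast_mgr_process() safe, the next group
has to be fetched before the current one is processed, since
mcast_mgr_process_mgrp() may now remove the group from mgrp_mlid_tbl and
free it when it is empty and not well known. A minimal sketch of the
loop I have in mind (not the final patch, just the pattern):

	p_mgrp = (osm_mgrp_t *) cl_qmap_head(p_mcast_tbl);
	while (p_mgrp != (osm_mgrp_t *) cl_qmap_end(p_mcast_tbl)) {
		/* Fetch the next group first: mcast_mgr_process_mgrp()
		 * may remove p_mgrp from mgrp_mlid_tbl and free it when
		 * the group is empty and not well known. */
		p_next = (osm_mgrp_t *) cl_qmap_next(&p_mgrp->map_item);
		mcast_mgr_process_mgrp(p_mgr, p_mgrp,
				       OSM_MCAST_REQ_TYPE_SUBNET_CHANGE, 0);
		p_mgrp = p_next;
	}

with p_next declared as another osm_mgrp_t pointer next to p_mgrp. This
way cl_qmap_next() is never called on a freed item.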
Sasha