[ofa-general] Re: [PATCH 1/2] opensm: avoid LASH use-after-free when switch is deleted from fabric.

Sasha Khapyorsky sashak at voltaire.com
Tue Sep 22 11:50:14 PDT 2009


Hi Jim,

On 13:08 Fri 28 Aug     , Jim Schutt wrote:
> When LASH is run against ibsim, valgrind reports the following
> (on x86_64) after a switch is removed from the fabric:
> 
> ==15699== Invalid write of size 8
> ==15699==    at 0x45FD8A: switch_delete (osm_ucast_lash.c:648)
> ==15699==    by 0x461483: lash_cleanup (osm_ucast_lash.c:1123)
> ==15699==    by 0x461848: lash_process (osm_ucast_lash.c:1230)
> ==15699==    by 0x45C043: ucast_mgr_route (osm_ucast_mgr.c:1016)
> ==15699==    by 0x45C1A0: osm_ucast_mgr_process (osm_ucast_mgr.c:1057)
> ==15699==    by 0x44F11B: do_sweep (osm_state_mgr.c:1283)
> ==15699==    by 0x44F539: osm_state_mgr_process (osm_state_mgr.c:1398)
> ==15699==    by 0x447296: sm_process (osm_sm.c:90)
> ==15699==    by 0x4473FE: sm_sweeper (osm_sm.c:130)
> ==15699==    by 0x5023505: __cl_thread_wrapper (cl_thread.c:57)
> ==15699==    by 0x37AC006366: start_thread (in /lib64/libpthread-2.5.so)
> ==15699==    by 0x37AB4D30AC: clone (in /lib64/libc-2.5.so)
> ==15699==  Address 0x9B28198 is 152 bytes inside a block of size 160 free'd
> ==15699==    at 0x4A0541E: free (vg_replace_malloc.c:233)
> ==15699==    by 0x453866: osm_switch_delete (osm_switch.c:97)
> ==15699==    by 0x4116AA: drop_mgr_remove_switch (osm_drop_mgr.c:290)
> ==15699==    by 0x411820: drop_mgr_process_node (osm_drop_mgr.c:339)
> ==15699==    by 0x411D0C: osm_drop_mgr_process (osm_drop_mgr.c:465)
> ==15699==    by 0x44EF97: do_sweep (osm_state_mgr.c:1231)
> ==15699==    by 0x44F539: osm_state_mgr_process (osm_state_mgr.c:1398)
> ==15699==    by 0x447296: sm_process (osm_sm.c:90)
> ==15699==    by 0x4473FE: sm_sweeper (osm_sm.c:130)
> ==15699==    by 0x5023505: __cl_thread_wrapper (cl_thread.c:57)
> ==15699==    by 0x37AC006366: start_thread (in /lib64/libpthread-2.5.so)
> ==15699==    by 0x37AB4D30AC: clone (in /lib64/libc-2.5.so)
> 
> The root cause is that in order to perform SL lookup for path record
> queries, LASH needs to keep persistent data between calls to the
> routing engine.
> 
> LASH uses the osm_switch_t:priv member to speed lookup of the LASH
> switch_t objects it needs to perform SL lookup, and has a corresponding
> switch_t:p_sw member to point to the corresponding osm_switch_t object.
> 
> When a switch is deleted from the fabric, the switch_t:p_sw value becomes
> invalid, but LASH's switch_delete() uses it to clear the corresponding
> osm_switch_t:priv value.

Ok. I see the issue. This 'p_sw->priv = NULL' line was not in the
original "priv" introduction code, but was added by mistake (AFAIR for
"for sure" reason :)) by some subsequent patch.

Why to not fix this by just removing this not actually needed statement?

Sasha

> 
> Solve this problem by adding a priv_release function pointer that
> is set when osm_switch_t:priv is set.  This allows the opensm core to
> clean up after any routing engine that is using priv to access
> persistent data (LASH seems to be the only one so far), without
> knowing the details of how to do so.
> 
> When multiple routing engines are configured, it also allows a routing
> engine using osm_switch_t:priv to clean up if some other routing engine
> using priv fails in an unexpected way.
> 
> With this addition, the rules for using osm_switch_t:priv become:
> 1) Never assign to priv without also assigning to priv_release.
> 2) Always use priv_release() before assigning to priv; this
>    prevents memory issues due to unexpected errors in a
>    routing engine using priv.
> 3) Always use priv_release() to clean up after a use of priv.
> 
> Since updn uses osm_switch_t:priv, fix it up to follow the above
> rules as well, for consistency.
> 
> Signed-off-by: Jim Schutt <jaschut at sandia.gov>
> ---
>  opensm/include/opensm/osm_switch.h |    1 +
>  opensm/opensm/osm_switch.c         |    2 ++
>  opensm/opensm/osm_ucast_lash.c     |   24 ++++++++++++++++++++----
>  opensm/opensm/osm_ucast_updn.c     |   15 +++++++++++----
>  4 files changed, 34 insertions(+), 8 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_switch.h b/opensm/include/opensm/osm_switch.h
> index 7ce28c5..d48f8c6 100644
> --- a/opensm/include/opensm/osm_switch.h
> +++ b/opensm/include/opensm/osm_switch.h
> @@ -106,6 +106,7 @@ typedef struct osm_switch {
>  	unsigned endport_links;
>  	unsigned need_update;
>  	void *priv;
> +	void (*priv_release)(struct osm_switch *p_sw);
>  } osm_switch_t;
>  /*
>  * FIELDS
> diff --git a/opensm/opensm/osm_switch.c b/opensm/opensm/osm_switch.c
> index ce1ca63..fbf3973 100644
> --- a/opensm/opensm/osm_switch.c
> +++ b/opensm/opensm/osm_switch.c
> @@ -94,6 +94,8 @@ void osm_switch_delete(IN OUT osm_switch_t ** const pp_sw)
>  				free(p_sw->hops[i]);
>  		free(p_sw->hops);
>  	}
> +	if (p_sw->priv_release)
> +		p_sw->priv_release(p_sw);
>  	free(*pp_sw);
>  	*pp_sw = NULL;
>  }
> diff --git a/opensm/opensm/osm_ucast_lash.c b/opensm/opensm/osm_ucast_lash.c
> index 0a567b3..ceae7d8 100644
> --- a/opensm/opensm/osm_ucast_lash.c
> +++ b/opensm/opensm/osm_ucast_lash.c
> @@ -603,6 +603,17 @@ static int balance_virtual_lanes(lash_t * p_lash, unsigned lanes_needed)
>  	return 0;
>  }
>  
> +static void lash_switch_priv_release(osm_switch_t *osm_sw)
> +{
> +	switch_t *sw = osm_sw->priv;
> +
> +	osm_sw->priv_release = NULL;
> +	osm_sw->priv = NULL;
> +
> +	if (sw && sw->p_sw == osm_sw)
> +		sw->p_sw = NULL;
> +}
> +
>  static switch_t *switch_create(lash_t * p_lash, unsigned id, osm_switch_t * p_sw)
>  {
>  	unsigned num_switches = p_lash->num_switches;
> @@ -628,8 +639,12 @@ static switch_t *switch_create(lash_t * p_lash, unsigned id, osm_switch_t * p_sw
>  	}
>  
>  	sw->p_sw = p_sw;
> -	if (p_sw)
> +	if (p_sw) {
> +		if (p_sw->priv_release)
> +			p_sw->priv_release(p_sw);
>  		p_sw->priv = sw;
> +		p_sw->priv_release = lash_switch_priv_release;
> +	}
>  
>  	if (osm_mesh_node_create(p_lash, sw)) {
>  		free(sw->dij_channels);
> @@ -644,8 +659,8 @@ static void switch_delete(lash_t *p_lash, switch_t * sw)
>  {
>  	if (sw->dij_channels)
>  		free(sw->dij_channels);
> -	if (sw->p_sw)
> -		sw->p_sw->priv = NULL;
> +	if (sw->p_sw && sw->p_sw->priv_release)
> +		sw->p_sw->priv_release(sw->p_sw);
>  	free(sw);
>  }
>  
> @@ -1113,7 +1128,8 @@ static void lash_cleanup(lash_t * p_lash)
>  	while (p_next_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl)) {
>  		p_sw = p_next_sw;
>  		p_next_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item);
> -		p_sw->priv = NULL;
> +		if (p_sw->priv_release)
> +			p_sw->priv_release(p_sw);
>  	}
>  
>  	if (p_lash->switches) {
> diff --git a/opensm/opensm/osm_ucast_updn.c b/opensm/opensm/osm_ucast_updn.c
> index bb9ccda..dc5f459 100644
> --- a/opensm/opensm/osm_ucast_updn.c
> +++ b/opensm/opensm/osm_ucast_updn.c
> @@ -404,10 +404,13 @@ static struct updn_node *create_updn_node(osm_switch_t * sw)
>  	return u;
>  }
>  
> -static void delete_updn_node(struct updn_node *u)
> +static void updn_sw_priv_release(osm_switch_t *sw)
>  {
> -	u->sw->priv = NULL;
> -	free(u);
> +	if (sw->priv)
> +		free(sw->priv);
> +
> +	sw->priv_release = NULL;
> +	sw->priv = NULL;
>  }
>  
>  /**********************************************************************
> @@ -589,6 +592,8 @@ static int updn_lid_matrices(void *ctx)
>  	     item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
>  	     item = cl_qmap_next(item)) {
>  		p_sw = (osm_switch_t *)item;
> +		if (p_sw->priv_release)
> +			p_sw->priv_release(p_sw);
>  		p_sw->priv = create_updn_node(p_sw);
>  		if (!p_sw->priv) {
>  			OSM_LOG(&(p_updn->p_osm->log), OSM_LOG_ERROR, "ERR AA0C: "
> @@ -596,6 +601,7 @@ static int updn_lid_matrices(void *ctx)
>  			OSM_LOG_EXIT(&p_updn->p_osm->log);
>  			return -1;
>  		}
> +		p_sw->priv_release = updn_sw_priv_release;
>  	}
>  
>  	/* First setup root nodes */
> @@ -653,7 +659,8 @@ static int updn_lid_matrices(void *ctx)
>  	     item != cl_qmap_end(&p_updn->p_osm->subn.sw_guid_tbl);
>  	     item = cl_qmap_next(item)) {
>  		p_sw = (osm_switch_t *) item;
> -		delete_updn_node(p_sw->priv);
> +		if (p_sw->priv_release)
> +			p_sw->priv_release(p_sw);
>  	}
>  
>  	OSM_LOG_EXIT(&p_updn->p_osm->log);
> -- 
> 1.5.6.GIT
> 
> 



More information about the general mailing list