[ofa-general] Re: [PATCH 3/6] opensm/Unicast Routing Cache: add osm_ucast_cache.{c,h} files

Sasha Khapyorsky sashak at voltaire.com
Thu Oct 9 11:32:23 PDT 2008


Hi Yevgeny,

Comments are below.

Also I made couple of incremental changes (which were obvious IMHO),
will post later...

On 03:28 Mon 06 Oct     , Yevgeny Kliteynik wrote:
> Implementation of the osm unicast routing cache.

Would be nice to have more detailed comment here and also for the
"integration" patch.

> 
> Signed-off-by: Yevgeny Kliteynik <kliteyn at dev.mellanox.co.il>
> ---
>  opensm/include/opensm/osm_ucast_cache.h |  439 ++++++++++++
>  opensm/opensm/osm_ucast_cache.c         | 1176 +++++++++++++++++++++++++++++++
>  2 files changed, 1615 insertions(+), 0 deletions(-)
>  create mode 100644 opensm/include/opensm/osm_ucast_cache.h
>  create mode 100644 opensm/opensm/osm_ucast_cache.c
> 
> diff --git a/opensm/include/opensm/osm_ucast_cache.h b/opensm/include/opensm/osm_ucast_cache.h
> new file mode 100644
> index 0000000..2dc1c4e
> --- /dev/null
> +++ b/opensm/include/opensm/osm_ucast_cache.h

[snip...]

> +/****s* OpenSM: Unicast Cache/osm_ucast_cache_t
> +* NAME
> +*	osm_ucast_cache_t
> +*
> +* DESCRIPTION
> +*	Unicast Cache structure.
> +*
> +*	This object should be treated as opaque and should
> +*	be manipulated only through the provided functions.
> +*
> +* SYNOPSIS
> +*/
> +typedef struct _osm_ucast_cache {

There are no _osm_* structure names in OpenSM. Please keep things
consistent. (Will post the patch)

> +	cl_qmap_t sw_tbl;
> +	boolean_t valid;
> +	struct osm_ucast_mgr * p_ucast_mgr;
> +} osm_ucast_cache_t;

The object itself is pretty small, actually there are only sw_tbl map
and valid flag. Why to not do it as part of struct osm_ucast_mgr and to
save a lot of code like p_cache->p_ucast_mgr->..., construct, etc...?

[snip...]

> diff --git a/opensm/opensm/osm_ucast_cache.c b/opensm/opensm/osm_ucast_cache.c
> new file mode 100644
> index 0000000..2c2154a
> --- /dev/null
> +++ b/opensm/opensm/osm_ucast_cache.c

[snip...]

> +static cache_switch_t *
> +__cache_sw_new(uint16_t lid_ho)
> +{
> +	cache_switch_t * p_cache_sw =
> +		(cache_switch_t *)malloc(sizeof(cache_switch_t));
> +	if (!p_cache_sw)
> +		return NULL;
> +
> +	memset(p_cache_sw, 0, sizeof(cache_switch_t));
> +
> +	p_cache_sw->ports = (cache_port_t *)malloc(sizeof(cache_port_t));
> +	if (!p_cache_sw->ports) {
> +		free(p_cache_sw);
> +		return NULL;
> +	}

Is it really helpful to alloc only one port at init time and realloc
later? Normally cache will not be huge, OTOH it saves some flow like
(port_num >= p_cache_sw->num_ports). Maybe I'm missing more complicated
cases?

> +
> +	/* port[0] fields represent this switch details - lid and type */
> +	p_cache_sw->ports[0].remote_lid_ho = lid_ho;
> +	p_cache_sw->ports[0].is_leaf = FALSE;
> +
> +	return p_cache_sw;
> +}

[snip...]

> +/**********************************************************************
> + **********************************************************************/
> +
> +static void
> +__cache_add_port(osm_ucast_cache_t * p_cache,
> +		 uint16_t lid_ho,
> +		 uint8_t port_num,
> +		 uint16_t remote_lid_ho,
> +		 boolean_t is_ca)
> +{
> +	cache_switch_t * p_cache_sw;
> +
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +
> +	if (!lid_ho || !remote_lid_ho || !port_num)
> +		goto Exit;
> +
> +	OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_VERBOSE,
> +		"Caching switch port: lid %u [port %u] -> lid %u (%s)\n",
> +		lid_ho, port_num, remote_lid_ho,
> +		(is_ca)? "CA/RTR" : "SW");
> +
> +	p_cache_sw = __cache_get_or_add_sw(p_cache, lid_ho);
> +	if (!p_cache_sw) {
> +		OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_ERROR,
> +			"ERR AD01: Out of memory - cache is invalid\n");
> +		osm_ucast_cache_invalidate(p_cache);
> +		goto Exit;
> +	}
> +
> +	if (port_num >= p_cache_sw->num_ports) {
> +		cache_port_t * ports = (cache_port_t *)
> +			malloc(sizeof(cache_port_t)*(port_num+1));

As Hal already noted, no malloc() result check.

Also here and in other places - malloc() return 'void *' so why casting?
IMO it is confused.

> +		memset(ports, 0, sizeof(cache_port_t)*(port_num+1));

[snip...]

> +static void
> +__cache_check_link_change(osm_ucast_cache_t * p_cache,
> +			  osm_physp_t * p_physp_1,
> +			  osm_physp_t * p_physp_2)
> +{
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +	CL_ASSERT(p_physp_1 && p_physp_2);
> +
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	if (!p_physp_1->p_remote_physp && !p_physp_2->p_remote_physp)
> +		/* both ports were down - new link */
> +		goto Exit;
> +
> +	/* unicast cache cannot tolerate any link location change */
> +
> +	if ((p_physp_1->p_remote_physp &&
> +	     p_physp_1->p_remote_physp->p_remote_physp) ||
> +	    (p_physp_2->p_remote_physp &&
> +	     p_physp_2->p_remote_physp->p_remote_physp)) {

Will this handle port moving during discovery? When duplicated guid
detection is passed ports may have old "remotes".

> +		OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +			"Link location change discovered - cache is invalid\n");

Here and in other places in this file: OSM_LOG_INFO is pretty overused
(it is likely this file has more OSM_LOG_INFO than all other OpenSM
parts together),

I think all OSM_LOG_VERBOSE should actually be replaced by OSM_LOG_DEBUG
and all OSM_LOG_INFO should become OSM_LOG_VERBOSE.

> +		osm_ucast_cache_invalidate(p_cache);
> +		goto Exit;
> +	}
> +Exit:
> +   	OSM_LOG_EXIT(p_cache->p_ucast_mgr->p_log);
> +}

[snip...]

> +static void
> +__cache_restore_ucast_info(osm_ucast_cache_t * p_cache,
> +			   cache_switch_t * p_cache_sw,
> +			   osm_switch_t * p_sw)
> +{
> +	if (!p_cache->valid)
> +		return;
> +
> +	/* when seting unicast info, the cached port
> +	   should have all the required info */
> +	CL_ASSERT(p_cache_sw->max_lid_ho && p_cache_sw->lft &&
> +		  p_cache_sw->num_hops && p_cache_sw->hops);
> +
> +	p_sw->max_lid_ho = p_cache_sw->max_lid_ho;
> +
> +	if (p_sw->lft_buf)
> +		free(p_sw->lft_buf);
> +	p_sw->lft_buf = p_cache_sw->lft;
> +	p_cache_sw->lft = NULL;
> +
> +	p_sw->num_hops = p_cache_sw->num_hops;
> +	p_cache_sw->num_hops = 0;
> +	if (p_sw->hops)
> +		free(p_sw->hops);
> +	p_sw->hops = p_cache_sw->hops;
> +	p_cache_sw->hops = NULL;

This is nice :).

sw->hops is array of pointers which could be allocated by routing
engine so in generic case we will need to free all sub-buffers first. As
far as I can see this function will be used for freshly discovered
switches only, if so it looks correct for me. Just to be sure...

[snip...]

> +void
> +osm_ucast_cache_validate(osm_ucast_cache_t * p_cache)
> +{
> +	cache_switch_t * p_cache_sw;
> +	cache_switch_t * p_remote_cache_sw;
> +	unsigned         port_num;
> +	unsigned         max_ports;
> +	uint8_t          remote_node_type;
> +	uint16_t         lid_ho;
> +	uint16_t         remote_lid_ho;
> +	osm_switch_t   * p_sw;
> +	osm_switch_t   * p_remote_sw;
> +	osm_node_t     * p_node;
> +	osm_physp_t    * p_physp;
> +	osm_physp_t    * p_remote_physp;
> +	osm_port_t     * p_remote_port;
> +	cl_qmap_t      * p_node_guid_tbl;
> +
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	/*
> +	 * Scan all the physical switch ports in the subnet.
> +	 * If the port need_update flag is on, check whether
> +	 * it's just some node/port reset or a cached topology
> +	 * change. Otherwise the cache is invalid.
> +	 */
> +	p_node_guid_tbl = &p_cache->p_ucast_mgr->p_subn->node_guid_tbl;

Then it should be:

	p_sw_tbl = &p_cache->p_ucast_mgr->p_subn->sw_guid_tbl...

Will send the patch...

> +	for (p_node = (osm_node_t *) cl_qmap_head(p_node_guid_tbl);
> +	     p_node != (osm_node_t *) cl_qmap_end(p_node_guid_tbl);
> +	     p_node = (osm_node_t *) cl_qmap_next(&p_node->map_item)) {
> +
> +		if (osm_node_get_type(p_node) != IB_NODE_TYPE_SWITCH)
> +			continue;
> +
> +		lid_ho = cl_ntoh16(osm_node_get_base_lid(p_node,0));
> +		p_cache_sw = __cache_get_sw(p_cache, lid_ho);
> +
> +		p_sw = p_node->sw;
> +		max_ports = osm_node_get_num_physp(p_node);
> +
> +		/* skip port 0 */
> +		for (port_num = 1; port_num < max_ports; port_num++) {
> +
> +			p_physp = osm_node_get_physp_ptr(p_node, port_num);
> +
> +			if (!p_physp || !p_physp->p_remote_physp ||
> +			    !osm_physp_link_exists(p_physp, p_physp->p_remote_physp))
> +				/* no valid link */
> +				continue;
> +
> +			/*
> +			 * While scanning all the physical ports in the subnet,
> +			 * mark corresponding leaf switches in the cache.
> +			 */
> +			if (p_cache_sw &&
> +			    !p_cache_sw->dropped &&
> +			    !__cache_sw_is_leaf(p_cache_sw) &&
> +			    p_physp->p_remote_physp->p_node &&
> +			    osm_node_get_type(
> +				    p_physp->p_remote_physp->p_node) !=
> +					IB_NODE_TYPE_SWITCH)
> +				__cache_sw_set_leaf(p_cache_sw);
> +
> +			if (!p_physp->need_update)
> +				continue;
> +
> +			OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_VERBOSE,
> +				"Checking switch lid %u, port %u\n",
> +				lid_ho, port_num);
> +
> +			p_remote_physp = osm_physp_get_remote(p_physp);
> +			remote_node_type = osm_node_get_type(p_remote_physp->p_node);
> +
> +			if (remote_node_type == IB_NODE_TYPE_SWITCH)
> +				remote_lid_ho = cl_ntoh16(osm_node_get_base_lid(
> +					p_remote_physp->p_node, 0));
> +			else
> +				remote_lid_ho = cl_ntoh16(osm_node_get_base_lid(
> +					p_remote_physp->p_node,
> +					osm_physp_get_port_num(p_remote_physp)));
> +
> +			if (!p_cache_sw ||
> +			    port_num >= p_cache_sw->num_ports ||
> +			    !p_cache_sw->ports[port_num].remote_lid_ho) {
> +				/*
> +				 * There is some uncached change on the port.
> +				 * In general, the reasons might be as follows:
> +				 *  - switch reset
> +				 *  - port reset (or port down/up)
> +				 *  - quick connection location change
> +				 *  - new link (or new switch)
> +				 *
> +				 * First two reasons allow cache usage, while
> +				 * the last two reasons should invalidate cache.
> +				 *
> +				 * In case of quick connection location change,
> +				 * cache would have been invalidated by
> +				 * osm_ucast_cache_check_new_link() function.
> +				 *
> +				 * In case of new link between two known nodes,
> +				 * cache also would have been invalidated by
> +				 * osm_ucast_cache_check_new_link() function.
> +				 *
> +				 * Another reason is cached link between two
> +				 * known switches went back. In this case the
> +				 * osm_ucast_cache_check_new_link() function would
> +				 * clear both sides of the link from the cache
> +				 * during the discovery process, so effectively
> +				 * this would be equivalent to port reset.
> +				 *
> +				 * So three possible reasons remain:
> +				 *  - switch reset
> +				 *  - port reset (or port down/up)
> +				 *  - link of a new switch
> +				 *
> +				 * To validate cache, we need to check only the
> +				 * third reason - link of a new node/switch:
> +				 *  - If this is the local switch that is new,
> +				 *    then it should have (p_sw->need_update == 2).
> +				 *  - If the remote node is switch and it's new,
> +				 *    then it also should have
> +				 *    (p_sw->need_update == 2).
> +				 *  - If the remote node is CA/RTR and it's new,
> +				 *    then its port should have is_new flag on.
> +				 */
> +				if (p_sw->need_update == 2) {
> +					OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +						"New switch found (lid %u) - "
> +						"cache is invalid\n",
> +						lid_ho);
> +					osm_ucast_cache_invalidate(p_cache);
> +					goto Exit;
> +				}
> +
> +				if (remote_node_type == IB_NODE_TYPE_SWITCH) {
> +
> +					p_remote_sw = p_remote_physp->p_node->sw;
> +                                        if (p_remote_sw->need_update == 2) {
> +						/* this could also be case of
> +						   switch coming back with an
> +						   additional link that it
> +						   didn't have before */
> +						OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +							"New switch/link found (lid %u) - "
> +							"cache is invalid\n",
> +						       remote_lid_ho);
> +						osm_ucast_cache_invalidate(p_cache);
> +						goto Exit;
> +					}

Maybe not related to cache directly, but anyway related to "fast" switch
reset. When it happened we need to resend whole LFTs (recalculated or
from cache) to this switch. Is it handled? Was it handled?

[snip...]

> +void
> +osm_ucast_cache_check_new_link(osm_ucast_cache_t * p_cache,
> +			       osm_node_t * p_node_1,
> +			       uint8_t port_num_1,
> +			       osm_node_t * p_node_2,
> +			       uint8_t port_num_2)
> +{
> +	uint16_t lid_ho_1;
> +	uint16_t lid_ho_2;
> +
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	__cache_check_link_change(p_cache,
> +				  osm_node_get_physp_ptr(p_node_1, port_num_1),
> +				  osm_node_get_physp_ptr(p_node_2, port_num_2));
> +
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	if (osm_node_get_type(p_node_1) != IB_NODE_TYPE_SWITCH &&
> +	    osm_node_get_type(p_node_2) != IB_NODE_TYPE_SWITCH) {
> +		OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +			"Found CA/RTR-2-CA/RTR link - cache is invalid\n");
> +		osm_ucast_cache_invalidate(p_cache);
> +		goto Exit;
> +	}

Here and in other places, should we care about back-to-back connections?
Maybe we need just to disable cache at all there is no switches in a
fabric...

[snip...]

> +void
> +osm_ucast_cache_add_link(osm_ucast_cache_t * p_cache,
> +			 osm_node_t * p_node_1,
> +			 uint8_t port_num_1,
> +			 osm_node_t * p_node_2,
> +			 uint8_t port_num_2)

I looked at places where this function is used and think it would be
simpler to use prototype like:

osm_ucast_cache_add_link(osm_ucast_cache_t * p_cache,
			 osm_physp_t *physp1, osm_physp_t *physp2)

Will send the patch.

> +{
> +	uint16_t lid_ho_1;
> +	uint16_t lid_ho_2;
> +
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	if (osm_node_get_type(p_node_1) != IB_NODE_TYPE_SWITCH &&
> +	    osm_node_get_type(p_node_2) != IB_NODE_TYPE_SWITCH) {
> +		OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +			"Dropping CA-2-CA link - cache invalid\n");
> +		osm_ucast_cache_invalidate(p_cache);
> +		goto Exit;
> +	}

back-to-back again...

> +
> +	if (((osm_node_get_type(p_node_1) == IB_NODE_TYPE_SWITCH) &&
> +	     (!osm_node_get_physp_ptr(p_node_1, 0) ||
> +	      !osm_physp_is_valid(osm_node_get_physp_ptr(p_node_1, 0)))) ||
> +	    ((osm_node_get_type(p_node_2) == IB_NODE_TYPE_SWITCH) &&
> +	     (!osm_node_get_physp_ptr(p_node_2, 0) ||
> +	      !osm_physp_is_valid(osm_node_get_physp_ptr(p_node_2, 0))))) {
> +		/* we're caching a link when one of the nodes
> +		   has already been dropped and cached */

osm_node_get_physp_ptr() already checks port validity and return NULL if
it is not. Patch...

[snip...]

> +void
> +osm_ucast_cache_add_node(osm_ucast_cache_t * p_cache,
> +			 osm_node_t * p_node)
> +{
> +	uint16_t lid_ho;
> +	uint8_t max_ports;
> +	uint8_t port_num;
> +	osm_physp_t * p_physp;
> +	osm_node_t * p_remote_node;
> +	cache_switch_t * p_cache_sw;
> +
> +	OSM_LOG_ENTER(p_cache->p_ucast_mgr->p_log);
> +
> +	if (!p_cache->valid)
> +		goto Exit;
> +
> +	if (osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH) {
> +
> +		lid_ho = cl_ntoh16(osm_node_get_base_lid(p_node,0));
> +
> +		OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_VERBOSE,
> +			"Caching dropped switch lid %u\n", lid_ho);
> +
> +		if (!p_node->sw) {
> +			/* something is wrong - forget about cache */
> +			OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_ERROR,
> +				"ERR AD02: no switch info for node lid %u -"
> +				" clearing cache\n", lid_ho);
> +			osm_ucast_cache_invalidate(p_cache);
> +			goto Exit;
> +		}
> +
> +		/* unlink (add to cache) all the ports of this switch */
> +		max_ports = osm_node_get_num_physp(p_node);
> +		for (port_num = 1; port_num < max_ports; port_num++) {
> +
> +			p_physp = osm_node_get_physp_ptr(p_node, port_num);
> +			if (!p_physp || !p_physp->p_node ||
> +			    !p_physp->p_remote_physp ||
> +			    !p_physp->p_remote_physp->p_node)

Can p_physp->p_node be NULL?

> +				continue;
> +
> +			osm_ucast_cache_add_link(p_cache, p_node, port_num,
> +						 p_physp->p_remote_physp->p_node,
> +						 p_physp->p_remote_physp->port_num);
> +		}
> +
> +		/*
> +		 * All the ports have been dropped (cached).
> +		 * If one of the ports was connected to CA/RTR,
> +		 * then the cached switch would be marked as leaf.
> +		 * If it isn't, then the dropped switch isn't a leaf,
> +		 * and cache can't handle it.
> +		 */
> +
> +		p_cache_sw = __cache_get_sw(p_cache, lid_ho);
> +		CL_ASSERT(p_cache_sw);
> +
> +		if (!__cache_sw_is_leaf(p_cache_sw)) {
> +			OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +				"Dropped non-leaf switch (lid %u) - "
> +				"cache is invalid\n", lid_ho);
> +			osm_ucast_cache_invalidate(p_cache);
> +			goto Exit;
> +		}
> +
> +		p_cache_sw->dropped = TRUE;
> +
> +		if (!p_node->sw->num_hops || !p_node->sw->hops) {
> +			OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +				"No LID matrices for switch lid %u - "
> +				"cache is invalid\n", lid_ho);
> +			osm_ucast_cache_invalidate(p_cache);
> +			goto Exit;
> +		}
> +
> +		/* lid matrices */
> +
> +		p_cache_sw->num_hops = p_node->sw->num_hops;
> +		p_node->sw->num_hops = 0;
> +		p_cache_sw->hops = p_node->sw->hops;
> +		p_node->sw->hops = NULL;
> +
> +		/* linear forwarding table */
> +
> +		p_cache_sw->lft = p_node->sw->lft_buf;
> +		p_node->sw->lft_buf = NULL;
> +		p_cache_sw->max_lid_ho = p_node->sw->max_lid_ho;
> +	}
> +	else {
> +		/* dropping CA/RTR: add to cache all the ports of this switch */
> +		max_ports = osm_node_get_num_physp(p_node);
> +		for (port_num = 0; port_num < max_ports; port_num++) {

Any reason to start from port 0 and not 1?

> +
> +			p_physp = osm_node_get_physp_ptr(p_node, port_num);
> +			if (!p_physp || !p_physp->p_node ||
> +			    !p_physp->p_remote_physp ||
> +			    !p_physp->p_remote_physp->p_node)
> +				continue;
> +
> +			p_remote_node = p_physp->p_remote_physp->p_node;
> +			if (osm_node_get_type(p_remote_node) !=
> +			    IB_NODE_TYPE_SWITCH) {
> +				/* CA/RTR to CA/RTR connection */
> +				OSM_LOG(p_cache->p_ucast_mgr->p_log, OSM_LOG_INFO,
> +					"Dropping CA/RTR to CA/RTR connection - "
> +					"cache is invalid\n");
> +				osm_ucast_cache_invalidate(p_cache);
> +				goto Exit;
> +			}

back-to-back connection?

Will send the patches.

Sasha



More information about the general mailing list