[ofa-general] Re: [Repost][PATCH] opensm: Added support for select counters (xmit_wait)

Sasha Khapyorsky sashak at voltaire.com
Sun Jul 19 06:53:34 PDT 2009


Hi Nicolas,

On 17:26 Tue 14 Jul     , Nicolas Morey-Chaisemartin wrote:
> Support for the xmit_wait counter was missing in the perfmgr, even though it was already read from the MAD and the event plugin interface already handles it.
> This patch adds support for it (tested and working with an event plugin).
> 
> Tested-by: Jean-Vincent Ficet <jean-vincent.ficet at bull.net>
> Signed-off-by: Nicolas Morey-Chaisemartin <nicolas at morey-chaisemartin.com>
> ---
> I think the emails got mixed up the first time, so here it is again.
> 
>  opensm/include/opensm/osm_perfmgr_db.h |   23 ++++++-
>  opensm/opensm/osm_perfmgr.c            |   30 +++++++-
>  opensm/opensm/osm_perfmgr_db.c         |  124 +++++++++++++++++++++++++++++--
>  3 files changed, 166 insertions(+), 11 deletions(-)
> 
> diff --git a/opensm/include/opensm/osm_perfmgr_db.h b/opensm/include/opensm/osm_perfmgr_db.h
> index 42a47bd..35b5ac3 100644
> --- a/opensm/include/opensm/osm_perfmgr_db.h
> +++ b/opensm/include/opensm/osm_perfmgr_db.h
> @@ -109,6 +109,14 @@ typedef struct {
>  } perfmgr_db_data_cnt_reading_t;
>  
>  /** =========================================================================
> + * Port select count reading
> + */
> +typedef struct {
> +	uint64_t xmit_wait;
> +	time_t time;
> +} perfmgr_db_sel_reading_t;
> +

Why do we need a separate structure for this counter?

Sasha


> +/** =========================================================================
>   * Dump output options
>   */
>  typedef enum {
> @@ -125,6 +133,8 @@ typedef struct db_port {
>  	perfmgr_db_err_reading_t err_previous;
>  	perfmgr_db_data_cnt_reading_t dc_total;
>  	perfmgr_db_data_cnt_reading_t dc_previous;
> +	perfmgr_db_sel_reading_t ps_total;
> +	perfmgr_db_sel_reading_t ps_previous;
>  	time_t last_reset;
>  } db_port_t;
>  
> @@ -179,7 +189,16 @@ perfmgr_db_err_t perfmgr_db_get_prev_dc(perfmgr_db_t * db, uint64_t guid,
>  					reading);
>  perfmgr_db_err_t perfmgr_db_clear_prev_dc(perfmgr_db_t * db, uint64_t guid,
>  					  uint8_t port);
> -
> +perfmgr_db_err_t perfmgr_db_add_ps_reading(perfmgr_db_t * db, uint64_t guid,
> +					   uint8_t port,
> +					   perfmgr_db_sel_reading_t *
> +					   reading);
> +perfmgr_db_err_t perfmgr_db_get_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +					uint8_t port,
> +					perfmgr_db_sel_reading_t *
> +					reading);
> +perfmgr_db_err_t perfmgr_db_clear_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +					  uint8_t port);
>  void perfmgr_db_clear_counters(perfmgr_db_t * db);
>  perfmgr_db_err_t perfmgr_db_dump(perfmgr_db_t * db, char *file,
>  				 perfmgr_db_dump_t dump_type);
> @@ -196,6 +215,8 @@ void perfmgr_db_fill_data_cnt_read_pc(ib_port_counters_t * wire_read,
>  				      perfmgr_db_data_cnt_reading_t * reading);
>  void perfmgr_db_fill_data_cnt_read_epc(ib_port_counters_ext_t * wire_read,
>  				       perfmgr_db_data_cnt_reading_t * reading);
> +void perfmgr_db_fill_sel_read(ib_port_counters_t * wire_read,
> +				      perfmgr_db_sel_reading_t * reading);
>  
>  END_C_DECLS
>  
> diff --git a/opensm/opensm/osm_perfmgr.c b/opensm/opensm/osm_perfmgr.c
> index ecfdbda..8a9eb12 100644
> --- a/opensm/opensm/osm_perfmgr.c
> +++ b/opensm/opensm/osm_perfmgr.c
> @@ -853,10 +853,12 @@ void osm_perfmgr_destroy(osm_perfmgr_t * pm)
>  static void perfmgr_check_oob_clear(osm_perfmgr_t * pm,
>  				    monitored_node_t * mon_node, uint8_t port,
>  				    perfmgr_db_err_reading_t * cr,
> -				    perfmgr_db_data_cnt_reading_t * dc)
> +				    perfmgr_db_data_cnt_reading_t * dc,
> +				    perfmgr_db_sel_reading_t * ps)
>  {
>  	perfmgr_db_err_reading_t prev_err;
>  	perfmgr_db_data_cnt_reading_t prev_dc;
> +	perfmgr_db_sel_reading_t prev_ps;
>  
>  	if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
>  	    != PERFMGR_EVENT_DB_SUCCESS) {
> @@ -905,6 +907,23 @@ static void perfmgr_check_oob_clear(osm_perfmgr_t * pm,
>  			mon_node->name, mon_node->guid, port);
>  		perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
>  	}
> +
> +	if (perfmgr_db_get_prev_ps(pm->db, mon_node->guid, port, &prev_ps)
> +	    != PERFMGR_EVENT_DB_SUCCESS) {
> +		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
> +			"Failed to find previous select count "
> +			"reading for %s (0x%" PRIx64 ") port %u\n",
> +			mon_node->name, mon_node->guid, port);
> +		return;
> +	}
> +
> +	if (ps->xmit_wait < prev_ps.xmit_wait) {
> +		OSM_LOG(pm->log, OSM_LOG_ERROR,
> +			"PerfMgr: ERR 4C17: Detected an out of band select counter "
> +			"clear on node %s (0x%" PRIx64 ") port %u\n",
> +			mon_node->name, mon_node->guid, port);
> +		perfmgr_db_clear_prev_ps(pm->db, mon_node->guid, port);
> +	}
>  }
>  
>  /**********************************************************************
> @@ -1062,6 +1081,8 @@ static void pc_recv_process(void *context, void *data)
>  	uint8_t port = mad_context->perfmgr_context.port;
>  	perfmgr_db_err_reading_t err_reading;
>  	perfmgr_db_data_cnt_reading_t data_reading;
> +	perfmgr_db_sel_reading_t select_reading;
> +
>  	cl_map_item_t *p_node;
>  	monitored_node_t *p_mon_node;
>  
> @@ -1148,10 +1169,12 @@ static void pc_recv_process(void *context, void *data)
>  	 */
>  	perfmgr_db_fill_data_cnt_read_pc(wire_read, &data_reading);
>  
> +	perfmgr_db_fill_sel_read(wire_read, &select_reading);
> +
>  	/* detect an out of band clear on the port */
>  	if (mad_context->perfmgr_context.mad_method != IB_MAD_METHOD_SET)
>  		perfmgr_check_oob_clear(pm, p_mon_node, port, &err_reading,
> -					&data_reading);
> +					&data_reading, &select_reading);
>  
>  	/* log any critical events from this reading */
>  	perfmgr_log_events(pm, p_mon_node, port, &err_reading);
> @@ -1161,9 +1184,12 @@ static void pc_recv_process(void *context, void *data)
>  					   &err_reading);
>  		perfmgr_db_add_dc_reading(pm->db, node_guid, port,
>  					  &data_reading);
> +		perfmgr_db_add_ps_reading(pm->db, node_guid, port,
> +					  &select_reading);
>  	} else {
>  		perfmgr_db_clear_prev_err(pm->db, node_guid, port);
>  		perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
> +		perfmgr_db_clear_prev_ps(pm->db, node_guid, port);
>  	}
>  
>  	perfmgr_check_overflow(pm, p_mon_node, port, wire_read);
> diff --git a/opensm/opensm/osm_perfmgr_db.c b/opensm/opensm/osm_perfmgr_db.c
> index e5dfc19..132c2fb 100644
> --- a/opensm/opensm/osm_perfmgr_db.c
> +++ b/opensm/opensm/osm_perfmgr_db.c
> @@ -486,6 +486,102 @@ Exit:
>  	return (rc);
>  }
>  
> +static inline void
> +debug_dump_ps_reading(perfmgr_db_t * db, uint64_t guid, uint8_t port_num,
> +		      db_port_t * port, perfmgr_db_sel_reading_t * cur)
> +{
> +	osm_log_t *log = db->perfmgr->log;
> +	if (!osm_log_is_active(log, OSM_LOG_DEBUG))
> +		return;
> +
> +	osm_log(log, OSM_LOG_DEBUG,
> +		"xd %" PRIu64 " <-- %" PRIu64 " (%" PRIu64 ")\n",
> +		cur->xmit_wait, port->ps_previous.xmit_wait,
> +		port->ps_total.xmit_wait);
> +}
> +
> +/**********************************************************************
> + * perfmgr_db_sel_reading_t functions
> + **********************************************************************/
> +perfmgr_db_err_t
> +perfmgr_db_add_ps_reading(perfmgr_db_t * db, uint64_t guid, uint8_t port,
> +			  perfmgr_db_sel_reading_t * reading)
> +{
> +	db_port_t *p_port = NULL;
> +	db_node_t *node = NULL;
> +	perfmgr_db_sel_reading_t *previous = NULL;
> +	perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +	osm_epi_ps_event_t epi_ps_data;
> +
> +	cl_plock_excl_acquire(&db->lock);
> +	node = get(db, guid);
> +	if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +		goto Exit;
> +
> +	p_port = &node->ports[port];
> +	previous = &node->ports[port].ps_previous;
> +
> +	debug_dump_ps_reading(db, guid, port, p_port, reading);
> +
> +	epi_ps_data.time_diff_s = reading->time - previous->time;
> +	osm_epi_create_port_id(&epi_ps_data.port_id, guid, port,
> +			       node->node_name);
> +
> +	/* calculate changes from previous reading */
> +	epi_ps_data.xmit_wait = reading->xmit_wait - previous->xmit_wait;
> +	p_port->ps_total.xmit_wait += epi_ps_data.xmit_wait;
> +
> +	p_port->ps_previous = *reading;
> +	osm_opensm_report_event(db->perfmgr->osm,
> +				OSM_EVENT_ID_PORT_SELECT, &epi_ps_data);
> +
> +Exit:
> +	cl_plock_release(&db->lock);
> +	return (rc);
> +}
> +
> +perfmgr_db_err_t perfmgr_db_get_prev_ps(perfmgr_db_t * db, uint64_t guid,
> +					uint8_t port,
> +					perfmgr_db_sel_reading_t * reading)
> +{
> +	db_node_t *node = NULL;
> +	perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +
> +	cl_plock_acquire(&db->lock);
> +
> +	node = get(db, guid);
> +	if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +		goto Exit;
> +
> +	*reading = node->ports[port].ps_previous;
> +
> +Exit:
> +	cl_plock_release(&db->lock);
> +	return (rc);
> +}
> +
> +perfmgr_db_err_t
> +perfmgr_db_clear_prev_ps(perfmgr_db_t * db, uint64_t guid, uint8_t port)
> +{
> +	db_node_t *node = NULL;
> +	perfmgr_db_sel_reading_t *previous = NULL;
> +	perfmgr_db_err_t rc = PERFMGR_EVENT_DB_SUCCESS;
> +
> +	cl_plock_excl_acquire(&db->lock);
> +	node = get(db, guid);
> +	if ((rc = bad_node_port(node, port)) != PERFMGR_EVENT_DB_SUCCESS)
> +		goto Exit;
> +
> +	previous = &node->ports[port].ps_previous;
> +
> +	memset(previous, 0, sizeof(*previous));
> +	node->ports[port].ps_previous.time = time(NULL);
> +
> +Exit:
> +	cl_plock_release(&db->lock);
> +	return (rc);
> +}
> +
>  static void clear_counters(cl_map_item_t * const p_map_item, void *context)
>  {
>  	db_node_t *node = (db_node_t *) p_map_item;
> @@ -517,6 +613,8 @@ static void clear_counters(cl_map_item_t * const p_map_item, void *context)
>  		node->ports[i].dc_total.multicast_rcv_pkts = 0;
>  		node->ports[i].dc_total.time = ts;
>  
> +		node->ports[i].ps_total.xmit_wait = 0;
> +
>  		node->ports[i].last_reset = ts;
>  	}
>  }
> @@ -546,7 +644,7 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>  		"%s\t%s\t"
>  		"%s\t%s\t%s\t%s\t%s\t%s\t%s\t"
>  		"%s\t%s\t%s\t%s\t%s\t%s\t%s\t"
> -		"%s\t%s\t%s\t%s\n",
> +		"%s\t%s\t%s\t%s\t%s\n",
>  		"symbol_err_cnt",
>  		"link_err_recover",
>  		"link_downed",
> @@ -565,8 +663,7 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>  		"rcv_pkts",
>  		"unicast_xmit_pkts",
>  		"unicast_rcv_pkts",
> -		"multicast_xmit_pkts",
> -		"multicast_rcv_pkts");
> +		"multicast_xmit_pkts", "multicast_rcv_pkts", "xmit_wait");
>  	for (i = (node->esp0) ? 0 : 1; i < node->num_ports; i++) {
>  		char *since = ctime(&node->ports[i].last_reset);
>  		since[strlen(since) - 1] = '\0';	/* remove \n */
> @@ -577,8 +674,8 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>  			"%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64
>  			"\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64 "\t%" PRIu64
>  			"\t%" PRIu64 "\t%" PRIu64 "\t" "%" PRIu64 "\t%" PRIu64
> -			"\t%" PRIu64 "\t%" PRIu64 "\n", node->node_name,
> -			node->node_guid, i, since,
> +			"\t%" PRIu64 "\t%" PRIu64 "\t%" PRIu64 "\n",
> +			node->node_name, node->node_guid, i, since,
>  			node->ports[i].err_total.symbol_err_cnt,
>  			node->ports[i].err_total.link_err_recover,
>  			node->ports[i].err_total.link_downed,
> @@ -598,7 +695,8 @@ static void dump_node_mr(db_node_t * node, FILE * fp)
>  			node->ports[i].dc_total.unicast_xmit_pkts,
>  			node->ports[i].dc_total.unicast_rcv_pkts,
>  			node->ports[i].dc_total.multicast_xmit_pkts,
> -			node->ports[i].dc_total.multicast_rcv_pkts);
> +			node->ports[i].dc_total.multicast_rcv_pkts,
> +			node->ports[i].ps_total.xmit_wait);
>  	}
>  }
>  
> @@ -634,7 +732,8 @@ static void dump_node_hr(db_node_t * node, FILE * fp)
>  			"     unicast_xmit_pkts    : %" PRIu64 "\n"
>  			"     unicast_rcv_pkts     : %" PRIu64 "\n"
>  			"     multicast_xmit_pkts  : %" PRIu64 "\n"
> -			"     multicast_rcv_pkts   : %" PRIu64 "\n",
> +			"     multicast_rcv_pkts   : %" PRIu64 "\n"
> +			"     xmit_wait            : %" PRIu64 "\n",
>  			node->node_name,
>  			node->node_guid,
>  			i,
> @@ -658,7 +757,8 @@ static void dump_node_hr(db_node_t * node, FILE * fp)
>  			node->ports[i].dc_total.unicast_xmit_pkts,
>  			node->ports[i].dc_total.unicast_rcv_pkts,
>  			node->ports[i].dc_total.multicast_xmit_pkts,
> -			node->ports[i].dc_total.multicast_rcv_pkts);
> +			node->ports[i].dc_total.multicast_rcv_pkts,
> +			node->ports[i].ps_total.xmit_wait);
>  	}
>  }
>  
> @@ -809,4 +909,12 @@ perfmgr_db_fill_data_cnt_read_epc(ib_port_counters_ext_t * wire_read,
>  	reading->multicast_rcv_pkts = cl_ntoh64(wire_read->multicast_rcv_pkts);
>  	reading->time = time(NULL);
>  }
> +
> +void
> +perfmgr_db_fill_sel_read(ib_port_counters_t * wire_read,
> +			 perfmgr_db_sel_reading_t * reading)
> +{
> +	reading->xmit_wait = cl_ntoh32(wire_read->xmit_wait);
> +	reading->time = time(NULL);
> +}
>  #endif				/* ENABLE_OSM_PERF_MGR */
> 



More information about the general mailing list