[ofa-general] [PATCH 3/5] Use monitored map lookup to get the name of the node for received mad processing in perfmgr
Ira Weiny
weiny2 at llnl.gov
Wed Nov 7 18:58:21 PST 2007
>From 003d4eb171cbad92c61fb4f0fd4c96b7efe3ff6a Mon Sep 17 00:00:00 2001
From: Ira K. Weiny <weiny2 at llnl.gov>
Date: Tue, 6 Nov 2007 19:10:10 -0800
Subject: [PATCH] Use monitored map lookup to get the name of the node for received mad
processing in perfmgr
Signed-off-by: Ira K. Weiny <weiny2 at llnl.gov>
---
opensm/opensm/osm_perfmgr.c | 153 +++++++++++++++++++++++--------------------
1 files changed, 82 insertions(+), 71 deletions(-)
diff --git a/opensm/opensm/osm_perfmgr.c b/opensm/opensm/osm_perfmgr.c
index d78d747..767ba8d 100644
--- a/opensm/opensm/osm_perfmgr.c
+++ b/opensm/opensm/osm_perfmgr.c
@@ -199,36 +199,38 @@ osm_perfmgr_mad_send_err_callback(void *bind_context, osm_madw_t * p_madw)
osm_madw_context_t *context = &(p_madw->context);
uint64_t node_guid = context->perfmgr_context.node_guid;
uint8_t port = context->perfmgr_context.port;
+ cl_map_item_t *p_node;
+ __monitored_node_t *p_mon_node;
OSM_LOG_ENTER(pm->log, osm_perfmgr_mad_send_err_callback);
+ /* go ahead and get the monitored node struct to have the printable
+ * name if needed in messages
+ */
+ if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
+ cl_qmap_end(&(pm->monitored_map))) {
+ osm_log(pm->log, OSM_LOG_ERROR,
+ "osm_pc_rcv_process: ERR 4C12: GUID 0x%016"
+ PRIx64 " not found in monitored map\n",
+ node_guid);
+ goto Exit;
+ }
+ p_mon_node = (__monitored_node_t *) p_node;
+
osm_log(pm->log, OSM_LOG_ERROR,
- "osm_perfmgr_mad_send_err_callback: ERR 4C02: 0x%" PRIx64
- " port %d\n", node_guid, port);
+ "osm_perfmgr_mad_send_err_callback: ERR 4C02: %s (0x%" PRIx64
+ ") port %d\n", p_mon_node->name, p_mon_node->guid, port);
if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
- cl_map_item_t *p_node;
- __monitored_node_t *p_mon_node;
-
/* First, find the node in the monitored map */
cl_plock_acquire(pm->lock);
- if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
- cl_qmap_end(&(pm->monitored_map))) {
- cl_plock_release(pm->lock);
- osm_log(pm->log, OSM_LOG_ERROR,
- "osm_perfmgr_mad_send_err_callback: ERR 4C15: GUID 0x%016"
- PRIx64 " not found in monitored map\n",
- node_guid);
- goto Exit;
- }
- p_mon_node = (__monitored_node_t *) p_node;
/* Now, validate port number */
if (port > p_mon_node->redir_tbl_size) {
cl_plock_release(pm->lock);
osm_log(pm->log, OSM_LOG_ERROR,
- "osm_perfmgr_mad_send_err_callback: ERR 4C16: Invalid port num %d for GUID 0x%016"
- PRIx64 " num ports %d\n", port, node_guid,
- p_mon_node->redir_tbl_size);
+ "osm_perfmgr_mad_send_err_callback: ERR 4C16: Invalid port num %d for %s (GUID 0x%016"
+ PRIx64 ") num ports %d\n", port, p_mon_node->name,
+ p_mon_node->guid, p_mon_node->redir_tbl_size);
goto Exit;
}
/* Clear redirection info */
@@ -902,18 +904,19 @@ void osm_perfmgr_destroy(osm_perfmgr_t * const pm)
* will be missed.
**********************************************************************/
static void
-osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, uint64_t node_guid,
+osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
uint8_t port, perfmgr_db_err_reading_t * cr,
perfmgr_db_data_cnt_reading_t * dc)
{
perfmgr_db_err_reading_t prev_err;
perfmgr_db_data_cnt_reading_t prev_dc;
- if (perfmgr_db_get_prev_err(pm->db, node_guid, port, &prev_err)
+ if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
!= PERFMGR_EVENT_DB_SUCCESS) {
osm_log(pm->log, OSM_LOG_VERBOSE,
- "osm_perfmgr_check_oob_clear: Failed to find previous error reading for 0x%"
- PRIx64 " port %u\n", node_guid, port);
+ "osm_perfmgr_check_oob_clear: Failed to find previous "
+ "error reading for %s (guid 0x%" PRIx64 ") port %u\n",
+ mon_node->name, mon_node->guid, port);
return;
}
@@ -930,17 +933,19 @@ osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, uint64_t node_guid,
cr->buffer_overrun < prev_err.buffer_overrun ||
cr->vl15_dropped < prev_err.vl15_dropped) {
osm_log(pm->log, OSM_LOG_ERROR,
- "PerfMgr: ERR 4C0A: Detected an out of band error clear on node 0x%"
- PRIx64 " port %u\n", node_guid, port);
- perfmgr_db_clear_prev_err(pm->db, node_guid, port);
+ "PerfMgr: ERR 4C0A: Detected an out of band error clear "
+ "on %s (0x%" PRIx64 ") port %u\n",
+ mon_node->name, mon_node->guid, port);
+ perfmgr_db_clear_prev_err(pm->db, mon_node->guid, port);
}
/* FIXME handle extended counters */
- if (perfmgr_db_get_prev_dc(pm->db, node_guid, port, &prev_dc)
+ if (perfmgr_db_get_prev_dc(pm->db, mon_node->guid, port, &prev_dc)
!= PERFMGR_EVENT_DB_SUCCESS) {
osm_log(pm->log, OSM_LOG_VERBOSE,
- "osm_perfmgr_check_oob_clear: Failed to find previous data count reading for 0x%"
- PRIx64 " port %u\n", node_guid, port);
+ "osm_perfmgr_check_oob_clear: Failed to find previous data count "
+ "reading for %s (0x%" PRIx64 ") port %u\n",
+ mon_node->name, mon_node->guid, port);
return;
}
@@ -949,9 +954,10 @@ osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, uint64_t node_guid,
dc->xmit_pkts < prev_dc.xmit_pkts ||
dc->rcv_pkts < prev_dc.rcv_pkts) {
osm_log(pm->log, OSM_LOG_ERROR,
- "PerfMgr: ERR 4C0B: Detected an out of band data counter clear on node 0x%"
- PRIx64 " port %u\n", node_guid, port);
- perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
+ "PerfMgr: ERR 4C0B: Detected an out of band data counter "
+ "clear on node %s (0x%" PRIx64 ") port %u\n",
+ mon_node->name, mon_node->guid, port);
+ perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
}
}
@@ -983,7 +989,7 @@ int counter_overflow_32(ib_net32_t val)
* MAD to the port.
**********************************************************************/
static void
-osm_perfmgr_check_overflow(osm_perfmgr_t * pm, uint64_t node_guid,
+osm_perfmgr_check_overflow(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
uint8_t port, ib_port_counters_t * pc)
{
osm_madw_context_t mad_context;
@@ -1012,26 +1018,27 @@ osm_perfmgr_check_overflow(osm_perfmgr_t * pm, uint64_t node_guid,
ib_net16_t lid = 0;
osm_log(pm->log, OSM_LOG_INFO,
- "PerfMgr: Counter overflow: 0x%" PRIx64
- " port %d; clearing counters\n", node_guid, port);
+ "PerfMgr: Counter overflow: %s (0x%" PRIx64
+ ") port %d; clearing counters\n",
+ mon_node->name, mon_node->guid, port);
cl_plock_acquire(pm->lock);
- p_node = osm_get_node_by_guid(pm->subn, cl_hton64(node_guid));
+ p_node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
/* Could find monitored node for this rather than */
/* potentially redoing redirection */
lid = get_lid(p_node, port, NULL);
cl_plock_release(pm->lock);
if (lid == 0) {
osm_log(pm->log, OSM_LOG_ERROR,
- "PerfMgr: ERR 4C0C: Failed to clear counters for node 0x%"
- PRIx64 " port %d; failed to get lid\n",
- node_guid, port);
+ "PerfMgr: ERR 4C0C: Failed to clear counters for %s (0x%"
+ PRIx64 ") port %d; failed to get lid\n",
+ mon_node->name, mon_node->guid, port);
goto Exit;
}
remote_qp = get_qp(NULL, port);
- mad_context.perfmgr_context.node_guid = node_guid;
+ mad_context.perfmgr_context.node_guid = mon_node->guid;
mad_context.perfmgr_context.port = port;
mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
/* clear port counters */
@@ -1040,10 +1047,11 @@ osm_perfmgr_check_overflow(osm_perfmgr_t * pm, uint64_t node_guid,
IB_MAD_METHOD_SET, &mad_context);
if (status != IB_SUCCESS)
osm_log(pm->log, OSM_LOG_ERROR,
- "PerfMgr: ERR 4C11: Failed to send clear counters MAD for node 0x%"
- PRIx64 " port %d\n", node_guid, port);
+ "PerfMgr: ERR 4C11: Failed to send clear counters MAD for %s (0x%"
+ PRIx64 ") port %d\n",
+ mon_node->name, mon_node->guid, port);
- perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
+ perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
}
Exit:
@@ -1054,18 +1062,19 @@ osm_perfmgr_check_overflow(osm_perfmgr_t * pm, uint64_t node_guid,
* Check values for logging of errors
**********************************************************************/
static void
-osm_perfmgr_log_events(osm_perfmgr_t * pm, uint64_t node_guid, uint8_t port,
+osm_perfmgr_log_events(osm_perfmgr_t * pm, __monitored_node_t *mon_node, uint8_t port,
perfmgr_db_err_reading_t * reading)
{
perfmgr_db_err_reading_t prev_read;
time_t time_diff = 0;
perfmgr_db_err_t err =
- perfmgr_db_get_prev_err(pm->db, node_guid, port, &prev_read);
+ perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_read);
if (err != PERFMGR_EVENT_DB_SUCCESS) {
osm_log(pm->log, OSM_LOG_VERBOSE,
- "osm_perfmgr_log_events: Failed to find previous reading for 0x%"
- PRIx64 " port %u\n", node_guid, port);
+ "osm_perfmgr_log_events: Failed to find previous "
+ "reading for %s (0x%" PRIx64 ") port %u\n",
+ mon_node->name, mon_node->guid, port);
return;
}
time_diff = (reading->time - prev_read.time);
@@ -1075,26 +1084,26 @@ osm_perfmgr_log_events(osm_perfmgr_t * pm, uint64_t node_guid, uint8_t port,
if (reading->symbol_err_cnt > prev_read.symbol_err_cnt)
osm_log(pm->log, OSM_LOG_ERROR,
"osm_perfmgr_log_events: ERR 4C0D: "
- "Found %" PRIu64 " Symbol errors in %lu sec on node 0x%"
- PRIx64 " port %u\n",
+ "Found %" PRIu64 " Symbol errors in %lu sec on %s (0x%"
+ PRIx64 ") port %u\n",
(reading->symbol_err_cnt - prev_read.symbol_err_cnt),
- time_diff, node_guid, port);
+ time_diff, mon_node->name, mon_node->guid, port);
if (reading->rcv_err > prev_read.rcv_err)
osm_log(pm->log, OSM_LOG_ERROR,
"osm_perfmgr_log_events: ERR 4C0E: "
"Found %" PRIu64
- " Receive errors in %lu sec on node 0x%" PRIx64
- " port %u\n", (reading->rcv_err - prev_read.rcv_err),
- time_diff, node_guid, port);
+ " Receive errors in %lu sec on %s (0x%" PRIx64
+ ") port %u\n", (reading->rcv_err - prev_read.rcv_err),
+ time_diff, mon_node->name, mon_node->guid, port);
if (reading->xmit_discards > prev_read.xmit_discards)
osm_log(pm->log, OSM_LOG_ERROR,
"osm_perfmgr_log_events: ERR 4C0F: "
- "Found %" PRIu64 " Xmit Discards in %lu sec on node 0x%"
- PRIx64 " port %u\n",
+ "Found %" PRIu64 " Xmit Discards in %lu sec on %s (0x%"
+ PRIx64 ") port %u\n",
(reading->xmit_discards - prev_read.xmit_discards),
- time_diff, node_guid, port);
+ time_diff, mon_node->name, mon_node->guid, port);
}
/**********************************************************************
@@ -1114,9 +1123,24 @@ static void osm_pc_rcv_process(void *context, void *data)
uint8_t port = mad_context->perfmgr_context.port;
perfmgr_db_err_reading_t err_reading;
perfmgr_db_data_cnt_reading_t data_reading;
+ cl_map_item_t *p_node;
+ __monitored_node_t *p_mon_node;
OSM_LOG_ENTER(pm->log, osm_pc_rcv_process);
+ /* go ahead and get the monitored node struct to have the printable
+ * name if needed in messages
+ */
+ if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
+ cl_qmap_end(&(pm->monitored_map))) {
+ osm_log(pm->log, OSM_LOG_ERROR,
+ "osm_pc_rcv_process: ERR 4C12: GUID 0x%016"
+ PRIx64 " not found in monitored map\n",
+ node_guid);
+ goto Exit;
+ }
+ p_mon_node = (__monitored_node_t *) p_node;
+
osm_log(pm->log, OSM_LOG_VERBOSE,
"osm_pc_rcv_process: Processing received MAD status 0x%x context 0x%"
PRIx64 " port %u\n", p_mad->status, node_guid, port);
@@ -1127,8 +1151,6 @@ static void osm_pc_rcv_process(void *context, void *data)
ib_class_port_info_t *cpi =
(ib_class_port_info_t *) &
(osm_madw_get_perfmgt_mad_ptr(p_madw)->data);
- cl_map_item_t *p_node;
- __monitored_node_t *p_mon_node;
ib_api_status_t status;
osm_log(pm->log, OSM_LOG_VERBOSE,
@@ -1152,18 +1174,7 @@ static void osm_pc_rcv_process(void *context, void *data)
goto ReIssue;
/* LID redirection support (easier than GID redirection) */
- /* First, find the node in the monitored map */
cl_plock_acquire(pm->lock);
- if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
- cl_qmap_end(&(pm->monitored_map))) {
- cl_plock_release(pm->lock);
- osm_log(pm->log, OSM_LOG_ERROR,
- "osm_pc_rcv_process: ERR 4C12: GUID 0x%016"
- PRIx64 " not found in monitored map\n",
- node_guid);
- goto Exit;
- }
- p_mon_node = (__monitored_node_t *) p_node;
/* Now, validate port number */
if (port > p_mon_node->redir_tbl_size) {
cl_plock_release(pm->lock);
@@ -1203,11 +1214,11 @@ static void osm_pc_rcv_process(void *context, void *data)
/* detect an out of band clear on the port */
if (mad_context->perfmgr_context.mad_method != IB_MAD_METHOD_SET)
- osm_perfmgr_check_oob_clear(pm, node_guid, port,
+ osm_perfmgr_check_oob_clear(pm, p_mon_node, port,
&err_reading, &data_reading);
/* log any critical events from this reading */
- osm_perfmgr_log_events(pm, node_guid, port, &err_reading);
+ osm_perfmgr_log_events(pm, p_mon_node, port, &err_reading);
if (mad_context->perfmgr_context.mad_method == IB_MAD_METHOD_GET) {
perfmgr_db_add_err_reading(pm->db, node_guid, port,
@@ -1219,7 +1230,7 @@ static void osm_pc_rcv_process(void *context, void *data)
perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
}
- osm_perfmgr_check_overflow(pm, node_guid, port, wire_read);
+ osm_perfmgr_check_overflow(pm, p_mon_node, port, wire_read);
#if ENABLE_OSM_PERF_MGR_PROFILE
do {
--
1.5.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0003-Use-monitored-map-lookup-to-get-the-name-of-the-node.patch
Type: application/octet-stream
Size: 13485 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20071107/7d2d7d92/attachment.obj>
More information about the general
mailing list