[ewg] [PATCH 10/15] uDAPL v2.0 common: add cm, link, and diag event counters in IB extended builds

Davis, Arlin R arlin.r.davis at intel.com
Mon Apr 23 13:01:05 PDT 2012


Add additional event monitoring capabilities during runtime to help
isolate issues during scaling in lieu of logging/printing warning
messages. Counters have been added to provider CM services and counters
have been added and mapped to sysfs ib_cm, device port and device
diag counters. ibdev_path is used for device sysfs counters.

uDAPL CM events are tracked on a per IA instance via internal
provider counters. The ib_cm, link, and diag events are tracked on a
per platform basis via sysfs. For these running counters a start
and stop function is provided for sampling and mapping to DAPL
64 bit counters. All counters, along with new start and stop functions,
are provided via dat_ib_extensions.h. New IB extension version is 2.0.7.

New DCNT_IA_xx counters include 41 cm, 9 link, and 9 diag types.

To enable new counters (default build is disabled):
	./configure --enable-counters

New bitmappings have been added to DAPL_DBG_TYPE environment
variable to automatically start/stop counters and log
errors if counters are enabled. The following will control
CM, LINK, and DIAG respectively:

   DAPL_DBG_TYPE_CM_ERRS	= 0x080000,
   DAPL_DBG_TYPE_LINK_ERRS	= 0x100000,
   DAPL_DBG_TYPE_DIAG_ERRS	= 0x400000,

Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
 Makefile.am                          |    3 +
 configure.in                         |   11 +
 dapl/common/dapl_debug.c             |  431 +++++++++++++++++++++++++++++++++-
 dapl/common/dapl_ia_open.c           |    4 +
 dapl/common/dapl_ia_util.c           |   12 +-
 dapl/include/dapl_debug.h            |   13 +-
 dapl/openib_common/dapl_ib_common.h  |    2 +-
 dapl/openib_common/ib_extensions.c   |   26 ++
 dapl/udapl/linux/dapl_osd.h          |   16 ++
 dat/include/dat2/dat_ib_extensions.h |   95 ++++++++-
 10 files changed, 601 insertions(+), 12 deletions(-)

diff --git a/Makefile.am b/Makefile.am
index a9bdeda..edff7f8 100755
--- a/Makefile.am
+++ b/Makefile.am
@@ -20,6 +20,9 @@ XFLAGS = -DDAT_EXTENSIONS
 XPROGRAMS = dapl/openib_common/ib_extensions.c
 XHEADERS =
 XLIBS =
+if DEFINE_COUNTERS
+XFLAGS += -DDAPL_COUNTERS
+endif
 if COLL_TYPE_FCA
 XFLAGS += -DDAT_IB_COLLECTIVES -DDAT_FCA_PROVIDER
 XPROGRAMS += dapl/openib_common/collectives/fca_provider.c
diff --git a/configure.in b/configure.in
index 71da96c..d577525 100644
--- a/configure.in
+++ b/configure.in
@@ -104,6 +104,17 @@ AC_ARG_ENABLE([ucm],
   [ucm=true])
 AM_CONDITIONAL(DEFINE_UCM, test x$ucm = xtrue)
 
+dnl Support to enable/disable IB extended counters (CM,LINK,DIAG)
+AC_ARG_ENABLE([counters], 
+  AS_HELP_STRING([--enable-counters],[enable counters provider build, default=disabled]),
+  [case "${enableval}" in
+    yes) counters=true ;;
+    no)  counters=false ;;
+    *) AC_MSG_ERROR(bad value ${enableval} for --enable-counters) ;; 
+  esac],
+  [counters=false])
+AM_CONDITIONAL(DEFINE_COUNTERS, test x$counters = xtrue)
+
 dnl Support ib_extension build - if enable-ext-type == ib 
 AC_ARG_ENABLE(ext-type,
 [  --enable-ext-type Enable extensions support for library: ib, none, default=ib],
diff --git a/dapl/common/dapl_debug.c b/dapl/common/dapl_debug.c
index 7a0a199..cb45496 100644
--- a/dapl/common/dapl_debug.c
+++ b/dapl/common/dapl_debug.c
@@ -74,6 +74,328 @@ void dapl_internal_dbg_log(DAPL_DBG_TYPE type, const char *fmt, ...)
 
 #ifdef DAPL_COUNTERS
 
+/* Read one sysfs counter into *value: 0 on success, -1 (and *value = 0) on failure. */
+static int rd_ctr(const char *dev,
+		  const char *file,
+		  int port,
+		  DAT_IA_COUNTER_TYPE type,
+		  DAT_UINT64 *value)
+{
+	char *f_path;
+	int len, fd;
+	char vstr[24];	/* room for 20 digits of a 64 bit value, newline, terminator */
+	char pstr[12];	/* any int fits; 2 bytes overflowed for port >= 10 */
+
+	snprintf(pstr, sizeof(pstr), "%d", port);
+	*value = 0;
+
+	switch (type) {
+	case DCNT_IA_CM:
+		if (asprintf(&f_path, "/sys/class/infiniband_cm/%s/%s/%s", dev, pstr, file) < 0)
+			return -1;
+		break;
+	case DCNT_IA_LNK:
+		if (asprintf(&f_path, "%s/ports/%s/counters/%s", dev, pstr, file) < 0)
+			return -1;
+		break;
+	case DCNT_IA_DIAG:
+		if (asprintf(&f_path, "%s/diag_counters/%s", dev, file) < 0)
+			return -1;
+		break;
+	default:
+		return -1;
+	}
+
+	fd = open(f_path, O_RDONLY);
+	free(f_path);	/* the path is only needed for open() */
+	if (fd < 0)
+		return -1;
+
+	/* reserve one byte so the buffer can always be terminated */
+	len = read(fd, vstr, sizeof(vstr) - 1);
+	close(fd);
+	if (len <= 0)
+		return -1;	/* nothing was read, vstr holds no valid data */
+
+	vstr[len] = '\0';
+	/* counters are 64 bit wide, atoi() would truncate them */
+	*value = (DAT_UINT64)strtoull(vstr, NULL, 10);
+	return 0;
+}
+
+#ifdef _OPENIB_CMA_
+static void dapl_start_cm_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	const char *dev = ibv_get_device_name(ia->hca_ptr->ib_trans.ib_dev);
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+	/* snapshot the running ib_cm sysfs values; dapl_stop_cm_cntrs() turns them into deltas */
+	rd_ctr(dev,"cm_tx_msgs/req", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_REQ_TX]);
+	rd_ctr(dev,"cm_tx_msgs/rep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_REP_TX]);
+	rd_ctr(dev,"cm_tx_msgs/rtu", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_RTU_TX]);
+	rd_ctr(dev,"cm_tx_msgs/rej", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_USER_REJ_TX]);
+	rd_ctr(dev,"cm_tx_msgs/mra", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_MRA_TX]);
+	rd_ctr(dev,"cm_tx_msgs/dreq", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_DREQ_TX]);
+	rd_ctr(dev,"cm_tx_msgs/drep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_DREP_TX]);
+
+	rd_ctr(dev,"cm_rx_msgs/req", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_REQ_RX]);
+	rd_ctr(dev,"cm_rx_msgs/rep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_REP_RX]);
+	rd_ctr(dev,"cm_rx_msgs/rtu", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_RTU_RX]);
+	rd_ctr(dev,"cm_rx_msgs/rej", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_USER_REJ_RX]);
+	rd_ctr(dev,"cm_rx_msgs/mra", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_MRA_RX]);
+	rd_ctr(dev,"cm_rx_msgs/dreq", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_DREQ_RX]);
+	rd_ctr(dev,"cm_rx_msgs/drep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_DREP_RX]);
+
+	rd_ctr(dev,"cm_tx_retries/req", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_REQ_RETRY]);
+	rd_ctr(dev,"cm_tx_retries/rep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_REP_RETRY]);
+	rd_ctr(dev,"cm_tx_retries/rtu", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_RTU_RETRY]);
+	rd_ctr(dev,"cm_tx_retries/mra", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_MRA_RETRY]);
+	rd_ctr(dev,"cm_tx_retries/dreq", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_DREQ_RETRY]);
+	rd_ctr(dev,"cm_tx_retries/drep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_DREP_RETRY]);
+	/* duplicates are counted on receive: the ib_cm sysfs group is cm_rx_duplicates */
+	rd_ctr(dev,"cm_rx_duplicates/req", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_REQ_DUP]);
+	rd_ctr(dev,"cm_rx_duplicates/rep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_REP_DUP]);
+	rd_ctr(dev,"cm_rx_duplicates/rtu", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_RTU_DUP]);
+	rd_ctr(dev,"cm_rx_duplicates/mra", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_MRA_DUP]);
+	rd_ctr(dev,"cm_rx_duplicates/dreq", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_DREQ_DUP]);
+	rd_ctr(dev,"cm_rx_duplicates/drep", port, DCNT_IA_CM, &cntrs[DCNT_IA_CM_ERR_DREP_DUP]);
+}
+
+static void dapl_stop_cm_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	const char *dev = ibv_get_device_name(ia->hca_ptr->ib_trans.ib_dev);
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+	DAT_UINT64 val = 0;
+	/* replace each snapshot taken by dapl_start_cm_cntrs() with (current - snapshot) */
+	rd_ctr(dev,"cm_tx_msgs/req", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_REQ_TX] = val - cntrs[DCNT_IA_CM_REQ_TX];
+	rd_ctr(dev,"cm_tx_msgs/rep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_REP_TX] = val - cntrs[DCNT_IA_CM_REP_TX];
+	rd_ctr(dev,"cm_tx_msgs/rtu", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_RTU_TX] = val - cntrs[DCNT_IA_CM_RTU_TX];
+	rd_ctr(dev,"cm_tx_msgs/rej", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_USER_REJ_TX] = val - cntrs[DCNT_IA_CM_USER_REJ_TX];
+	rd_ctr(dev,"cm_tx_msgs/mra", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_MRA_TX] = val - cntrs[DCNT_IA_CM_MRA_TX];
+	rd_ctr(dev,"cm_tx_msgs/dreq", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_DREQ_TX] = val - cntrs[DCNT_IA_CM_DREQ_TX];
+	rd_ctr(dev,"cm_tx_msgs/drep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_DREP_TX] = val - cntrs[DCNT_IA_CM_DREP_TX];
+
+	rd_ctr(dev,"cm_rx_msgs/req", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_REQ_RX] = val - cntrs[DCNT_IA_CM_REQ_RX];
+	rd_ctr(dev,"cm_rx_msgs/rep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_REP_RX] = val - cntrs[DCNT_IA_CM_REP_RX];
+	rd_ctr(dev,"cm_rx_msgs/rtu", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_RTU_RX] = val - cntrs[DCNT_IA_CM_RTU_RX];
+	rd_ctr(dev,"cm_rx_msgs/rej", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_USER_REJ_RX] = val - cntrs[DCNT_IA_CM_USER_REJ_RX];
+	rd_ctr(dev,"cm_rx_msgs/mra", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_MRA_RX] = val - cntrs[DCNT_IA_CM_MRA_RX];
+	rd_ctr(dev,"cm_rx_msgs/dreq", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_DREQ_RX] = val - cntrs[DCNT_IA_CM_DREQ_RX];
+	rd_ctr(dev,"cm_rx_msgs/drep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_DREP_RX] = val - cntrs[DCNT_IA_CM_DREP_RX];
+
+	rd_ctr(dev,"cm_tx_retries/req", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_REQ_RETRY] = val - cntrs[DCNT_IA_CM_ERR_REQ_RETRY];
+	rd_ctr(dev,"cm_tx_retries/rep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_REP_RETRY] = val - cntrs[DCNT_IA_CM_ERR_REP_RETRY];
+	rd_ctr(dev,"cm_tx_retries/rtu", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_RTU_RETRY] = val - cntrs[DCNT_IA_CM_ERR_RTU_RETRY];
+	rd_ctr(dev,"cm_tx_retries/mra", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_MRA_RETRY] = val - cntrs[DCNT_IA_CM_ERR_MRA_RETRY];
+	rd_ctr(dev,"cm_tx_retries/dreq", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_DREQ_RETRY] = val - cntrs[DCNT_IA_CM_ERR_DREQ_RETRY];
+	rd_ctr(dev,"cm_tx_retries/drep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_DREP_RETRY] = val - cntrs[DCNT_IA_CM_ERR_DREP_RETRY];
+	/* duplicates are counted on receive: the ib_cm sysfs group is cm_rx_duplicates */
+	rd_ctr(dev,"cm_rx_duplicates/req", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_REQ_DUP] = val - cntrs[DCNT_IA_CM_ERR_REQ_DUP];
+	rd_ctr(dev,"cm_rx_duplicates/rep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_REP_DUP] = val - cntrs[DCNT_IA_CM_ERR_REP_DUP];
+	rd_ctr(dev,"cm_rx_duplicates/rtu", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_RTU_DUP] = val - cntrs[DCNT_IA_CM_ERR_RTU_DUP];
+	rd_ctr(dev,"cm_rx_duplicates/mra", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_MRA_DUP] = val - cntrs[DCNT_IA_CM_ERR_MRA_DUP];
+	rd_ctr(dev,"cm_rx_duplicates/dreq", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_DREQ_DUP] = val - cntrs[DCNT_IA_CM_ERR_DREQ_DUP];
+	rd_ctr(dev,"cm_rx_duplicates/drep", port, DCNT_IA_CM, &val);
+	cntrs[DCNT_IA_CM_ERR_DREP_DUP] = val - cntrs[DCNT_IA_CM_ERR_DREP_DUP];
+}
+#endif
+
+/* map selective IB port counters (<ibdev_path>/ports/<port>/counters) to dapl counters */
+static void dapl_start_lnk_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+	/* snapshot the running values; file names must match the sysfs attribute names exactly */
+	rd_ctr(dev,"port_rcv_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_RCV]);
+	rd_ctr(dev,"port_rcv_remote_physical_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_RCV_REM_PHYS]);
+	rd_ctr(dev,"port_rcv_constraint_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_RCV_CONSTRAINT]);
+	rd_ctr(dev,"port_xmit_discards", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_XMT_DISCARDS]);
+	rd_ctr(dev,"port_xmit_constraint_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_XMT_CONTRAINT]);
+	rd_ctr(dev,"local_link_integrity_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_INTEGRITY]);
+	rd_ctr(dev,"excessive_buffer_overrun_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN]);
+	rd_ctr(dev,"port_xmit_wait", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_WARN_XMT_WAIT]);
+	rd_ctr(dev,"port_rcv_switch_relay_errors", port, DCNT_IA_LNK, &cntrs[DCNT_IA_LNK_WARN_RCV_SW_RELAY]);
+}
+
+static void dapl_stop_lnk_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+	DAT_UINT64 val = 0;
+	/* replace each snapshot taken by dapl_start_lnk_cntrs() with (current - snapshot) */
+	rd_ctr(dev,"port_rcv_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_RCV] = val - cntrs[DCNT_IA_LNK_ERR_RCV];
+	rd_ctr(dev,"port_rcv_remote_physical_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_RCV_REM_PHYS] = val - cntrs[DCNT_IA_LNK_ERR_RCV_REM_PHYS];
+	rd_ctr(dev,"port_rcv_constraint_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_RCV_CONSTRAINT] = val - cntrs[DCNT_IA_LNK_ERR_RCV_CONSTRAINT];
+	rd_ctr(dev,"port_xmit_discards", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_XMT_DISCARDS] = val - cntrs[DCNT_IA_LNK_ERR_XMT_DISCARDS];
+	rd_ctr(dev,"port_xmit_constraint_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_XMT_CONTRAINT] = val - cntrs[DCNT_IA_LNK_ERR_XMT_CONTRAINT];
+	rd_ctr(dev,"local_link_integrity_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_INTEGRITY] = val - cntrs[DCNT_IA_LNK_ERR_INTEGRITY];
+	rd_ctr(dev,"excessive_buffer_overrun_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN] = val - cntrs[DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN];
+	rd_ctr(dev,"port_rcv_switch_relay_errors", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_WARN_RCV_SW_RELAY] = val - cntrs[DCNT_IA_LNK_WARN_RCV_SW_RELAY];
+	rd_ctr(dev,"port_xmit_wait", port, DCNT_IA_LNK, &val);
+	cntrs[DCNT_IA_LNK_WARN_XMT_WAIT] = val - cntrs[DCNT_IA_LNK_WARN_XMT_WAIT];
+}
+
+/* map selective IB diag_counters to dapl counters: snapshot the <ibdev_path>/diag_counters files (port is not part of that path) */
+static void dapl_start_diag_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+
+	rd_ctr(dev,"rq_num_rae", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_RQ_RAE]);
+	rd_ctr(dev,"rq_num_oos", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_RQ_OOS]);
+	rd_ctr(dev,"rq_num_rire", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_RQ_RIRE]);
+	rd_ctr(dev,"rq_num_udsdprd", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_RQ_UDSDPRD]);
+	rd_ctr(dev,"sq_num_rae", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_SQ_RAE]);
+	rd_ctr(dev,"sq_num_oos", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_SQ_OOS]);
+	rd_ctr(dev,"sq_num_rire", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_SQ_RIRE]);
+	rd_ctr(dev,"sq_num_rree", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_SQ_RREE]);
+	rd_ctr(dev,"sq_num_tree", port, DCNT_IA_DIAG, &cntrs[DCNT_IA_DIAG_ERR_SQ_TREE]);
+}
+
+static void dapl_stop_diag_cntrs(DAT_HANDLE dh)
+{
+	DAPL_IA *ia = (DAPL_IA *)dh;
+	char *dev = ia->hca_ptr->ib_hca_handle->device->ibdev_path;
+	int port = ia->hca_ptr->port_num;
+	DAT_UINT64 *cntrs = (DAT_UINT64 *)ia->cntrs;
+	DAT_UINT64 val = 0;
+	/* each entry becomes (current sysfs value - value stored by dapl_start_diag_cntrs) */
+	rd_ctr(dev,"rq_num_rae", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_RQ_RAE] = val - cntrs[DCNT_IA_DIAG_ERR_RQ_RAE];
+	rd_ctr(dev,"rq_num_oos", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_RQ_OOS] = val - cntrs[DCNT_IA_DIAG_ERR_RQ_OOS];
+	rd_ctr(dev,"rq_num_rire", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_RQ_RIRE] = val - cntrs[DCNT_IA_DIAG_ERR_RQ_RIRE];
+	rd_ctr(dev,"rq_num_udsdprd", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_RQ_UDSDPRD] = val - cntrs[DCNT_IA_DIAG_ERR_RQ_UDSDPRD];
+	rd_ctr(dev,"sq_num_rae", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_SQ_RAE] = val - cntrs[DCNT_IA_DIAG_ERR_SQ_RAE];
+	rd_ctr(dev,"sq_num_oos", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_SQ_OOS] = val - cntrs[DCNT_IA_DIAG_ERR_SQ_OOS];
+	rd_ctr(dev,"sq_num_rire", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_SQ_RIRE] = val - cntrs[DCNT_IA_DIAG_ERR_SQ_RIRE];
+	rd_ctr(dev,"sq_num_rree", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_SQ_RREE] = val - cntrs[DCNT_IA_DIAG_ERR_SQ_RREE];
+	rd_ctr(dev,"sq_num_tree", port, DCNT_IA_DIAG, &val);
+	cntrs[DCNT_IA_DIAG_ERR_SQ_TREE] = val - cntrs[DCNT_IA_DIAG_ERR_SQ_TREE];
+}
+
+/* Snapshot the running sysfs counters of the requested class for IA handle dh. */
+void dapl_start_counters(DAT_HANDLE dh, DAT_IA_COUNTER_TYPE type)
+{
+	if (type == DCNT_IA_LNK) {
+		dapl_start_lnk_cntrs(dh);
+		return;
+	}
+	if (type == DCNT_IA_DIAG) {
+		dapl_start_diag_cntrs(dh);
+		return;
+	}
+#ifdef _OPENIB_CMA_
+	/* ib cm timers, cma only */
+	if (type == DCNT_IA_CM)
+		dapl_start_cm_cntrs(dh);
+#endif
+	/* any other type is ignored */
+}
+
+/* Convert the start snapshots of the requested counter class into deltas for IA handle dh. */
+void dapl_stop_counters(DAT_HANDLE dh, DAT_IA_COUNTER_TYPE type)
+{
+	if (type == DCNT_IA_LNK) {
+		dapl_stop_lnk_cntrs(dh);
+		return;
+	}
+	if (type == DCNT_IA_DIAG) {
+		dapl_stop_diag_cntrs(dh);
+		return;
+	}
+
+#ifdef _OPENIB_CMA_
+	/* ib cm counters are only handled in cma builds */
+	if (type == DCNT_IA_CM)
+		dapl_stop_cm_cntrs(dh);
+#endif
+	/* any other type is ignored */
+}
+
+/* Start/stop every counter class selected via DAPL_DBG_TYPE; LINK_WARN also needs the link samples. */
+void dapli_start_counters(DAT_HANDLE dh)
+{
+#ifdef _OPENIB_CMA_
+	if (g_dapl_dbg_type & (DAPL_DBG_TYPE_CM_ERRS | DAPL_DBG_TYPE_CM_STATS))
+		dapl_start_cm_cntrs(dh);
+#endif
+	if (g_dapl_dbg_type & (DAPL_DBG_TYPE_LINK_ERRS | DAPL_DBG_TYPE_LINK_WARN))
+		dapl_start_lnk_cntrs(dh);
+	if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+		dapl_start_diag_cntrs(dh);
+}
+
+void dapli_stop_counters(DAT_HANDLE dh)
+{
+#ifdef _OPENIB_CMA_
+	if (g_dapl_dbg_type & (DAPL_DBG_TYPE_CM_ERRS | DAPL_DBG_TYPE_CM_STATS))
+		dapl_stop_cm_cntrs(dh);
+#endif
+	if (g_dapl_dbg_type & (DAPL_DBG_TYPE_LINK_ERRS | DAPL_DBG_TYPE_LINK_WARN))
+		dapl_stop_lnk_cntrs(dh);
+	if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+		dapl_stop_diag_cntrs(dh);
+	/* print the non-zero counters of each selected class and reset them; CM_STATS wins over CM_ERRS */
+	if (g_dapl_dbg_type & (DAPL_DBG_TYPE_CM_STATS | DAPL_DBG_TYPE_CM_ERRS))
+		dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1,
+			(g_dapl_dbg_type & DAPL_DBG_TYPE_CM_STATS) ? "_CM" : "_CM_ERR");
+	if (g_dapl_dbg_type & DAPL_DBG_TYPE_LINK_ERRS)
+		dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_LNK_ERR");
+	if (g_dapl_dbg_type & DAPL_DBG_TYPE_LINK_WARN)
+		dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_LNK_WARN");
+	if (g_dapl_dbg_type & DAPL_DBG_TYPE_DIAG_ERRS)
+		dapl_print_counter_str(dh, DCNT_IA_ALL_COUNTERS, 1, "_DIAG_ERR");
+}
+
 /*
  * The order of this list must match the DAT counter definitions 
  */
@@ -103,9 +425,69 @@ static char *ia_cntr_names[] = {
 	"DCNT_IA_MEM_FREE",
 	"DCNT_IA_ASYNC_ERROR",
 	"DCNT_IA_ASYNC_QP_ERROR",
-	"DCNT_IA_ASYNC_CQ_ERROR"
+	"DCNT_IA_ASYNC_CQ_ERROR",
+	"DCNT_IA_CM_LISTEN",
+	"DCNT_IA_CM_REQ_TX",
+	"DCNT_IA_CM_REQ_RX",
+	"DCNT_IA_CM_REP_TX",
+	"DCNT_IA_CM_REP_RX",
+	"DCNT_IA_CM_RTU_TX",
+	"DCNT_IA_CM_RTU_RX",
+	"DCNT_IA_CM_USER_REJ_TX",
+	"DCNT_IA_CM_USER_REJ_RX",
+	"DCNT_IA_CM_ACTIVE_EST",
+	"DCNT_IA_CM_PASSIVE_EST",
+	"DCNT_IA_CM_AH_REQ_TX",
+	"DCNT_IA_CM_AH_REQ_RX",
+	"DCNT_IA_CM_AH_RESOLVED",
+	"DCNT_IA_CM_DREQ_TX",
+	"DCNT_IA_CM_DREQ_RX",
+	"DCNT_IA_CM_DREP_TX",
+	"DCNT_IA_CM_DREP_RX",
+	"DCNT_IA_CM_MRA_TX",
+	"DCNT_IA_CM_MRA_RX",
+	"DCNT_IA_CM_REQ_FULLQ_POLL",
+	"DCNT_IA_CM_ERR",
+	"DCNT_IA_CM_ERR_REQ_FULLQ",
+	"DCNT_IA_CM_ERR_REQ_DUP",
+	"DCNT_IA_CM_ERR_REQ_RETRY",
+	"DCNT_IA_CM_ERR_REP_DUP",
+	"DCNT_IA_CM_ERR_REP_RETRY",
+	"DCNT_IA_CM_ERR_RTU_DUP",
+	"DCNT_IA_CM_ERR_RTU_RETRY",
+	"DCNT_IA_CM_ERR_REFUSED",
+	"DCNT_IA_CM_ERR_RESET",
+	"DCNT_IA_CM_ERR_TIMEOUT",
+	"DCNT_IA_CM_ERR_REJ_TX",
+	"DCNT_IA_CM_ERR_REJ_RX",
+	"DCNT_IA_CM_ERR_DREQ_DUP",
+	"DCNT_IA_CM_ERR_DREQ_RETRY",
+	"DCNT_IA_CM_ERR_DREP_DUP",
+	"DCNT_IA_CM_ERR_DREP_RETRY",
+	"DCNT_IA_CM_ERR_MRA_DUP",
+	"DCNT_IA_CM_ERR_MRA_RETRY",
+	"DCNT_IA_CM_ERR_UNEXPECTED",
+	"DCNT_IA_LNK_ERR_RCV",
+	"DCNT_IA_LNK_ERR_RCV_REM_PHYS",
+	"DCNT_IA_LNK_ERR_RCV_CONSTRAINT",
+	"DCNT_IA_LNK_ERR_XMT_DISCARDS",
+	"DCNT_IA_LNK_ERR_XMT_CONTRAINT",
+	"DCNT_IA_LNK_ERR_INTEGRITY",
+	"DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN",
+	"DCNT_IA_LNK_WARN_RCV_SW_RELAY",
+	"DCNT_IA_LNK_WARN_XMT_WAIT",
+	"DCNT_IA_DIAG_ERR_RQ_RAE",
+	"DCNT_IA_DIAG_ERR_RQ_OOS",
+	"DCNT_IA_DIAG_ERR_RQ_RIRE",
+	"DCNT_IA_DIAG_ERR_RQ_UDSDPRD",
+	"DCNT_IA_DIAG_ERR_SQ_RAE",
+	"DCNT_IA_DIAG_ERR_SQ_OOS",
+	"DCNT_IA_DIAG_ERR_SQ_RIRE",
+	"DCNT_IA_DIAG_ERR_SQ_RREE",
+	"DCNT_IA_DIAG_ERR_SQ_TREE",
 };
 
+
 static char *ep_cntr_names[] = {
 	"DCNT_EP_CONNECT",
 	"DCNT_EP_DISCONNECT",
@@ -234,8 +616,9 @@ void dapl_print_counter(DAT_HANDLE dh, int counter, int reset)
 
 	for (i = 0; i < max; i++) {
 		if ((counter == i) || (counter == max)) {
-			printf(" %s = " F64u " \n",
-			       dapl_query_counter_name(dh, i), p_cntrs[i]);
+			printf(" %s:0x%x: %s = " F64u " \n",
+				_hostname_, dapl_os_getpid(),
+			       	dapl_query_counter_name(dh, i), p_cntrs[i]);
 			if (reset)
 				p_cntrs[i] = 0;
 		}
@@ -246,7 +629,47 @@ void dapl_print_counter(DAT_HANDLE dh, int counter, int reset)
 	    (g_dapl_dbg_type & DAPL_DBG_TYPE_CM_LIST)) {
 		dapls_print_cm_list((DAPL_IA*)dh);
 	}
-	return;
+}
+
+/* Print (and optionally reset) the non-zero counters of dh whose name contains pattern. */
+void dapl_print_counter_str(DAT_HANDLE dh, int counter, int reset, const char *pattern)
+{
+	int i, max;
+	DAT_UINT64 *p_cntrs;
+	DAT_HANDLE_TYPE type = 0;
+	char *name;
+
+	dat_get_handle_type(dh, &type);
+
+	switch (type) {
+	case DAT_HANDLE_TYPE_IA:
+		max = DCNT_IA_ALL_COUNTERS;
+		p_cntrs = ((DAPL_IA *) dh)->cntrs;
+		break;
+	case DAT_HANDLE_TYPE_EP:
+		max = DCNT_EP_ALL_COUNTERS;
+		p_cntrs = ((DAPL_EP *) dh)->cntrs;
+		break;
+	case DAT_HANDLE_TYPE_EVD:
+		max = DCNT_EVD_ALL_COUNTERS;
+		p_cntrs = ((DAPL_EVD *) dh)->cntrs;
+		break;
+	default:
+		return;
+	}
+
+	/* counter == max (the xx_ALL_COUNTERS value) selects every counter; zero values are skipped */
+	for (i = 0; i < max; i++) {
+		if (((counter != i) && (counter != max)) || !p_cntrs[i])
+			continue;
+		name = dapl_query_counter_name(dh, i);
+		if (dapl_os_pstrcmp(pattern, name))
+			continue;
+		printf(" %s:0x%x: %s = " F64u " \n",
+			_hostname_, dapl_os_getpid(), name, p_cntrs[i]);
+		if (reset)
+			p_cntrs[i] = 0;
+	}
 }
 
 #endif				/* DAPL_COUNTERS */
diff --git a/dapl/common/dapl_ia_open.c b/dapl/common/dapl_ia_open.c
index edead04..e43d78d 100644
--- a/dapl/common/dapl_ia_open.c
+++ b/dapl/common/dapl_ia_open.c
@@ -266,6 +266,10 @@ dapl_ia_open(IN const DAT_NAME_PTR name,
 	*ia_handle_ptr = ia_ptr;
 	*async_evd_handle_ptr = evd_ptr;
 
+#ifdef DAPL_COUNTERS
+	dapli_start_counters((DAT_HANDLE)ia_ptr);	/* snapshot counters selected via DAPL_DBG_TYPE */
+#endif
+
       bail:
 	if (dat_status != DAT_SUCCESS) {
 		if (ia_ptr) {
diff --git a/dapl/common/dapl_ia_util.c b/dapl/common/dapl_ia_util.c
index 2208c23..6d1b5a8 100755
--- a/dapl/common/dapl_ia_util.c
+++ b/dapl/common/dapl_ia_util.c
@@ -525,6 +525,13 @@ void dapli_ia_release_hca(DAPL_HCA * hca_ptr)
 	dapl_os_lock(&hca_ptr->lock);
 	dapl_os_atomic_dec(&hca_ptr->handle_ref_count);
 	if (dapl_os_atomic_read(&hca_ptr->handle_ref_count) == 0) {
+#ifdef DAPL_COUNTERS
+{
+		DAPL_IA *ia = (DAPL_IA *)dapl_llist_peek_head(&hca_ptr->ia_list_head);
+		dapli_stop_counters(ia);
+		dapl_os_free(ia->cntrs, sizeof(DAT_UINT64) * DCNT_IA_ALL_COUNTERS);
+}
+#endif
 		dapls_ib_close_hca(hca_ptr);
 		hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
 		hca_ptr->async_evd = NULL;
@@ -566,11 +573,6 @@ void dapls_ia_free(DAPL_IA * ia_ptr)
 	dapl_hca_unlink_ia(ia_ptr->hca_ptr, ia_ptr);
 	ia_ptr->header.magic = DAPL_MAGIC_INVALID;	/* reset magic to prevent reuse */
 	dapl_os_lock_destroy(&ia_ptr->header.lock);
-
-#ifdef DAPL_COUNTERS
-	dapl_os_free(ia_ptr->cntrs, sizeof(DAT_UINT64) * DCNT_IA_ALL_COUNTERS);
-#endif				/* DAPL_COUNTERS */
-
 	dapl_os_free(ia_ptr, sizeof(DAPL_IA));
 }
 
diff --git a/dapl/include/dapl_debug.h b/dapl/include/dapl_debug.h
index bb11c3d..6cbe028 100644
--- a/dapl/include/dapl_debug.h
+++ b/dapl/include/dapl_debug.h
@@ -71,7 +71,11 @@ typedef enum
     DAPL_DBG_TYPE_CM_EST  	= 0x8000,
     DAPL_DBG_TYPE_CM_WARN  	= 0x10000,
     DAPL_DBG_TYPE_EXTENSION	= 0x20000,
-    DAPL_DBG_TYPE_CM_STATS	= 0x40000
+    DAPL_DBG_TYPE_CM_STATS	= 0x40000,
+    DAPL_DBG_TYPE_CM_ERRS	= 0x80000,
+    DAPL_DBG_TYPE_LINK_ERRS	= 0x100000,
+    DAPL_DBG_TYPE_LINK_WARN	= 0x200000,
+    DAPL_DBG_TYPE_DIAG_ERRS	= 0x400000,
 
 } DAPL_DBG_TYPE;
 
@@ -100,6 +104,7 @@ extern void dapl_internal_dbg_log(DAPL_DBG_TYPE type,  const char *fmt,  ...);
 
 #define DAPL_CNTR(h_ptr, cntr) ((DAT_UINT64*)h_ptr->cntrs)[cntr]++
 #define DAPL_CNTR_DATA(h_ptr, cntr, data) ((DAT_UINT64*)h_ptr->cntrs)[cntr]+= data
+#define DAPL_CNTR_RESET(h_ptr, cntr) ((DAT_UINT64*)h_ptr->cntrs)[cntr] = 0
 
 DAT_RETURN dapl_query_counter(DAT_HANDLE dh, 
 			      int counter, 
@@ -107,11 +112,17 @@ DAT_RETURN dapl_query_counter(DAT_HANDLE dh,
 			      int reset);
 char *dapl_query_counter_name(DAT_HANDLE dh, int counter);
 void dapl_print_counter(DAT_HANDLE dh, int counter, int reset);
+void dapl_print_counter_str(DAT_HANDLE dh, int counter, int reset, const char *pattern);
+void dapl_start_counters(DAT_HANDLE ia, DAT_IA_COUNTER_TYPE type);
+void dapl_stop_counters(DAT_HANDLE ia, DAT_IA_COUNTER_TYPE type);
+void dapli_start_counters(DAT_HANDLE ia);
+void dapli_stop_counters(DAT_HANDLE ia);
 
 #else
 
 #define DAPL_CNTR(handle, cntr)
 #define DAPL_CNTR_DATA(handle, cntr, data)
+#define DAPL_CNTR_RESET(handle, cntr)
 
 #endif /* DAPL_COUNTERS */
 
diff --git a/dapl/openib_common/dapl_ib_common.h b/dapl/openib_common/dapl_ib_common.h
index e757b65..ba805d0 100644
--- a/dapl/openib_common/dapl_ib_common.h
+++ b/dapl/openib_common/dapl_ib_common.h
@@ -342,7 +342,7 @@ dapl_convert_errno( IN int err, IN const char *str )
     if (!err)  return DAT_SUCCESS;
     	
     if ((err != EAGAIN) && (err != ETIMEDOUT))
-	dapl_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err));
+	dapl_log (DAPL_DBG_TYPE_ERR," DAPL ERR %s %s\n", str, strerror(err));
 
     switch( err )
     {
diff --git a/dapl/openib_common/ib_extensions.c b/dapl/openib_common/ib_extensions.c
index c85323c..0952bd5 100644
--- a/dapl/openib_common/ib_extensions.c
+++ b/dapl/openib_common/ib_extensions.c
@@ -184,6 +184,32 @@ dapl_extensions(IN DAT_HANDLE dat_handle,
 			status = DAT_SUCCESS;
 			break;
 		}
+	case DAT_IB_START_COUNTERS_OP:
+		{
+			DAT_IA_COUNTER_TYPE type;
+
+			dapl_dbg_log(DAPL_DBG_TYPE_RTN,
+				     " Start counter extension call\n");
+
+			type = va_arg(args, int);
+
+			dapl_start_counters(dat_handle, type);
+			status = DAT_SUCCESS;
+			break;
+		}
+	case DAT_IB_STOP_COUNTERS_OP:
+		{
+			DAT_IA_COUNTER_TYPE type;
+
+			dapl_dbg_log(DAPL_DBG_TYPE_RTN,
+				     " Stop counter extension call\n");
+
+			type = va_arg(args, int);	/* enum argument is read as int */
+
+			dapl_stop_counters(dat_handle, type);
+			status = DAT_SUCCESS;
+			break;
+		}
 #endif				/* DAPL_COUNTERS */
 #ifdef DAT_IB_COLLECTIVES
 	case DAT_IB_COLLECTIVE_CREATE_MEMBER_OP:
diff --git a/dapl/udapl/linux/dapl_osd.h b/dapl/udapl/linux/dapl_osd.h
index cb61cae..7198439 100644
--- a/dapl/udapl/linux/dapl_osd.h
+++ b/dapl/udapl/linux/dapl_osd.h
@@ -515,6 +515,22 @@ STATIC _INLINE_ char * dapl_os_strdup(const char *str)
     return strdup(str);
 }
 
+/*
+ * Substring test: returns 0 when pstr occurs anywhere in str, 1 otherwise.
+ * An empty pstr never matches, as before.
+ *
+ * The previous hand-rolled scan did not rewind after a partial match and
+ * therefore missed some occurrences, e.g. "ab" in "aab".
+ */
+STATIC _INLINE_ int dapl_os_pstrcmp(const char *pstr, const char *str)
+{
+	/* keep the historical "empty pattern == no match" result */
+	if (pstr[0] == '\0')
+		return 1;
+
+	/* strstr() checks every start position of str */
+	return strstr(str, pstr) ? 0 : 1;
+}
 
 /*
  * Timer Functions
diff --git a/dat/include/dat2/dat_ib_extensions.h b/dat/include/dat2/dat_ib_extensions.h
index ac69fed..6e3cb9e 100755
--- a/dat/include/dat2/dat_ib_extensions.h
+++ b/dat/include/dat2/dat_ib_extensions.h
@@ -73,9 +73,10 @@
  * 2.0.4 - Add DAT_IB_UD_CONNECTION_REJECT_EVENT extended UD event
  * 2.0.5 - Add DAT_IB_UD extended UD connection error events
  * 2.0.6 - Add MPI over IB collective extensions
+ * 2.0.7 - Add new IA counters for dapl CM, device LINK, device DIAG
  *
  */
-#define DAT_IB_EXTENSION_VERSION	206	/* 2.0.6 */
+#define DAT_IB_EXTENSION_VERSION	207	/* 2.0.7 */
 #define DAT_IB_ATTR_COUNTERS		"DAT_COUNTERS"
 #define DAT_IB_ATTR_FETCH_AND_ADD	"DAT_IB_FETCH_AND_ADD"
 #define DAT_IB_ATTR_CMP_AND_SWAP	"DAT_IB_CMP_AND_SWAP"
@@ -151,6 +152,8 @@ typedef enum dat_ib_op
 	DAT_IB_COLLECTIVE_SCAN_OP,
 	DAT_IB_COLLECTIVE_BROADCAST_OP,
 	DAT_IB_COLLECTIVE_BARRIER_OP,
+	DAT_IB_START_COUNTERS_OP,
+	DAT_IB_STOP_COUNTERS_OP,
 	
 } DAT_IB_OP;
 
@@ -369,6 +372,65 @@ typedef enum dat_ia_counters
 	DCNT_IA_ASYNC_ERROR,
 	DCNT_IA_ASYNC_QP_ERROR,
 	DCNT_IA_ASYNC_CQ_ERROR,
+	DCNT_IA_CM_LISTEN,
+	DCNT_IA_CM_REQ_TX,
+	DCNT_IA_CM_REQ_RX,
+	DCNT_IA_CM_REP_TX,
+	DCNT_IA_CM_REP_RX,
+	DCNT_IA_CM_RTU_TX,
+	DCNT_IA_CM_RTU_RX,
+	DCNT_IA_CM_USER_REJ_TX,
+	DCNT_IA_CM_USER_REJ_RX,
+	DCNT_IA_CM_ACTIVE_EST,
+	DCNT_IA_CM_PASSIVE_EST,
+	DCNT_IA_CM_AH_REQ_TX,
+	DCNT_IA_CM_AH_REQ_RX,
+	DCNT_IA_CM_AH_RESOLVED,
+	DCNT_IA_CM_DREQ_TX,
+	DCNT_IA_CM_DREQ_RX,
+	DCNT_IA_CM_DREP_TX,
+	DCNT_IA_CM_DREP_RX,
+	DCNT_IA_CM_MRA_TX,
+	DCNT_IA_CM_MRA_RX,
+	DCNT_IA_CM_REQ_FULLQ_POLL,
+	DCNT_IA_CM_ERR,
+	DCNT_IA_CM_ERR_REQ_FULLQ,
+	DCNT_IA_CM_ERR_REQ_DUP,
+	DCNT_IA_CM_ERR_REQ_RETRY,
+	DCNT_IA_CM_ERR_REP_DUP,
+	DCNT_IA_CM_ERR_REP_RETRY,
+	DCNT_IA_CM_ERR_RTU_DUP,
+	DCNT_IA_CM_ERR_RTU_RETRY,
+	DCNT_IA_CM_ERR_REFUSED,
+	DCNT_IA_CM_ERR_RESET,
+	DCNT_IA_CM_ERR_TIMEOUT,
+	DCNT_IA_CM_ERR_REJ_TX,
+	DCNT_IA_CM_ERR_REJ_RX,
+	DCNT_IA_CM_ERR_DREQ_DUP,
+	DCNT_IA_CM_ERR_DREQ_RETRY,
+	DCNT_IA_CM_ERR_DREP_DUP,
+	DCNT_IA_CM_ERR_DREP_RETRY,
+	DCNT_IA_CM_ERR_MRA_DUP,
+	DCNT_IA_CM_ERR_MRA_RETRY,
+	DCNT_IA_CM_ERR_UNEXPECTED,
+	DCNT_IA_LNK_ERR_RCV,
+	DCNT_IA_LNK_ERR_RCV_REM_PHYS,
+	DCNT_IA_LNK_ERR_RCV_CONSTRAINT,
+	DCNT_IA_LNK_ERR_XMT_DISCARDS,
+	DCNT_IA_LNK_ERR_XMT_CONTRAINT,
+	DCNT_IA_LNK_ERR_INTEGRITY,
+	DCNT_IA_LNK_ERR_EXC_BUF_OVERRUN,
+	DCNT_IA_LNK_WARN_RCV_SW_RELAY,
+	DCNT_IA_LNK_WARN_XMT_WAIT,
+	DCNT_IA_DIAG_ERR_RQ_RAE,
+	DCNT_IA_DIAG_ERR_RQ_OOS,
+	DCNT_IA_DIAG_ERR_RQ_RIRE,
+	DCNT_IA_DIAG_ERR_RQ_UDSDPRD,
+	DCNT_IA_DIAG_ERR_SQ_RAE,
+	DCNT_IA_DIAG_ERR_SQ_OOS,
+	DCNT_IA_DIAG_ERR_SQ_RIRE,
+	DCNT_IA_DIAG_ERR_SQ_RREE,
+	DCNT_IA_DIAG_ERR_SQ_TREE,
 	DCNT_IA_ALL_COUNTERS,  /* MUST be last */
 
 } DAT_IA_COUNTERS;
@@ -426,6 +488,19 @@ typedef enum dat_evd_counters
 } DAT_EVD_COUNTERS;
 
 /*
+ * Definitions IA Counter Types
+ * 	for sampling running counters
+ *
+ */
+typedef enum dat_ia_counter_type
+{
+	DCNT_IA_CM,
+	DCNT_IA_LNK,
+	DCNT_IA_DIAG,
+
+} DAT_IA_COUNTER_TYPE;
+
+/*
  * Data type for reduce operations
  */
 typedef enum dat_ib_collective_data_type
@@ -655,6 +730,24 @@ dat_strerror_ext_status (
 		IN (int) (reset))
 
 /*
+ * Start and stop counter(s):
+ * Provide IA, call will start or stop sampling of running IB counters
+ * 	DAT_HANDLE dat_handle, DAT_IA_COUNTER_TYPE type (cm, link, diag)
+ *
+ */
+#define dat_ib_start_counter(dat_handle, type) \
+	dat_extension_op(\
+		IN (DAT_HANDLE) (dat_handle), \
+		IN (DAT_IB_OP) DAT_IB_START_COUNTERS_OP, \
+		IN (DAT_IA_COUNTER_TYPE) (type))
+
+#define dat_ib_stop_counter(dat_handle, type) \
+	dat_extension_op(\
+		IN (DAT_HANDLE) (dat_handle), \
+		IN (DAT_IB_OP) DAT_IB_STOP_COUNTERS_OP, \
+		IN (DAT_IA_COUNTER_TYPE) (type))
+
+/*
  ************************ MPI IB Collective Functions ***********************
  */
 
-- 
1.7.3






More information about the ewg mailing list