[openib-general] [PATCH] uDAPL async QP/CQ error handling fixed
Arlin Davis
arlin.r.davis at intel.com
Thu Oct 13 11:23:38 PDT 2005
James,
This patch fixes the async error handling and callback mappings; the QP/CQ error callback
mappings were incorrect. It also updates the TODO list.
-arlin
Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>
Index: dapl/openib/TODO
===================================================================
--- dapl/openib/TODO (revision 3768)
+++ dapl/openib/TODO (working copy)
@@ -1,12 +1,10 @@
IB Verbs:
- CQ resize
-- mulitple CQ event support
- memory window support
DAPL:
- reinit EP needs a QP timewait completion notification
-- direct cq_wait_object when multi-CQ verbs event support arrives
- shared receive queue support
Under discussion:
Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c (revision 3768)
+++ dapl/openib/dapl_ib_util.c (working copy)
@@ -214,8 +214,11 @@ DAT_RETURN dapls_ib_open_hca (
/* Get list of all IB devices, find match, open */
dev_list = ibv_get_devices();
dlist_start(dev_list);
- dlist_for_each_data(dev_list,hca_ptr->ib_trans.ib_dev,struct ibv_device) {
- if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev),hca_name))
+ dlist_for_each_data(dev_list,
+ hca_ptr->ib_trans.ib_dev,
+ struct ibv_device) {
+ if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
+ hca_name))
break;
}
@@ -226,20 +229,22 @@ DAT_RETURN dapls_ib_open_hca (
return DAT_INTERNAL_ERROR;
}
- dapl_dbg_log (DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n",
- ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
- (unsigned long long)bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev)));
+ dapl_dbg_log (
+ DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n",
+ ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
+ (unsigned long long)
+ bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev)));
hca_ptr->ib_hca_handle = ibv_open_device(hca_ptr->ib_trans.ib_dev);
if (!hca_ptr->ib_hca_handle) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
" open_hca: IB dev open failed for %s\n",
- ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
+ ibv_get_device_name(hca_ptr->ib_trans.ib_dev));
return DAT_INTERNAL_ERROR;
}
hca_ptr->ib_trans.ib_ctx = hca_ptr->ib_hca_handle;
- /* set inline max with enviromment or default, get local lid and gid 0 */
+ /* set inline max with env or default, get local lid and gid 0 */
hca_ptr->ib_trans.max_inline_send =
dapl_os_get_env_val("DAPL_MAX_INLINE", INLINE_SEND_DEFAULT);
@@ -253,15 +258,17 @@ DAT_RETURN dapls_ib_open_hca (
}
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " open_hca: GID subnet %016llx id %016llx\n",
- (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
- (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+ " open_hca: GID subnet %016llx id %016llx\n",
+ (unsigned long long)
+ bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+ (unsigned long long)
+ bswap_64(hca_ptr->ib_trans.gid.global.interface_id));
/* get the IP address of the device using GID */
if (dapli_get_hca_addr(hca_ptr)) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
" open_hca: ERR ib_at_ips_by_gid for %s\n",
- ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
+ ibv_get_device_name(hca_ptr->ib_trans.ib_dev));
goto bail;
}
@@ -310,15 +317,23 @@ DAT_RETURN dapls_ib_open_hca (
write(g_ib_pipe[1], "w", sizeof "w");
dapl_os_unlock(&g_hca_lock);
- dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
- " open_hca: %s, port %d, %s %d.%d.%d.%d INLINE_MAX=%d\n",
- ibv_get_device_name(hca_ptr->ib_trans.ib_dev), hca_ptr->port_num,
- ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_family == AF_INET ? "AF_INET":"AF_INET6",
- ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff,
- ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff,
- ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff,
- ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
- hca_ptr->ib_trans.max_inline_send );
+ dapl_dbg_log (
+ DAPL_DBG_TYPE_UTIL,
+ " open_hca: %s, port %d, %s %d.%d.%d.%d INLINE_MAX=%d\n",
+ ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
+ hca_ptr->port_num,
+ ((struct sockaddr_in *)
+ &hca_ptr->hca_address)->sin_family == AF_INET ?
+ "AF_INET":"AF_INET6",
+ ((struct sockaddr_in *)
+ &hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff,
+ ((struct sockaddr_in *)
+ &hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff,
+ ((struct sockaddr_in *)
+ &hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff,
+ ((struct sockaddr_in *)
+ &hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
+ hca_ptr->ib_trans.max_inline_send );
hca_ptr->ib_trans.d_hca = hca_ptr;
return DAT_SUCCESS;
@@ -370,7 +385,7 @@ DAT_RETURN dapls_ib_close_hca ( IN DAP
sleep.tv_sec = 0;
sleep.tv_nsec = 10000000; /* 10 ms */
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " ib_thread_destroy: waiting on hca %p destroy\n");
+ " ib_thread_destroy: wait on hca %p destroy\n");
nanosleep (&sleep, &remain);
}
return (DAT_SUCCESS);
@@ -425,19 +440,26 @@ DAT_RETURN dapls_ib_query_hca (
if (ia_attr != NULL) {
ia_attr->adapter_name[DAT_NAME_MAX_LENGTH - 1] = '\0';
ia_attr->vendor_name[DAT_NAME_MAX_LENGTH - 1] = '\0';
- ia_attr->ia_address_ptr = (DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address;
+ ia_attr->ia_address_ptr =
+ (DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address;
dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
" query_hca: %s %s %d.%d.%d.%d\n",
ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
- ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_family == AF_INET ? "AF_INET":"AF_INET6",
- ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff,
- ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff,
- ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff,
- ((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff );
+ ((struct sockaddr_in *)
+ ia_attr->ia_address_ptr)->sin_family == AF_INET ?
+ "AF_INET":"AF_INET6",
+ ((struct sockaddr_in *)
+ ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff,
+ ((struct sockaddr_in *)
+ ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff,
+ ((struct sockaddr_in *)
+ ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff,
+ ((struct sockaddr_in *)
+ ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff);
ia_attr->hardware_version_major = dev_attr.hw_ver;
- ia_attr->hardware_version_minor = dev_attr.fw_ver;
+ /* ia_attr->hardware_version_minor = dev_attr.fw_ver; */
ia_attr->max_eps = dev_attr.max_qp;
ia_attr->max_dto_per_ep = dev_attr.max_qp_wr;
ia_attr->max_rdma_read_per_ep = dev_attr.max_qp_rd_atom;
@@ -468,7 +490,6 @@ DAT_RETURN dapls_ib_query_hca (
ia_attr->max_mtu_size, ia_attr->max_rdma_size,
ia_attr->max_iov_segments_per_dto, ia_attr->max_lmrs,
ia_attr->max_rmrs );
-
}
if (ep_attr != NULL) {
@@ -522,27 +543,28 @@ DAT_RETURN dapls_ib_setup_async_callback
ib_hca_transport_t *hca_ptr;
dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
- " setup_async_cb: ia %p type %d handle %p cb %p ctx %p\n",
+ " setup_async_cb: ia %p type %d hdl %p cb %p ctx %p\n",
ia_ptr, handler_type, evd_ptr, callback, context);
hca_ptr = &ia_ptr->hca_ptr->ib_trans;
switch(handler_type)
{
case DAPL_ASYNC_UNAFILIATED:
- hca_ptr->async_unafiliated = callback;
+ hca_ptr->async_unafiliated =
+ (ib_async_handler_t)callback;
hca_ptr->async_un_ctx = context;
break;
case DAPL_ASYNC_CQ_ERROR:
- hca_ptr->async_cq_error = callback;
- hca_ptr->async_cq_ctx = context;
+ hca_ptr->async_cq_error =
+ (ib_async_cq_handler_t)callback;
break;
case DAPL_ASYNC_CQ_COMPLETION:
- hca_ptr->async_cq = callback;
- hca_ptr->async_ctx = context;
+ hca_ptr->async_cq =
+ (ib_async_dto_handler_t)callback;
break;
case DAPL_ASYNC_QP_ERROR:
- hca_ptr->async_qp_error = callback;
- hca_ptr->async_qp_ctx = context;
+ hca_ptr->async_qp_error =
+ (ib_async_qp_handler_t)callback;
break;
default:
break;
@@ -573,7 +595,6 @@ void dapli_ib_thread_destroy(void)
int retries = 10;
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
" ib_thread_destroy(%d)\n", getpid());
-
/*
* wait for async thread to terminate.
* pthread_join would be the correct method
@@ -623,34 +644,42 @@ void dapli_async_event_cb(struct _ib_hca
case IBV_EVENT_CQ_ERR:
{
- dapl_dbg_log(DAPL_DBG_TYPE_WARN,
- " dapli_async_event CQ ERR %d\n",
- event.event_type);
+ struct dapl_ep *evd_ptr =
+ event.element.cq->cq_context;
+
+ dapl_dbg_log(
+ DAPL_DBG_TYPE_WARN,
+ " dapli_async_event CQ (%p) ERR %d\n",
+ evd_ptr, event.event_type);
/* report up if async callback still setup */
if (hca->async_cq_error)
hca->async_cq_error(hca->ib_ctx,
+ event.element.cq,
&event,
- hca->async_cq_ctx);
+ (void*)evd_ptr);
break;
}
case IBV_EVENT_COMM_EST:
{
- /* Received messages on connected QP before RTU */
- struct dapl_ep *ep_ptr = event.element.qp->qp_context;
+ /* Received msgs on connected QP before RTU */
+ struct dapl_ep *ep_ptr =
+ event.element.qp->qp_context;
/* TODO: cannot process COMM_EST until ibv
* guarantees valid QP context for events.
* Race conditions exist with QP destroy call.
* For now, assume the RTU will arrive.
*/
- dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
- " dapli_async_event COMM_EST (qp=%p)\n",
- event.element.qp);
+ dapl_dbg_log(
+ DAPL_DBG_TYPE_UTIL,
+ " dapli_async_event COMM_EST(qp=%p)\n",
+ event.element.qp);
if (!DAPL_BAD_HANDLE(ep_ptr, DAPL_MAGIC_EP) &&
ep_ptr->cm_handle != IB_INVALID_HANDLE)
- ib_cm_establish(ep_ptr->cm_handle->cm_id);
+ ib_cm_establish(
+ ep_ptr->cm_handle->cm_id);
break;
}
@@ -662,15 +691,20 @@ void dapli_async_event_cb(struct _ib_hca
case IBV_EVENT_SRQ_LIMIT_REACHED:
case IBV_EVENT_SQ_DRAINED:
{
- dapl_dbg_log(DAPL_DBG_TYPE_WARN,
- " dapli_async_event QP ERR %d\n",
- event.event_type);
+ struct dapl_ep *ep_ptr =
+ event.element.qp->qp_context;
+
+ dapl_dbg_log(
+ DAPL_DBG_TYPE_WARN,
+ " dapli_async_event QP (%p) ERR %d\n",
+ ep_ptr, event.event_type);
/* report up if async callback still setup */
if (hca->async_qp_error)
hca->async_qp_error(hca->ib_ctx,
+ event.element.qp,
&event,
- hca->async_qp_ctx);
+ (void*)ep_ptr);
break;
}
case IBV_EVENT_PATH_MIG:
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h (revision 3768)
+++ dapl/openib/dapl_ib_util.h (working copy)
@@ -141,7 +141,7 @@ typedef enum ibv_send_flags ib_send_op_t
typedef struct ibv_sge ib_data_segment_t;
typedef enum ibv_qp_state ib_qp_state_t;
typedef enum ibv_event_type ib_async_event_type;
-typedef struct ibv_async_event ib_error_record_t;
+typedef struct ibv_async_event ib_error_record_t;
/* CQ notifications */
typedef enum
@@ -222,12 +222,30 @@ typedef struct ibv_comp_channel *ib_wait
* ibv_post_recv - Return 0, -1 & bad_wr
*/
-/* async handler for CQ, QP, and unafiliated */
+/* async handler for DTO, CQ, QP, and unafiliated */
+typedef void (*ib_async_dto_handler_t)(
+ IN ib_hca_handle_t ib_hca_handle,
+ IN ib_error_record_t *err_code,
+ IN void *context);
+
+typedef void (*ib_async_cq_handler_t)(
+ IN ib_hca_handle_t ib_hca_handle,
+ IN ib_cq_handle_t ib_cq_handle,
+ IN ib_error_record_t *err_code,
+ IN void *context);
+
+typedef void (*ib_async_qp_handler_t)(
+ IN ib_hca_handle_t ib_hca_handle,
+ IN ib_qp_handle_t ib_qp_handle,
+ IN ib_error_record_t *err_code,
+ IN void *context);
+
typedef void (*ib_async_handler_t)(
IN ib_hca_handle_t ib_hca_handle,
IN ib_error_record_t *err_code,
IN void *context);
+
/* ib_hca_transport_t, specific to this implementation */
typedef struct _ib_hca_transport
{
@@ -244,12 +262,9 @@ typedef struct _ib_hca_transport
union ibv_gid gid;
ib_async_handler_t async_unafiliated;
void *async_un_ctx;
- ib_async_handler_t async_cq_error;
- void *async_ctx;
- ib_async_handler_t async_cq;
- void *async_cq_ctx;
- ib_async_handler_t async_qp_error;
- void *async_qp_ctx;
+ ib_async_cq_handler_t async_cq_error;
+ ib_async_dto_handler_t async_cq;
+ ib_async_qp_handler_t async_qp_error;
} ib_hca_transport_t;
More information about the general
mailing list