[openib-general] [PATCH] uDAPL async QP/CQ error handling fixed

Arlin Davis arlin.r.davis at intel.com
Thu Oct 13 11:23:38 PDT 2005


James,

This patch fixes the async error handling and callback mappings; the QP/CQ error
mappings were completely broken. The TODO list is updated as well.
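
For reference, a rough sketch of what a consumer callback looks like against the
new per-QP handler signature (ib_async_qp_handler_t) added in dapl_ib_util.h.
The handler name, debug message, and body below are illustrative only and assume
the dapl_ib_util.h / dapl.h types; they are not part of the patch:

	/* hypothetical handler matching the new ib_async_qp_handler_t */
	static void sample_qp_async_error_cb(
	    IN	ib_hca_handle_t		hca_handle,	/* hca->ib_ctx */
	    IN	ib_qp_handle_t		qp_handle,	/* event.element.qp */
	    IN	ib_error_record_t	*err,		/* the ibv_async_event */
	    IN	void			*context)	/* qp_context, i.e. the dapl_ep */
	{
		struct dapl_ep *ep_ptr = (struct dapl_ep *)context;

		(void)hca_handle;	/* unused in this sketch */

		dapl_dbg_log(DAPL_DBG_TYPE_WARN,
			     " qp_async_error_cb: ep %p qp %p event %d\n",
			     ep_ptr, qp_handle, err->event_type);

		/* a real handler would translate err->event_type into the
		 * matching DAT async error and post it on the EP's EVD */
	}

The CQ error case follows the same pattern with ib_async_cq_handler_t, except the
context passed up is the EVD taken from cq_context.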

-arlin

Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>


Index: dapl/openib/TODO
===================================================================
--- dapl/openib/TODO	(revision 3768)
+++ dapl/openib/TODO	(working copy)
@@ -1,12 +1,10 @@
 
 IB Verbs:
 - CQ resize
-- mulitple CQ event support
 - memory window support
 
 DAPL:
 - reinit EP needs a QP timewait completion notification
-- direct cq_wait_object when multi-CQ verbs event support arrives
 - shared receive queue support
 
 Under discussion:
Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c	(revision 3768)
+++ dapl/openib/dapl_ib_util.c	(working copy)
@@ -214,8 +214,11 @@ DAT_RETURN dapls_ib_open_hca (
 	/* Get list of all IB devices, find match, open */
 	dev_list = ibv_get_devices();
 	dlist_start(dev_list);
-	dlist_for_each_data(dev_list,hca_ptr->ib_trans.ib_dev,struct ibv_device) {
-		if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev),hca_name))
+	dlist_for_each_data(dev_list,
+			    hca_ptr->ib_trans.ib_dev,
+			    struct ibv_device) {
+		if (!strcmp(ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
+						hca_name))
 			break;
 	}
 
@@ -226,20 +229,22 @@ DAT_RETURN dapls_ib_open_hca (
 		return DAT_INTERNAL_ERROR;
 	}
 	
-	dapl_dbg_log (DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n", 
-			ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
-			(unsigned long long)bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev)));
+	dapl_dbg_log (
+	    DAPL_DBG_TYPE_UTIL," open_hca: Found dev %s %016llx\n", 
+	    ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
+	    (unsigned long long)
+		bswap_64(ibv_get_device_guid(hca_ptr->ib_trans.ib_dev)));
 
 	hca_ptr->ib_hca_handle = ibv_open_device(hca_ptr->ib_trans.ib_dev);
 	if (!hca_ptr->ib_hca_handle) {
 		dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
 			      " open_hca: IB dev open failed for %s\n", 
-			      ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
+			      ibv_get_device_name(hca_ptr->ib_trans.ib_dev));
 		return DAT_INTERNAL_ERROR;
 	}
 	hca_ptr->ib_trans.ib_ctx = hca_ptr->ib_hca_handle;
 
-	/* set inline max with enviromment or default, get local lid and gid 0 */
+	/* set inline max with env or default, get local lid and gid 0 */
 	hca_ptr->ib_trans.max_inline_send = 
 		dapl_os_get_env_val("DAPL_MAX_INLINE", INLINE_SEND_DEFAULT);
 
@@ -253,15 +258,17 @@ DAT_RETURN dapls_ib_open_hca (
 	}
 			
 	dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
-		     " open_hca: GID subnet %016llx id %016llx\n",
-		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
-		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+		" open_hca: GID subnet %016llx id %016llx\n",
+		(unsigned long long)
+			bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+		(unsigned long long)
+			bswap_64(hca_ptr->ib_trans.gid.global.interface_id));
 
 	/* get the IP address of the device using GID */
 	if (dapli_get_hca_addr(hca_ptr)) {
 		dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
 			      " open_hca: ERR ib_at_ips_by_gid for %s\n", 
-			      ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
+			      ibv_get_device_name(hca_ptr->ib_trans.ib_dev));
 		goto bail;
 	}
 
@@ -310,15 +317,23 @@ DAT_RETURN dapls_ib_open_hca (
 	write(g_ib_pipe[1], "w", sizeof "w");
 	dapl_os_unlock(&g_hca_lock);
 	
-  	dapl_dbg_log (DAPL_DBG_TYPE_UTIL, 
-		      " open_hca: %s, port %d, %s  %d.%d.%d.%d INLINE_MAX=%d\n", 
-		      ibv_get_device_name(hca_ptr->ib_trans.ib_dev), hca_ptr->port_num,
-		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_family == AF_INET ? "AF_INET":"AF_INET6",
-		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff,
-		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff,
-		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff,
-		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
-		      hca_ptr->ib_trans.max_inline_send );
+  	dapl_dbg_log (
+		DAPL_DBG_TYPE_UTIL, 
+		" open_hca: %s, port %d, %s  %d.%d.%d.%d INLINE_MAX=%d\n",
+		ibv_get_device_name(hca_ptr->ib_trans.ib_dev), 
+		hca_ptr->port_num,
+		((struct sockaddr_in *)
+			&hca_ptr->hca_address)->sin_family == AF_INET ?  
+			"AF_INET":"AF_INET6",
+		((struct sockaddr_in *)
+			&hca_ptr->hca_address)->sin_addr.s_addr >> 0 & 0xff,
+		((struct sockaddr_in *)
+			&hca_ptr->hca_address)->sin_addr.s_addr >> 8 & 0xff,
+		((struct sockaddr_in *)
+			&hca_ptr->hca_address)->sin_addr.s_addr >> 16 & 0xff,
+		((struct sockaddr_in *)
+			&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
+		hca_ptr->ib_trans.max_inline_send );
 
 	hca_ptr->ib_trans.d_hca = hca_ptr;
 	return DAT_SUCCESS;
@@ -370,7 +385,7 @@ DAT_RETURN dapls_ib_close_hca (	IN   DAP
 		sleep.tv_sec = 0;
 		sleep.tv_nsec = 10000000; /* 10 ms */
 		dapl_dbg_log(DAPL_DBG_TYPE_UTIL, 
-			     " ib_thread_destroy: waiting on hca %p destroy\n");
+			     " ib_thread_destroy: wait on hca %p destroy\n");
 		nanosleep (&sleep, &remain);
 	}
 	return (DAT_SUCCESS);
@@ -425,19 +440,26 @@ DAT_RETURN dapls_ib_query_hca (
 	if (ia_attr != NULL) {
 		ia_attr->adapter_name[DAT_NAME_MAX_LENGTH - 1] = '\0';
 		ia_attr->vendor_name[DAT_NAME_MAX_LENGTH - 1] = '\0';
-		ia_attr->ia_address_ptr = (DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address;
+		ia_attr->ia_address_ptr = 
+			(DAT_IA_ADDRESS_PTR)&hca_ptr->hca_address;
 
 		dapl_dbg_log (DAPL_DBG_TYPE_UTIL, 
 			" query_hca: %s %s  %d.%d.%d.%d\n", 
 			ibv_get_device_name(hca_ptr->ib_trans.ib_dev),
-			((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_family == AF_INET ? "AF_INET":"AF_INET6",
-			((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff,
-			((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff,
-			((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff,
-			((struct sockaddr_in *)ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff );
+			((struct sockaddr_in *)
+			ia_attr->ia_address_ptr)->sin_family == AF_INET ? 
+			"AF_INET":"AF_INET6",
+			((struct sockaddr_in *)
+			ia_attr->ia_address_ptr)->sin_addr.s_addr >> 0 & 0xff,
+			((struct sockaddr_in *)
+			ia_attr->ia_address_ptr)->sin_addr.s_addr >> 8 & 0xff,
+			((struct sockaddr_in *)
+			ia_attr->ia_address_ptr)->sin_addr.s_addr >> 16 & 0xff,
+			((struct sockaddr_in *)
+			ia_attr->ia_address_ptr)->sin_addr.s_addr >> 24 & 0xff);
 		
 		ia_attr->hardware_version_major   = dev_attr.hw_ver;
-		ia_attr->hardware_version_minor   = dev_attr.fw_ver;
+		/* ia_attr->hardware_version_minor   = dev_attr.fw_ver; */
 		ia_attr->max_eps                  = dev_attr.max_qp;
 		ia_attr->max_dto_per_ep           = dev_attr.max_qp_wr;
 		ia_attr->max_rdma_read_per_ep     = dev_attr.max_qp_rd_atom;
@@ -468,7 +490,6 @@ DAT_RETURN dapls_ib_query_hca (
 			ia_attr->max_mtu_size, ia_attr->max_rdma_size,
 			ia_attr->max_iov_segments_per_dto, ia_attr->max_lmrs, 
 			ia_attr->max_rmrs );
-
 	}
 	
 	if (ep_attr != NULL) {
@@ -522,27 +543,28 @@ DAT_RETURN dapls_ib_setup_async_callback
 	ib_hca_transport_t	*hca_ptr;
 
 	dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
-			" setup_async_cb: ia %p type %d handle %p cb %p ctx %p\n",
+			" setup_async_cb: ia %p type %d hdl %p cb %p ctx %p\n",
 			ia_ptr, handler_type, evd_ptr, callback, context);
 
 	hca_ptr = &ia_ptr->hca_ptr->ib_trans;
 	switch(handler_type)
 	{
 		case DAPL_ASYNC_UNAFILIATED:
-			hca_ptr->async_unafiliated = callback;
+			hca_ptr->async_unafiliated = 
+				(ib_async_handler_t)callback;
 			hca_ptr->async_un_ctx = context;
 			break;
 		case DAPL_ASYNC_CQ_ERROR:
-			hca_ptr->async_cq_error = callback;
-			hca_ptr->async_cq_ctx = context;
+			hca_ptr->async_cq_error = 
+				(ib_async_cq_handler_t)callback;
 			break;
 		case DAPL_ASYNC_CQ_COMPLETION:
-			hca_ptr->async_cq = callback;
-			hca_ptr->async_ctx = context;
+			hca_ptr->async_cq = 
+				(ib_async_dto_handler_t)callback;
 			break;
 		case DAPL_ASYNC_QP_ERROR:
-			hca_ptr->async_qp_error = callback;
-			hca_ptr->async_qp_ctx = context;
+			hca_ptr->async_qp_error = 
+				(ib_async_qp_handler_t)callback;
 			break;
 		default:
 			break;
@@ -573,7 +595,6 @@ void dapli_ib_thread_destroy(void)
 	int retries = 10;
 	dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
 		     " ib_thread_destroy(%d)\n", getpid());
-
 	/* 
 	 * wait for async thread to terminate. 
 	 * pthread_join would be the correct method
@@ -623,34 +644,42 @@ void dapli_async_event_cb(struct _ib_hca
 
 			case	IBV_EVENT_CQ_ERR:
 			{
-				dapl_dbg_log(DAPL_DBG_TYPE_WARN,
-					     " dapli_async_event CQ ERR %d\n",
-					     event.event_type);				
+				struct dapl_ep *evd_ptr = 
+					event.element.cq->cq_context;
+
+				dapl_dbg_log(
+					DAPL_DBG_TYPE_WARN,
+					" dapli_async_event CQ (%p) ERR %d\n",
+					evd_ptr, event.event_type);				
 				
 				/* report up if async callback still setup */
 				if (hca->async_cq_error)
 					hca->async_cq_error(hca->ib_ctx,
+							    event.element.cq,	
 							    &event,
-							    hca->async_cq_ctx);
+							    (void*)evd_ptr);
 				break;
 			}
 			case	IBV_EVENT_COMM_EST:
 			{
-				/* Received messages on connected QP before RTU */
-				struct dapl_ep *ep_ptr = event.element.qp->qp_context;
+				/* Received msgs on connected QP before RTU */
+				struct dapl_ep *ep_ptr = 
+					event.element.qp->qp_context;
 				
 				/* TODO: cannot process COMM_EST until ibv  
 				 * guarantees valid QP context for events. 
 				 * Race conditions exist with QP destroy call. 
 				 * For now, assume the RTU will arrive.
 				 */
-				dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
-					     " dapli_async_event COMM_EST (qp=%p)\n",
-					     event.element.qp);	
+				dapl_dbg_log(
+					DAPL_DBG_TYPE_UTIL,
+					" dapli_async_event COMM_EST(qp=%p)\n",
+					event.element.qp);	
 
 				if (!DAPL_BAD_HANDLE(ep_ptr, DAPL_MAGIC_EP) &&
 				    ep_ptr->cm_handle != IB_INVALID_HANDLE)
-					ib_cm_establish(ep_ptr->cm_handle->cm_id);
+					ib_cm_establish(
+						ep_ptr->cm_handle->cm_id);
 			
 				break;
 			}
@@ -662,15 +691,20 @@ void dapli_async_event_cb(struct _ib_hca
 			case	IBV_EVENT_SRQ_LIMIT_REACHED:
 			case	IBV_EVENT_SQ_DRAINED:
 			{
-				dapl_dbg_log(DAPL_DBG_TYPE_WARN,
-					     " dapli_async_event QP ERR %d\n",
-					     event.event_type);	
+				struct dapl_ep *ep_ptr = 
+					event.element.qp->qp_context;
+
+				dapl_dbg_log(
+					DAPL_DBG_TYPE_WARN,
+					" dapli_async_event QP (%p) ERR %d\n",
+					ep_ptr, event.event_type);	
 				
 				/* report up if async callback still setup */
 				if (hca->async_qp_error)
 					hca->async_qp_error(hca->ib_ctx,
+							    event.element.qp,
 							    &event,
-							    hca->async_qp_ctx);
+							    (void*)ep_ptr);
 				break;
 			}
 			case	IBV_EVENT_PATH_MIG:
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h	(revision 3768)
+++ dapl/openib/dapl_ib_util.h	(working copy)
@@ -141,7 +141,7 @@ typedef enum	ibv_send_flags	ib_send_op_t
 typedef	struct	ibv_sge		ib_data_segment_t;
 typedef enum	ibv_qp_state	ib_qp_state_t;
 typedef	enum	ibv_event_type	ib_async_event_type;
-typedef struct	ibv_async_event	ib_error_record_t;	
+typedef struct	ibv_async_event	ib_error_record_t;
 
 /* CQ notifications */
 typedef enum
@@ -222,12 +222,30 @@ typedef struct ibv_comp_channel *ib_wait
  * ibv_post_recv - Return 0, -1 & bad_wr 
  */
 
-/* async handler for CQ, QP, and unafiliated */
+/* async handler for DTO, CQ, QP, and unafiliated */
+typedef void (*ib_async_dto_handler_t)(
+    IN    ib_hca_handle_t    ib_hca_handle,
+    IN    ib_error_record_t  *err_code,
+    IN    void               *context);
+
+typedef void (*ib_async_cq_handler_t)(
+    IN    ib_hca_handle_t    ib_hca_handle,
+    IN    ib_cq_handle_t     ib_cq_handle,
+    IN    ib_error_record_t  *err_code,
+    IN    void               *context);
+
+typedef void (*ib_async_qp_handler_t)(
+    IN    ib_hca_handle_t    ib_hca_handle,
+    IN    ib_qp_handle_t     ib_qp_handle,
+    IN    ib_error_record_t  *err_code,
+    IN    void               *context);
+
 typedef void (*ib_async_handler_t)(
     IN    ib_hca_handle_t    ib_hca_handle,
     IN    ib_error_record_t  *err_code,
     IN    void               *context);
 
+
 /* ib_hca_transport_t, specific to this implementation */
 typedef struct _ib_hca_transport
 { 
@@ -244,12 +262,9 @@ typedef struct _ib_hca_transport
 	union ibv_gid		gid;
 	ib_async_handler_t	async_unafiliated;
 	void			*async_un_ctx;
-	ib_async_handler_t	async_cq_error;
-	void			*async_ctx;
-	ib_async_handler_t	async_cq;
-	void			*async_cq_ctx;
-	ib_async_handler_t	async_qp_error;
-	void			*async_qp_ctx;
+	ib_async_cq_handler_t	async_cq_error;
+	ib_async_dto_handler_t	async_cq;
+	ib_async_qp_handler_t	async_qp_error;
 
 } ib_hca_transport_t;
