[openib-general] [PATCH 1.0] uDAPL - QP destroy and HCA close problems fixed

Arlin Davis arlin.r.davis at intel.com
Mon Feb 27 15:11:53 PST 2006


James,

Here is a small uDAPL patch that should go into 1.0. It fixes some issues that we just found with
MPI scale-out testing on OpenIB: the QP was not being destroyed in some cases, and hca_close had
problems with the async work thread. I am still working on one other elusive disconnect problem
that may require another small patch.

Thanks,

-arlin

Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>

Index: dapl/openib_cma/dapl_ib_util.c
===================================================================
--- dapl/openib_cma/dapl_ib_util.c	(revision 5489)
+++ dapl/openib_cma/dapl_ib_util.c	(working copy)
@@ -330,6 +330,13 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
 		hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
 	}
 
+	dapl_os_lock(&g_hca_lock);
+	if (g_ib_thread_state != IB_THREAD_RUN) {
+		dapl_os_unlock(&g_hca_lock);
+		goto bail;
+	}
+	dapl_os_unlock(&g_hca_lock);
+
 	/* 
 	 * Remove hca from async and CQ event processing list
 	 * Wakeup work thread to remove from polling list
@@ -342,10 +349,12 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
 		struct timespec	sleep, remain;
 		sleep.tv_sec = 0;
 		sleep.tv_nsec = 10000000; /* 10 ms */
+		write(g_ib_pipe[1], "w", sizeof "w");
 		dapl_dbg_log(DAPL_DBG_TYPE_UTIL, 
 			     " ib_thread_destroy: wait on hca %p destroy\n");
 		nanosleep (&sleep, &remain);
 	}
+bail:
 	return (DAT_SUCCESS);
 }
   
Index: dapl/openib_cma/dapl_ib_cm.c
===================================================================
--- dapl/openib_cma/dapl_ib_cm.c	(revision 5489)
+++ dapl/openib_cma/dapl_ib_cm.c	(working copy)
@@ -306,15 +306,6 @@ static int dapli_cm_active_cb(struct dap
 	destroy = conn->destroy;
 	conn->in_callback = conn->destroy;
 	dapl_os_unlock(&conn->lock);
-	if (destroy) {
-		dapl_dbg_log(DAPL_DBG_TYPE_CM, 
-			     " active_cb: DESTROY conn %p id %d \n",
-			     conn, conn->cm_id );
-		if (conn->ep)
-			conn->ep->cm_handle = IB_INVALID_HANDLE;
-		
-		dapl_os_free(conn, sizeof(*conn));
-	}
 	return(destroy);
 }
 
@@ -389,12 +380,6 @@ static int dapli_cm_passive_cb(struct da
 	destroy = conn->destroy;
 	conn->in_callback = conn->destroy;
 	dapl_os_unlock(&conn->lock);
-	if (destroy) {
-		if (conn->ep)
-			conn->ep->cm_handle = IB_INVALID_HANDLE;
-
-		dapl_os_free(conn, sizeof(*conn));
-	}
 	return(destroy);
 }
 
@@ -1080,10 +1065,21 @@ void dapli_cma_event_cb(void)
 				ret = dapli_cm_passive_cb(conn,event);
 			else 
 				ret = dapli_cm_active_cb(conn,event);
-			
-			if (ret) 
+
+			/* destroy both qp and cm_id */
+			if (ret) {
+				dapl_dbg_log(DAPL_DBG_TYPE_CM, 
+					     " cma_cb: DESTROY conn %p" 
+					     " cm_id %p qp %p\n",
+					     conn, conn->cm_id, 
+					     conn->cm_id->qp);
+	
+				if (conn->cm_id->qp)
+					rdma_destroy_qp(conn->cm_id);
+
 				rdma_destroy_id(conn->cm_id);
-			
+				dapl_os_free(conn, sizeof(*conn));
+			}
 			break;
 		case RDMA_CM_EVENT_CONNECT_RESPONSE:
 		default:
@@ -1095,7 +1091,7 @@ void dapli_cma_event_cb(void)
 		}
 		rdma_ack_cm_event(event);
 	} else {
-		dapl_dbg_log(DAPL_DBG_TYPE_WARN,
+		dapl_dbg_log(DAPL_DBG_TYPE_CM,
 			" cm_event: ERROR: rdma_get_cm_event() %d %d %s\n",
 			ret, errno, strerror(errno));
 	}
Index: dapl/openib_cma/dapl_ib_util.h
===================================================================
--- dapl/openib_cma/dapl_ib_util.h	(revision 5489)
+++ dapl/openib_cma/dapl_ib_util.h	(working copy)
@@ -295,7 +295,8 @@ dapl_convert_errno( IN int err, IN const
     if (!err)	return DAT_SUCCESS;
     	
 #if DAPL_DBG
-    if ((err != EAGAIN) && (err != ETIME) && (err != ETIMEDOUT))
+    if ((err != EAGAIN) && (err != ETIME) && 
+	(err != ETIMEDOUT) && (err != EINTR))
 	dapl_dbg_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err));
 #endif 
 
Index: dapl/openib_cma/dapl_ib_cq.c
===================================================================
--- dapl/openib_cma/dapl_ib_cq.c	(revision 5489)
+++ dapl/openib_cma/dapl_ib_cq.c	(working copy)
@@ -498,7 +498,10 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
 	if (timeout != DAT_TIMEOUT_INFINITE)
 		timeout_ms = timeout/1000;
 
-	status = poll(&cq_fd, 1, timeout_ms);
+	/* restart syscall */
+	while ((status = poll(&cq_fd, 1, timeout_ms)) == -1 )
+		if (errno == EINTR)
+			continue;
 
 	/* returned event */
 	if (status > 0) {
@@ -511,13 +514,15 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
 	/* timeout */
 	} else if (status == 0) 
 		status = ETIMEDOUT;
+	else 
+		status = errno;
 	
 	dapl_dbg_log(DAPL_DBG_TYPE_UTIL, 
 		     " cq_object_wait: RET evd %p ibv_cq %p ibv_ctx %p %s\n",
 		     evd_ptr, ibv_cq,ibv_ctx,strerror(errno));
 	
 	return(dapl_convert_errno(status,"cq_wait_object_wait"));
-	
+
 }
 #endif
 


-------------- next part --------------
A non-text attachment was scrubbed...
Name: udapl_patch_1.0
Type: application/octet-stream
Size: 4236 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060227/4c4ea6bf/attachment.obj>


More information about the general mailing list