[openib-general] [PATCH] [RFC] - dapl - dat_ep_free() can return without freeing the endpoint

Steve Wise swise at opengridcomputing.com
Tue Apr 4 12:04:03 PDT 2006


Arlin/James,

I'm running dapltest over the Chelsio cxgb3 openib/iwarp driver. I'm
running into an intermittent failure where the server side fails to
properly clean up its resources.  It has to do with disconnect vs ep
freeing.  Basically if the disconnect event handler thread doesn't get
done (and turn off conn->in_callback) before the main dapltest thread
attempts to destroy the EP, then dat_ep_free() will return "ok I freed
it" even though it doesn't because in_callback == 1 in the dapl_cm_id
struct.  The idea, I assume, was to let the callback finish and destroy
the connection on the callback thread.  However, once dat_ep_free()
returns, the main dapltest thread then tries to free the EVDs and PZ and
gets errors because they are still in use.  

So dapli_destroy_conn() defers destroying the ib_qp if conn->in_callback
== 1.  This, however, leads to the dapltest program trying to destroy
CQs and PDs with a QP still attached to them.

Here is a patch that fixes the problem.  Please review.  Basically, I
changed the logic so that dapli_destroy_conn() will wait until the
callback finishes.  Is this gonna break anything else?

The patch probably needs to get rid of the usleep() in favor of a
pthread_cond_wait() or something, but you'll get the idea. I'd like to
get this patch into the trunk.  Lemme know what you think.

This fix seems to work fine with my testing so far over the chelsio
driver.  

Thanks,

Steve.


Signed-off-by: Steve Wise <swise at opengridcomputing.com>


Index: openib_cma/dapl_ib_cm.c
===================================================================
--- openib_cma/dapl_ib_cm.c	(revision 6107)
+++ openib_cma/dapl_ib_cm.c	(working copy)
@@ -62,9 +62,9 @@
 /* local prototypes */
 static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn, 
 					  struct rdma_cm_event *event);
-static int dapli_cm_active_cb(struct dapl_cm_id *conn, 
+static void dapli_cm_active_cb(struct dapl_cm_id *conn, 
 			      struct rdma_cm_event *event);
-static int dapli_cm_passive_cb(struct dapl_cm_id *conn, 
+static void dapli_cm_passive_cb(struct dapl_cm_id *conn, 
 			       struct rdma_cm_event *event);
 static void dapli_addr_resolve(struct dapl_cm_id *conn);
 static void dapli_route_resolve(struct dapl_cm_id *conn);
@@ -164,28 +164,34 @@
 void dapli_destroy_conn(struct dapl_cm_id *conn)
 {
 	int in_callback;
+	struct rdma_cm_id *cm_id;
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
 		     " destroy_conn: conn %p id %d\n",
 		     conn,conn->cm_id);
-
 	dapl_os_lock(&conn->lock);
 	conn->destroy = 1;
 	in_callback = conn->in_callback;
-	dapl_os_unlock(&conn->lock);
-
-	if (!in_callback) {
-		if (conn->ep)
-			conn->ep->cm_handle = IB_INVALID_HANDLE;
-		if (conn->cm_id) {
-			if (conn->cm_id->qp)
-				rdma_destroy_qp(conn->cm_id);
-			rdma_destroy_id(conn->cm_id);
+	do {
+		if (in_callback) {
+			dapl_os_unlock(&conn->lock);
+			usleep(10);
+			dapl_os_lock(&conn->lock);
 		}
-			
-		conn->cm_id = NULL;
-		dapl_os_free(conn, sizeof(*conn));
+		in_callback = conn->in_callback;
+	} while (in_callback);
+
+	if (conn->ep)
+		conn->ep->cm_handle = IB_INVALID_HANDLE;
+	cm_id = conn->cm_id;
+	conn->cm_id = NULL;
+	dapl_os_unlock(&conn->lock);
+	if (cm_id) {
+		if (cm_id->qp)
+			rdma_destroy_qp(cm_id);
+		rdma_destroy_id(cm_id);
 	}
+	dapl_os_free(conn, sizeof(*conn));
 }
 
 static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn,
@@ -243,11 +249,9 @@
 	return new_conn;
 }
 
-static int dapli_cm_active_cb(struct dapl_cm_id *conn,
+static void dapli_cm_active_cb(struct dapl_cm_id *conn,
 			      struct rdma_cm_event *event)
 {
-	int destroy;
-
 	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
 		     " active_cb: conn %p id %d event %d\n",
 		     conn, conn->cm_id, event->event );
@@ -255,7 +259,7 @@
 	dapl_os_lock(&conn->lock);
 	if (conn->destroy) {
 		dapl_os_unlock(&conn->lock);
-		return 0;
+		return;
 	}
 	conn->in_callback = 1;
 	dapl_os_unlock(&conn->lock);
@@ -303,16 +307,14 @@
 	}
 
 	dapl_os_lock(&conn->lock);
-	destroy = conn->destroy;
-	conn->in_callback = conn->destroy;
+	conn->in_callback = 0;
 	dapl_os_unlock(&conn->lock);
-	return(destroy);
+	return;
 }
 
-static int dapli_cm_passive_cb(struct dapl_cm_id *conn,
+static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
 			       struct rdma_cm_event *event)
 {
-	int destroy;
 	struct dapl_cm_id *new_conn;
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
@@ -322,7 +324,7 @@
 	dapl_os_lock(&conn->lock);
 	if (conn->destroy) {
 		dapl_os_unlock(&conn->lock);
-		return 0;
+		return;
 	}
 	conn->in_callback = 1;
 	dapl_os_unlock(&conn->lock);
@@ -377,10 +379,9 @@
 	}
 
 	dapl_os_lock(&conn->lock);
-	destroy = conn->destroy;
-	conn->in_callback = conn->destroy;
+	conn->in_callback = 0;
 	dapl_os_unlock(&conn->lock);
-	return(destroy);
+	return;
 }
 

@@ -1021,7 +1022,6 @@
 	/* process one CM event, fairness */
 	if(!ret) {
 		struct dapl_cm_id *conn;
-		int ret;
 				
 		/* set proper conn from cm_id context*/
 		if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
@@ -1059,24 +1059,9 @@
 		case RDMA_CM_EVENT_DISCONNECTED:
 			/* passive or active */
 			if (conn->sp) 
-				ret = dapli_cm_passive_cb(conn,event);
+				dapli_cm_passive_cb(conn,event);
 			else 
-				ret = dapli_cm_active_cb(conn,event);
-
-			/* destroy both qp and cm_id */
-			if (ret) {
-				dapl_dbg_log(DAPL_DBG_TYPE_CM, 
-					     " cma_cb: DESTROY conn %p" 
-					     " cm_id %p qp %p\n",
-					     conn, conn->cm_id, 
-					     conn->cm_id->qp);
-	
-				if (conn->cm_id->qp)
-					rdma_destroy_qp(conn->cm_id);
-
-				rdma_destroy_id(conn->cm_id);
-				dapl_os_free(conn, sizeof(*conn));
-			}
+				dapli_cm_active_cb(conn,event);
 			break;
 		case RDMA_CM_EVENT_CONNECT_RESPONSE:
 		default:




More information about the general mailing list