[openib-general] [PATCH 1.0] uDAPL - QP destroy and HCA close problems fixed
Arlin Davis
arlin.r.davis at intel.com
Mon Feb 27 15:11:53 PST 2006
James,
Here is a small uDAPL patch that should go into 1.0; it fixes some issues that we just found with
MPI scale-out testing on OpenIB. The QP was not being destroyed in some cases, and hca_close had issues
with the async work thread. I am still working on one other elusive disconnect problem that may require
another small patch.
Thanks,
-arlin
Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>
Index: dapl/openib_cma/dapl_ib_util.c
===================================================================
--- dapl/openib_cma/dapl_ib_util.c (revision 5489)
+++ dapl/openib_cma/dapl_ib_util.c (working copy)
@@ -330,6 +330,13 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
hca_ptr->ib_hca_handle = IB_INVALID_HANDLE;
}
+ dapl_os_lock(&g_hca_lock);
+ if (g_ib_thread_state != IB_THREAD_RUN) {
+ dapl_os_unlock(&g_hca_lock);
+ goto bail;
+ }
+ dapl_os_unlock(&g_hca_lock);
+
/*
* Remove hca from async and CQ event processing list
* Wakeup work thread to remove from polling list
@@ -342,10 +349,12 @@ DAT_RETURN dapls_ib_close_hca(IN DAPL_HC
struct timespec sleep, remain;
sleep.tv_sec = 0;
sleep.tv_nsec = 10000000; /* 10 ms */
+ write(g_ib_pipe[1], "w", sizeof "w");
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
" ib_thread_destroy: wait on hca %p destroy\n");
nanosleep (&sleep, &remain);
}
+bail:
return (DAT_SUCCESS);
}
Index: dapl/openib_cma/dapl_ib_cm.c
===================================================================
--- dapl/openib_cma/dapl_ib_cm.c (revision 5489)
+++ dapl/openib_cma/dapl_ib_cm.c (working copy)
@@ -306,15 +306,6 @@ static int dapli_cm_active_cb(struct dap
destroy = conn->destroy;
conn->in_callback = conn->destroy;
dapl_os_unlock(&conn->lock);
- if (destroy) {
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " active_cb: DESTROY conn %p id %d \n",
- conn, conn->cm_id );
- if (conn->ep)
- conn->ep->cm_handle = IB_INVALID_HANDLE;
-
- dapl_os_free(conn, sizeof(*conn));
- }
return(destroy);
}
@@ -389,12 +380,6 @@ static int dapli_cm_passive_cb(struct da
destroy = conn->destroy;
conn->in_callback = conn->destroy;
dapl_os_unlock(&conn->lock);
- if (destroy) {
- if (conn->ep)
- conn->ep->cm_handle = IB_INVALID_HANDLE;
-
- dapl_os_free(conn, sizeof(*conn));
- }
return(destroy);
}
@@ -1080,10 +1065,21 @@ void dapli_cma_event_cb(void)
ret = dapli_cm_passive_cb(conn,event);
else
ret = dapli_cm_active_cb(conn,event);
-
- if (ret)
+
+ /* destroy both qp and cm_id */
+ if (ret) {
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " cma_cb: DESTROY conn %p"
+ " cm_id %p qp %p\n",
+ conn, conn->cm_id,
+ conn->cm_id->qp);
+
+ if (conn->cm_id->qp)
+ rdma_destroy_qp(conn->cm_id);
+
rdma_destroy_id(conn->cm_id);
-
+ dapl_os_free(conn, sizeof(*conn));
+ }
break;
case RDMA_CM_EVENT_CONNECT_RESPONSE:
default:
@@ -1095,7 +1091,7 @@ void dapli_cma_event_cb(void)
}
rdma_ack_cm_event(event);
} else {
- dapl_dbg_log(DAPL_DBG_TYPE_WARN,
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_event: ERROR: rdma_get_cm_event() %d %d %s\n",
ret, errno, strerror(errno));
}
Index: dapl/openib_cma/dapl_ib_util.h
===================================================================
--- dapl/openib_cma/dapl_ib_util.h (revision 5489)
+++ dapl/openib_cma/dapl_ib_util.h (working copy)
@@ -295,7 +295,8 @@ dapl_convert_errno( IN int err, IN const
if (!err) return DAT_SUCCESS;
#if DAPL_DBG
- if ((err != EAGAIN) && (err != ETIME) && (err != ETIMEDOUT))
+ if ((err != EAGAIN) && (err != ETIME) &&
+ (err != ETIMEDOUT) && (err != EINTR))
dapl_dbg_log (DAPL_DBG_TYPE_ERR," %s %s\n", str, strerror(err));
#endif
Index: dapl/openib_cma/dapl_ib_cq.c
===================================================================
--- dapl/openib_cma/dapl_ib_cq.c (revision 5489)
+++ dapl/openib_cma/dapl_ib_cq.c (working copy)
@@ -498,7 +498,10 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
if (timeout != DAT_TIMEOUT_INFINITE)
timeout_ms = timeout/1000;
- status = poll(&cq_fd, 1, timeout_ms);
+ /* restart syscall */
+ while ((status = poll(&cq_fd, 1, timeout_ms)) == -1 )
+ if (errno == EINTR)
+ continue;
/* returned event */
if (status > 0) {
@@ -511,13 +514,15 @@ dapls_ib_wait_object_wait(IN ib_wait_obj
/* timeout */
} else if (status == 0)
status = ETIMEDOUT;
+ else
+ status = errno;
dapl_dbg_log(DAPL_DBG_TYPE_UTIL,
" cq_object_wait: RET evd %p ibv_cq %p ibv_ctx %p %s\n",
evd_ptr, ibv_cq,ibv_ctx,strerror(errno));
return(dapl_convert_errno(status,"cq_wait_object_wait"));
-
+
}
#endif
-------------- next part --------------
A non-text attachment was scrubbed...
Name: udapl_patch_1.0
Type: application/octet-stream
Size: 4236 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060227/4c4ea6bf/attachment.obj>
More information about the general mailing list