[ofw] [PATCH 10/12] dapl-2.0: ucm: fix issues with new EP to CM linking changes
Davis, Arlin R
arlin.r.davis at intel.com
Wed May 19 11:11:37 PDT 2010
Add EP locking around QP modify
Remove release during disconnect event processing
Add check in cm_free to check state and schedule thread if necessary.
Add some additional debugging
Add processing in disconnect_clean for conn_req timeout
Remove extra CR's
Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
dapl/openib_ucm/cm.c | 107 ++++++++++++++++++++++++++++++++++++-------------
1 files changed, 78 insertions(+), 29 deletions(-)
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index 85c8b4b..8fed8f6 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -392,13 +392,13 @@ static void ucm_process_recv(ib_hca_transport_t *tp,
cm->msg.op = htons(DCM_DREP);
ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0);
- }
- /* UD reply retried ok to ignore, any other print warning */
- if (ntohs(msg->op) != DCM_REP) {
+ } else if (ntohs(msg->op) != DCM_DREP){
+ /* DREP ok to ignore, any other print warning */
dapl_log(DAPL_DBG_TYPE_WARN,
- " ucm_recv: UNKNOWN operation"
- " <- op %d, %s spsp %d sqpn %d\n",
- ntohs(msg->op), dapl_cm_state_str(cm->state),
+ " ucm_recv: UNEXPECTED MSG on cm %p"
+ " <- op %s, st %s spsp %d sqpn %d\n",
+ cm, dapl_cm_op_str(ntohs(msg->op)),
+ dapl_cm_state_str(cm->state),
ntohs(msg->sport), ntohl(msg->sqpn));
}
dapl_os_unlock(&cm->lock);
@@ -635,11 +635,11 @@ void dapls_cm_acquire(dp_ib_cm_handle_t cm)
void dapls_cm_release(dp_ib_cm_handle_t cm)
{
dapl_os_lock(&cm->lock);
- cm->ref_count--;
- if (cm->ref_count) {
- dapl_os_unlock(&cm->lock);
- return;
- }
+ cm->ref_count--;
+ if (cm->ref_count) {
+ dapl_os_unlock(&cm->lock);
+ return;
+ }
/* client, release local conn id port */
if (!cm->sp && cm->msg.sport)
ucm_free_port(&cm->hca->ib_trans, ntohs(cm->msg.sport));
@@ -652,9 +652,9 @@ void dapls_cm_release(dp_ib_cm_handle_t cm)
if (cm->ah) {
ibv_destroy_ah(cm->ah);
cm->ah = NULL;
- }
- dapl_os_unlock(&cm->lock);
- dapli_cm_dealloc(cm);
+ }
+ dapl_os_unlock(&cm->lock);
+ dapli_cm_dealloc(cm);
}
dp_ib_cm_handle_t dapls_ib_cm_create(DAPL_EP *ep)
@@ -710,6 +710,11 @@ bail:
/* schedule destruction of CM object */
void dapli_cm_free(dp_ib_cm_handle_t cm)
{
+ dapl_log(DAPL_DBG_TYPE_CM,
+ " dapli_cm_free: cm %p %s ep %p refs=%d\n",
+ cm, dapl_cm_state_str(cm->state),
+ cm->ep, cm->ref_count);
+
dapl_os_lock(&cm->lock);
cm->state = DCM_FREE;
dapls_thread_signal(&cm->hca->ib_trans.signal);
@@ -720,15 +725,18 @@ void dapli_cm_free(dp_ib_cm_handle_t cm)
void dapls_cm_free(dp_ib_cm_handle_t cm)
{
dapl_log(DAPL_DBG_TYPE_CM,
- " cm_free: cm %p %s ep %p refs=%d\n",
+ " dapl_cm_free: cm %p %s ep %p refs=%d\n",
cm, dapl_cm_state_str(cm->state),
cm->ep, cm->ref_count);
/* free from internal workq, wait until EP is last ref */
dapl_os_lock(&cm->lock);
- cm->state = DCM_FREE;
+ if (cm->state != DCM_FREE)
+ cm->state = DCM_FREE;
+
while (cm->ref_count != 1) {
dapl_os_unlock(&cm->lock);
+ dapls_thread_signal(&cm->hca->ib_trans.signal);
dapl_os_sleep_usec(10000);
dapl_os_lock(&cm->lock);
}
@@ -804,8 +812,6 @@ static void ucm_disconnect_final(dp_ib_cm_handle_t cm)
else
dapl_evd_connection_callback(cm, IB_CME_DISCONNECTED, NULL, 0, cm->ep);
- /* free local resources, EP ref will prevent destory until dat_ep_free */
- dapls_cm_release(cm);
}
/*
@@ -815,6 +821,7 @@ static void ucm_disconnect_final(dp_ib_cm_handle_t cm)
DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
{
int finalize = 1;
+ int wakeup = 0;
dapl_os_lock(&cm->lock);
switch (cm->state) {
@@ -826,8 +833,8 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
/* send DREQ, event after DREP or DREQ timeout */
cm->state = DCM_DISC_PENDING;
cm->msg.op = htons(DCM_DREQ);
- finalize = 0; /* wait for DREP, wakeup timer thread */
- dapls_thread_signal(&cm->hca->ib_trans.signal);
+ finalize = 0; /* wait for DREP, wakeup timer after DREQ sent */
+ wakeup = 1;
break;
case DCM_DISC_PENDING:
/* DREQ timeout, resend until retries exhausted */
@@ -850,10 +857,29 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
if (cm->ep->qp_handle->qp_type != IBV_QPT_UD)
dapls_modify_qp_state(cm->ep->qp_handle, IBV_QPS_ERR,0,0,0);
- /* DREQ received, send DREP and schedule event */
+ /* DREQ received, send DREP and schedule event, finalize */
cm->msg.op = htons(DCM_DREP);
break;
+ case DCM_DISCONNECTED:
+ dapl_os_unlock(&cm->lock);
+ return DAT_SUCCESS;
default:
+ dapl_log(DAPL_DBG_TYPE_WARN,
+ " disconnect UNKNOWN state: ep %p cm %p %s %s"
+ " %x %x %x %s %x %x %x r_pid %x,%d\n",
+ cm->ep, cm,
+ cm->msg.saddr.ib.qp_type == IBV_QPT_RC ? "RC" : "UD",
+ dapl_cm_state_str(cm->state),
+ ntohs(cm->msg.saddr.ib.lid),
+ ntohs(cm->msg.sport),
+ ntohl(cm->msg.saddr.ib.qpn),
+ cm->sp ? "<-" : "->",
+ ntohs(cm->msg.daddr.ib.lid),
+ ntohs(cm->msg.dport),
+ ntohl(cm->msg.daddr.ib.qpn),
+ ntohs(cm->msg.op) == DCM_REQ ? 0 : ntohl(*(DAT_UINT32*)cm->msg.resv),
+ ntohs(cm->msg.op) == DCM_REQ ? 0 : ntohl(*(DAT_UINT32*)cm->msg.resv));
+
dapl_os_unlock(&cm->lock);
return DAT_SUCCESS;
}
@@ -861,6 +887,9 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
dapl_os_get_time(&cm->timer); /* reply expected */
ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0);
dapl_os_unlock(&cm->lock);
+
+ if (wakeup)
+ dapls_thread_signal(&cm->hca->ib_trans.signal);
if (finalize)
ucm_disconnect_final(cm);
@@ -1302,6 +1331,9 @@ static int ucm_reply(dp_ib_cm_handle_t cm)
{
dapl_os_lock(&cm->lock);
if (cm->state != DCM_RTU_PENDING) {
+ dapl_log(DAPL_DBG_TYPE_ERR,
+ " CM_REPLY: wrong state %s",
+ dapl_cm_state_str(cm->state));
dapl_os_unlock(&cm->lock);
return -1;
}
@@ -1375,6 +1407,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
dapl_os_unlock(&cm->lock);
return DAT_INVALID_STATE;
}
+ dapl_os_unlock(&cm->lock);
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" ACCEPT_USR: remote lid=%x"
@@ -1401,6 +1434,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
#endif
/* modify QP to RTR and then to RTS with remote info already read */
+ dapl_os_lock(&ep->header.lock);
if (dapls_modify_qp_state(ep->qp_handle,
IBV_QPS_RTR,
cm->msg.daddr.ib.qpn,
@@ -1410,7 +1444,7 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
" ACCEPT_USR: QPS_RTR ERR %s -> lid %x qpn %x\n",
strerror(errno), ntohs(cm->msg.daddr.ib.lid),
ntohl(cm->msg.daddr.ib.qpn));
- dapl_os_unlock(&cm->lock);
+ dapl_os_unlock(&ep->header.lock);
goto bail;
}
if (dapls_modify_qp_state(ep->qp_handle,
@@ -1422,9 +1456,10 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
" ACCEPT_USR: QPS_RTS ERR %s -> lid %x qpn %x\n",
strerror(errno), ntohs(cm->msg.daddr.ib.lid),
ntohl(cm->msg.daddr.ib.qpn));
- dapl_os_unlock(&cm->lock);
+ dapl_os_unlock(&ep->header.lock);
goto bail;
}
+ dapl_os_unlock(&ep->header.lock);
/* save remote address information */
dapl_os_memcpy(&ep->remote_ia_address,
@@ -1449,8 +1484,9 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
dapl_ep_link_cm(ep, cm);
cm->ep = ep;
cm->hca = ia->hca_ptr;
+
+ dapl_os_lock(&cm->lock);
cm->state = DCM_RTU_PENDING;
- dapl_os_get_time(&cm->timer); /* RTU expected */
dapl_os_unlock(&cm->lock);
if (ucm_reply(cm)) {
@@ -1540,14 +1576,15 @@ dapls_ib_disconnect(IN DAPL_EP *ep_ptr, IN DAT_CLOSE_FLAGS close_flags)
{
dp_ib_cm_handle_t cm_ptr = dapl_get_cm_from_ep(ep_ptr);
+ dapl_os_lock(&ep_ptr->header.lock);
if (ep_ptr->param.ep_state == DAT_EP_STATE_DISCONNECTED ||
- ep_ptr->param.ep_attr.service_type != DAT_SERVICE_TYPE_RC) {
+ ep_ptr->param.ep_attr.service_type != DAT_SERVICE_TYPE_RC ||
+ cm_ptr == NULL) {
+ dapl_os_unlock(&ep_ptr->header.lock);
return DAT_SUCCESS;
}
+ dapl_os_unlock(&ep_ptr->header.lock);
- /* RC. Transition to error state to flush queue */
- dapls_modify_qp_state(ep_ptr->qp_handle, IBV_QPS_ERR, 0, 0, 0);
-
return (dapli_cm_disconnect(cm_ptr));
}
@@ -1575,7 +1612,19 @@ dapls_ib_disconnect_clean(IN DAPL_EP *ep,
IN DAT_BOOLEAN active,
IN const ib_cm_events_t ib_cm_event)
{
- /* nothing to cleanup */
+ if (ib_cm_event == IB_CME_TIMEOUT) {
+ dp_ib_cm_handle_t cm_ptr;
+
+ if ((cm_ptr = dapl_get_cm_from_ep(ep)) == NULL)
+ return;
+
+ dapl_log(DAPL_DBG_TYPE_WARN,
+ "dapls_ib_disc_clean: CONN_TIMEOUT ep %p cm %p %s\n",
+ ep, cm_ptr, dapl_cm_state_str(cm_ptr->state));
+
+ /* schedule release of socket and local resources */
+ dapli_cm_free(cm_ptr);
+ }
}
/*
--
1.5.2.5
More information about the ofw
mailing list