[ofw] [PATCH 1/5] uDAPL v2: Patch series for ucm, scm: fixes for issues discovered during scale-up and scale-out testing
Arlin Davis
arlin.r.davis at intel.com
Wed Oct 28 16:19:03 PDT 2009
Linux testing completed with Intel MPI/HPCC benchmarks on 128 nodes, 1024 cores.
ucm, scm: address handles need to be destroyed when freeing Endpoints with UD QPs.
Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
dapl/openib_scm/cm.c | 4 ++++
dapl/openib_ucm/cm.c | 6 ++++++
dapl/openib_ucm/dapl_ib_util.h | 1 +
dapl/openib_ucm/device.c | 16 ++++++++++------
4 files changed, 21 insertions(+), 6 deletions(-)
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 453e32e..0d2d058 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -355,6 +355,10 @@ multi_cleanup:
dapl_os_lock(&cr->lock);
hca_ptr = cr->hca;
cr->ep = NULL;
+ if (cr->ah) {
+ ibv_destroy_ah(cr->ah);
+ cr->ah = NULL;
+ }
cr->state = DCM_DESTROY;
dapl_os_unlock(&cr->lock);
}
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index cc480c4..96ee382 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -679,6 +679,10 @@ static void ucm_ud_free(DAPL_EP *ep)
dapl_os_lock(&cm->lock);
hca = cm->hca;
cm->ep = NULL;
+ if (cm->ah) {
+ ibv_destroy_ah(cm->ah);
+ cm->ah = NULL;
+ }
cm->state = DCM_DESTROY;
dapl_os_unlock(&cm->lock);
}
@@ -1041,6 +1045,7 @@ ud_bail:
event = IB_CME_LOCAL_FAILURE;
goto bail;
}
+ cm->ah = xevent.remote_ah.ah; /* keep ref to destroy */
dapl_os_memcpy(&xevent.remote_ah.ia_addr,
&cm->msg.daddr,
@@ -1218,6 +1223,7 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
if (xevent.remote_ah.ah == NULL)
goto bail;
+ cm->ah = xevent.remote_ah.ah; /* keep ref to destroy */
dapl_os_memcpy(&xevent.remote_ah.ia_addr,
&cm->msg.daddr,
sizeof(union dcm_addr));
diff --git a/dapl/openib_ucm/dapl_ib_util.h b/dapl/openib_ucm/dapl_ib_util.h
index 27ff8dd..6273459 100644
--- a/dapl/openib_ucm/dapl_ib_util.h
+++ b/dapl/openib_ucm/dapl_ib_util.h
@@ -43,6 +43,7 @@ struct ib_cm_handle
struct dapl_hca *hca;
struct dapl_sp *sp;
struct dapl_ep *ep;
+ struct ibv_ah *ah;
uint16_t p_size; /* accept p_data, for retries */
uint8_t p_data[DCM_MAX_PDATA_SIZE];
ib_cm_msg_t msg;
diff --git a/dapl/openib_ucm/device.c b/dapl/openib_ucm/device.c
index 077446b..e890eef 100644
--- a/dapl/openib_ucm/device.c
+++ b/dapl/openib_ucm/device.c
@@ -434,14 +434,18 @@ static void ucm_service_destroy(IN DAPL_HCA *hca)
if (tp->rch)
ibv_destroy_comp_channel(tp->rch);
- dapl_log(DAPL_DBG_TYPE_UTIL,
- " destroy_service: pd %p ctx %p handle 0x%x\n",
- tp->pd, tp->pd->context, tp->pd->handle);
- if (tp->pd)
- ibv_dealloc_pd(tp->pd);
+ if (tp->ah) {
+ int i;
- if (tp->ah)
+ for (i = 0;i < 0xffff; i++) {
+ if (tp->ah[i])
+ ibv_destroy_ah(tp->ah[i]);
+ }
dapl_os_free(tp->ah, (sizeof(*tp->ah) * 0xffff));
+ }
+
+ if (tp->pd)
+ ibv_dealloc_pd(tp->pd);
if (tp->sid)
dapl_os_free(tp->sid, (sizeof(*tp->sid) * 0xffff));
--
1.5.2.5
More information about the ofw mailing list