[ofa-general] [PATCH 4/4] DAPL v2: ucm: tighten up locking with CM processing, state changes

Barry Mavin Barry.Mavin at recital.com
Wed Sep 9 17:02:24 PDT 2009


Hi

Not sure if this is the correct place to ask this question.

We have a cluster site using Mellanox cards and switches, with OFED 1.4.1
installed from source on all boxes.

The cluster is otherwise behaving, but we have had some throughput problems
with IB in our applications.

We have tried to use NFS / RDMA but cannot get it to work "out-of-the-box".
The kernel modules all load fine, but any attempt to mount from the client
hangs, and on some occasions it even causes the servers to reboot.

OS is Red Hat 5.3.
Mellanox firmware is all at 2.6.

Has anyone got NFS / RDMA working on Red Hat 5.3?

---
Regards
Barry Mavin
Recital Corporation



> From: "Davis, Arlin R" <arlin.r.davis at intel.com>
> Date: Wed, 9 Sep 2009 15:14:46 -0700
> To: "general at lists.openfabrics.org" <general at lists.openfabrics.org>,
> "ofw at lists.openfabrics.org" <ofw at lists.openfabrics.org>
> Subject: [ofa-general] [PATCH 4/4] DAPL v2: ucm: tighten up locking with CM
> processing, state changes
> 
> 
> 
> tighten up locking on CM processing and state changes
> and reduce the send completion threshold to 50 from 100
> to replenish the request message faster.
> 
> Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
> ---
>  dapl/openib_ucm/cm.c |   24 ++++++++++++++++++------
>  1 files changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
> index 5c5287f..e76e920 100644
> --- a/dapl/openib_ucm/cm.c
> +++ b/dapl/openib_ucm/cm.c
> @@ -185,7 +185,7 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm,
> ib_cm_msg_t *msg);
>  static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg, DAT_PVOID
> p_data, DAT_COUNT p_size);
>  DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm);
>  
> -#define UCM_SND_BURST 100
> +#define UCM_SND_BURST 50 
>  
>  /* Service ids - port space */
>  static uint16_t ucm_get_port(ib_hca_transport_t *tp, uint16_t port)
> @@ -916,11 +916,14 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm,
> ib_cm_msg_t *msg)
> /* Send RTU, no private data */
> cm->msg.op = htons(DCM_RTU);
> 
> + dapl_os_lock(&cm->lock);
> + cm->state = DCM_CONNECTED;
> + dapl_os_unlock(&cm->lock);
> +
> if (ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0))   
> goto bail;
>  
> /* init cm_handle and post the event with private data */
> - cm->state = DCM_CONNECTED;
> dapl_dbg_log(DAPL_DBG_TYPE_EP, " ACTIVE: connected!\n");
>  
>  #ifdef DAT_EXTENSIONS
> @@ -986,7 +989,10 @@ ud_bail:
> (DAT_PVOID *)&xevent);
>  
> /* we are done, don't destroy cm_ptr, need pdata */
> +  dapl_os_lock(&cm->lock);
> cm->state = DCM_RELEASED;
> +  dapl_os_unlock(&cm->lock);
> +  
> } else
>  #endif
> {
> @@ -1157,7 +1163,9 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm,
> ib_cm_msg_t *msg)
> (DAT_PVOID *)&xevent);
>  
>                  /* done with CM object, don't destroy cm, need pdata */
> +  dapl_os_lock(&cm->lock);
> cm->state = DCM_RELEASED;
> +  dapl_os_unlock(&cm->lock);
> } else {
>  #endif
> cm->ep->cm_handle = cm; /* only RC, multi CR's on UD */
> @@ -1262,8 +1270,6 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT
> p_size, DAT_PVOID p_data)
> */
> cm->p_size = p_size;
> dapl_os_memcpy(&cm->p_data, p_data, p_size);
> - if (ucm_send(&cm->hca->ib_trans, &cm->msg, p_data, p_size))   
> -  goto bail;
>  
> /* save state and setup valid reference to EP, HCA */
> dapl_os_lock(&cm->lock);
> @@ -1272,6 +1278,9 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT
> p_size, DAT_PVOID p_data)
> cm->state = DCM_ACCEPTED;
> dapl_os_unlock(&cm->lock);
>  
> + if (ucm_send(&cm->hca->ib_trans, &cm->msg, p_data, p_size))   
> +  goto bail;
> +
> dapl_dbg_log(DAPL_DBG_TYPE_CM, " PASSIVE: accepted!\n");
> return DAT_SUCCESS;
>  
> @@ -1587,14 +1596,17 @@ dapls_ib_reject_connection(IN dp_ib_cm_handle_t cm,
> if (psize)
> dapl_os_memcpy(&cm->msg.p_data, pdata, psize);
> 
> + /* cr_thread will destroy CR */
> + dapl_os_lock(&cm->lock);
> + cm->state = DCM_REJECTING;
> + dapl_os_unlock(&cm->lock);
> +
> if (ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0)) {
> dapl_log(DAPL_DBG_TYPE_WARN,
> " cm_reject: ERR: %s\n", strerror(errno));
> return DAT_INTERNAL_ERROR;
> }
> 
> - /* cr_thread will destroy CR */
> - cm->state = DCM_REJECTING;
> send(cm->hca->ib_trans.scm[1], "w", sizeof "w", 0);
> return DAT_SUCCESS;
>  }
> -- 
> 1.5.2.5
> 
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
> 
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general




More information about the general mailing list