[openib-general] kdapl locking problem
James Lentini
jlentini at netapp.com
Thu Jun 23 07:15:29 PDT 2005
In this instance, the code path getting us in trouble is:
ib_modify_qp() calls
mthca_modify_qp() calls
mthca_MODIFY_QP() calls
mthca_cmd_box() calls
mthca_cmd_wait() calls
wait_for_completion() calls
schedule()
but there are numerous code paths that reach mthca_cmd_wait(). We may
have made this mistake in other places.
Roland, is an OpenIB user allowed to hold a spin lock over a verbs
call?
On Tue, 21 Jun 2005, Itamar Rabenstein wrote:
itamar> >Jun 20 09:13:50 localhost kernel: dapl_cm_active_cb_handler 138 event = 9
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 760 event=16389
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 798 ep = c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect_clean 579 ep=c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect 538 ep=c0a17bf8 flags=0
itamar> >Jun 20 09:13:50 localhost kernel: dapl_destroy_cm_id 63
itamar> >Jun 20 09:13:50 localhost kernel: dapl_modify_qp_state_to_error 300
itamar> >Jun 20 09:13:50 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_ep.c:1111: spin_lock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) already locked by drivers/infiniband/ulp/dat-provider/dapl_evd.c/759
itamar> >Jun 20 09:13:51 localhost kernel: dapl_modify_qp_state_to_error 305
itamar> >Jun 20 09:13:51 localhost kernel: dapl_evd_connection_callback 800 ep = c0a17bf8
itamar> >Jun 20 09:13:51 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_evd.c:802: spin_unlock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) not locked
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 434 event=16389
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 512 ep = c9821bf8
itamar> >Jun 20 09:13:51 localhost kernel: dapl_ib_disconnect_clean 579 ep=c982m: DREQ rcvd
itamar>
itamar> Hi Hal,
itamar> I think i have found the problem. It is not legal to call ib_modify_qp()
itamar> inside spin_lock. This function can sleep.
itamar>
itamar> Please try the patch below and if it works please tell james to ci it.
itamar>
itamar> fix locking problem in cm callback functions
itamar>
itamar> Signed-off-by: Itamar Rabenstein <itamar at mellanox.co.il>
itamar>
itamar> Index: dapl_openib_util.h
itamar> ===================================================================
itamar> --- dapl_openib_util.h (revision 2665)
itamar> +++ dapl_openib_util.h (working copy)
itamar> @@ -125,7 +125,7 @@
itamar>
itamar> void dapl_ib_reinit_ep(struct dapl_ep *ep);
itamar>
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep, boolean_t passive);
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep);
itamar>
itamar> u32 dapl_ib_get_async_event(struct ib_event *cause,
itamar> enum dat_event_number *async_event);
itamar> Index: dapl_cr.c
itamar> ===================================================================
itamar> --- dapl_cr.c (revision 2665)
itamar> +++ dapl_cr.c (working copy)
itamar> @@ -479,8 +479,7 @@
itamar> /* If someone pulled the plug on the EP or connection,
itamar> * just exit
itamar> */
itamar> - spin_unlock_irqrestore(&ep->common.lock,
itamar> - ep->common.flags);
itamar> + spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> status = DAT_SUCCESS;
itamar> /* Set evd = NULL so we don't generate an event below */
itamar> evd = NULL;
itamar> @@ -504,36 +503,23 @@
itamar> /* The disconnect has already occurred, we are now
itamar> * cleaned up and ready to exit
itamar> */
itamar> - spin_unlock_irqrestore(&ep->common.lock,
itamar> - ep->common.flags);
itamar> + spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> return;
itamar> }
itamar> ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, FALSE);
itamar> spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> + dapl_ib_disconnect_clean(ep);
itamar>
itamar> break;
itamar> case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar> case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar> case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> - /*
itamar> - * After posting an accept the requesting node has
itamar> - * stopped talking.
itamar> - */
itamar> + case DAT_CONNECTION_EVENT_BROKEN:
itamar> spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar> ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, FALSE);
itamar> spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> + dapl_ib_disconnect_clean(ep);
itamar> break;
itamar> - case DAT_CONNECTION_EVENT_BROKEN:
itamar> - spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar> - ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, FALSE);
itamar> - spin_unlock_irqrestore(&ep->common.lock,
itamar> - ep->common.flags);
itamar> -
itamar> - break;
itamar> default:
itamar> evd = NULL;
itamar> dapl_os_assert(0); /* shouldn't happen */
itamar> Index: dapl_evd.c
itamar> ===================================================================
itamar> --- dapl_evd.c (revision 2665)
itamar> +++ dapl_evd.c (working copy)
itamar> @@ -760,7 +760,6 @@
itamar>
itamar> switch (event) {
itamar> case DAT_CONNECTION_EVENT_ESTABLISHED:
itamar> - {
itamar> /*
itamar> * If we don't have an EP at this point we are very screwed up
itamar> */
itamar> @@ -784,65 +783,28 @@
itamar> private_data_size);
itamar> }
itamar> spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> break;
itamar> - }
itamar> case DAT_CONNECTION_EVENT_DISCONNECTED:
itamar> - {
itamar> - /*
itamar> - * EP is now fully disconnected; initiate any post processing
itamar> - * to reset the underlying QP and get the EP ready for
itamar> - * another connection
itamar> - */
itamar> ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, TRUE);
itamar> - spin_unlock_irqrestore(&ep->common.lock,
itamar> - ep->common.flags);
itamar> -
itamar> + spin_unlock_irqrestore(&ep->common.lock,ep->common.flags);
itamar> + dapl_ib_disconnect_clean(ep);
itamar> break;
itamar> - }
itamar> case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar> - {
itamar> - ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, TRUE);
itamar> - spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> - break;
itamar> - }
itamar> case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> - {
itamar> - ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, TRUE);
itamar> - spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> - break;
itamar> - }
itamar> case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar> - {
itamar> - ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, TRUE);
itamar> - spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> - break;
itamar> - }
itamar> case DAT_CONNECTION_EVENT_BROKEN:
itamar> - {
itamar> ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> - dapl_ib_disconnect_clean(ep, FALSE);
itamar> spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> + dapl_ib_disconnect_clean(ep);
itamar> break;
itamar> - }
itamar> case DAT_CONNECTION_REQUEST_EVENT:
itamar> default:
itamar> - {
itamar> spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> evd = NULL;
itamar>
itamar> dapl_os_assert(0); /* shouldn't happen */
itamar> break;
itamar> }
itamar> - }
itamar>
itamar> /*
itamar> * Post the event
itamar> Index: dapl_openib_cm.c
itamar> ===================================================================
itamar> --- dapl_openib_cm.c (revision 2665)
itamar> +++ dapl_openib_cm.c (working copy)
itamar> @@ -562,13 +562,12 @@
itamar> * void
itamar> *
itamar> */
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr, boolean_t active)
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr)
itamar> {
itamar> int status;
itamar>
itamar> dapl_dbg_log(DAPL_DBG_TYPE_CM,
itamar> - " >>> dapl_ib_disconnect_clean: EP: %p active %d\n",
itamar> - ep_ptr, active);
itamar> + " >>> dapl_ib_disconnect_clean: EP: %p \n", ep_ptr);
itamar>
itamar> /*
itamar> * Clean up outstanding connection state
itamar> --
itamar> Itamar
itamar>
More information about the general
mailing list