[openib-general] kdapl locking problem
Itamar Rabenstein
itamar at mellanox.co.il
Tue Jun 21 04:51:34 PDT 2005
>Jun 20 09:13:50 localhost kernel: dapl_cm_active_cb_handler 138 event = 9
>Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 760 event=16389
>Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 798 ep = c0a17bf8
>Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect_clean 579 ep=c0a17bf8
>Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect 538 ep=c0a17bf8 flags=0
>Jun 20 09:13:50 localhost kernel: dapl_destroy_cm_id 63
>Jun 20 09:13:50 localhost kernel: dapl_modify_qp_state_to_error 300
>Jun 20 09:13:50 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_ep.c:1111: spin_lock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) already locked by drivers/infiniband/ulp/dat-provider/dapl_evd.c/759
>Jun 20 09:13:51 localhost kernel: dapl_modify_qp_state_to_error 305
>Jun 20 09:13:51 localhost kernel: dapl_evd_connection_callback 800 ep = c0a17bf8
>Jun 20 09:13:51 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_evd.c:802: spin_unlock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) not locked
>Jun 20 09:13:51 localhost kernel: dapl_cr_callback 434 event=16389
>Jun 20 09:13:51 localhost kernel: dapl_cr_callback 512 ep = c9821bf8
>Jun 20 09:13:51 localhost kernel: dapl_ib_disconnect_clean 579 ep=c982m: DREQ rcvd
Hi Hal,
I think i have found the problem. It is not legal to call ib_modify_qp()
inside spin_lock. This function can sleep.
Please try the patch below and if it works please tell james to ci it.
fix locking problem in cm callback functions
Signed-off-by: Itamar Rabenstein <itamar at mellanox.co.il>
Index: dapl_openib_util.h
===================================================================
--- dapl_openib_util.h (revision 2665)
+++ dapl_openib_util.h (working copy)
@@ -125,7 +125,7 @@
void dapl_ib_reinit_ep(struct dapl_ep *ep);
-void dapl_ib_disconnect_clean(struct dapl_ep *ep, boolean_t passive);
+void dapl_ib_disconnect_clean(struct dapl_ep *ep);
u32 dapl_ib_get_async_event(struct ib_event *cause,
enum dat_event_number *async_event);
Index: dapl_cr.c
===================================================================
--- dapl_cr.c (revision 2665)
+++ dapl_cr.c (working copy)
@@ -479,8 +479,7 @@
/* If someone pulled the plug on the EP or connection,
* just exit
*/
- spin_unlock_irqrestore(&ep->common.lock,
- ep->common.flags);
+ spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
status = DAT_SUCCESS;
/* Set evd = NULL so we don't generate an event below */
evd = NULL;
@@ -504,36 +503,23 @@
/* The disconnect has already occurred, we are now
* cleaned up and ready to exit
*/
- spin_unlock_irqrestore(&ep->common.lock,
- ep->common.flags);
+ spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
return;
}
ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, FALSE);
spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
+ dapl_ib_disconnect_clean(ep);
break;
case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
case DAT_CONNECTION_EVENT_PEER_REJECTED:
case DAT_CONNECTION_EVENT_UNREACHABLE:
- /*
- * After posting an accept the requesting node has
- * stopped talking.
- */
+ case DAT_CONNECTION_EVENT_BROKEN:
spin_lock_irqsave(&ep->common.lock, ep->common.flags);
ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, FALSE);
spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
+ dapl_ib_disconnect_clean(ep);
break;
- case DAT_CONNECTION_EVENT_BROKEN:
- spin_lock_irqsave(&ep->common.lock, ep->common.flags);
- ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, FALSE);
- spin_unlock_irqrestore(&ep->common.lock,
- ep->common.flags);
-
- break;
default:
evd = NULL;
dapl_os_assert(0); /* shouldn't happen */
Index: dapl_evd.c
===================================================================
--- dapl_evd.c (revision 2665)
+++ dapl_evd.c (working copy)
@@ -760,7 +760,6 @@
switch (event) {
case DAT_CONNECTION_EVENT_ESTABLISHED:
- {
/*
* If we don't have an EP at this point we are very screwed up
*/
@@ -784,65 +783,28 @@
private_data_size);
}
spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
break;
- }
case DAT_CONNECTION_EVENT_DISCONNECTED:
- {
- /*
- * EP is now fully disconnected; initiate any post processing
- * to reset the underlying QP and get the EP ready for
- * another connection
- */
ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, TRUE);
- spin_unlock_irqrestore(&ep->common.lock,
- ep->common.flags);
-
+ spin_unlock_irqrestore(&ep->common.lock,ep->common.flags);
+ dapl_ib_disconnect_clean(ep);
break;
- }
case DAT_CONNECTION_EVENT_PEER_REJECTED:
- {
- ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, TRUE);
- spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
- break;
- }
case DAT_CONNECTION_EVENT_UNREACHABLE:
- {
- ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, TRUE);
- spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
- break;
- }
case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
- {
- ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, TRUE);
- spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
- break;
- }
case DAT_CONNECTION_EVENT_BROKEN:
- {
ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
- dapl_ib_disconnect_clean(ep, FALSE);
spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
+ dapl_ib_disconnect_clean(ep);
break;
- }
case DAT_CONNECTION_REQUEST_EVENT:
default:
- {
spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
evd = NULL;
dapl_os_assert(0); /* shouldn't happen */
break;
}
- }
/*
* Post the event
Index: dapl_openib_cm.c
===================================================================
--- dapl_openib_cm.c (revision 2665)
+++ dapl_openib_cm.c (working copy)
@@ -562,13 +562,12 @@
* void
*
*/
-void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr, boolean_t active)
+void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr)
{
int status;
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " >>> dapl_ib_disconnect_clean: EP: %p active %d\n",
- ep_ptr, active);
+ " >>> dapl_ib_disconnect_clean: EP: %p \n", ep_ptr);
/*
* Clean up outstanding connection state
--
Itamar
More information about the general
mailing list