[openib-general] kdapl locking problem

Itamar Rabenstein itamar at mellanox.co.il
Tue Jun 21 04:51:34 PDT 2005


>Jun 20 09:13:50 localhost kernel: dapl_cm_active_cb_handler 138 event = 9
>Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 760 event=16389
>Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 798 ep = c0a17bf8
>Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect_clean 579 ep=c0a17bf8
>Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect 538 ep=c0a17bf8 flags=0
>Jun 20 09:13:50 localhost kernel: dapl_destroy_cm_id 63
>Jun 20 09:13:50 localhost kernel: dapl_modify_qp_state_to_error 300
>Jun 20 09:13:50 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_ep.c:1111: spin_lock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) already locked by drivers/infiniband/ulp/dat-provider/dapl_evd.c/759
>Jun 20 09:13:51 localhost kernel: dapl_modify_qp_state_to_error 305
>Jun 20 09:13:51 localhost kernel: dapl_evd_connection_callback 800 ep = c0a17bf8
>Jun 20 09:13:51 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_evd.c:802: spin_unlock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) not locked
>Jun 20 09:13:51 localhost kernel: dapl_cr_callback 434 event=16389
>Jun 20 09:13:51 localhost kernel: dapl_cr_callback 512 ep = c9821bf8
>Jun 20 09:13:51 localhost kernel: dapl_ib_disconnect_clean 579 ep=c982m: DREQ rcvd

Hi Hal,
I think i have found the problem. It is not legal to call ib_modify_qp()
inside spin_lock. This function can sleep.

Please try the patch below and if it works please tell james to ci it.

fix locking problem in cm callback functions

Signed-off-by: Itamar Rabenstein <itamar at mellanox.co.il>

Index: dapl_openib_util.h
===================================================================
--- dapl_openib_util.h	(revision 2665)
+++ dapl_openib_util.h	(working copy)
@@ -125,7 +125,7 @@
 
 void dapl_ib_reinit_ep(struct dapl_ep *ep);
 
-void dapl_ib_disconnect_clean(struct dapl_ep *ep, boolean_t passive);
+void dapl_ib_disconnect_clean(struct dapl_ep *ep);
 
 u32 dapl_ib_get_async_event(struct ib_event *cause,
 			    enum dat_event_number *async_event);
Index: dapl_cr.c
===================================================================
--- dapl_cr.c	(revision 2665)
+++ dapl_cr.c	(working copy)
@@ -479,8 +479,7 @@
 			/* If someone pulled the plug on the EP or connection,
 			 * just exit
 			 */
-			spin_unlock_irqrestore(&ep->common.lock, 
-					       ep->common.flags);
+			spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
 			status = DAT_SUCCESS;
 			/* Set evd = NULL so we don't generate an event below */
 			evd = NULL;
@@ -504,36 +503,23 @@
 			/* The disconnect has already occurred, we are now
 			 * cleaned up and ready to exit
 			 */
-			spin_unlock_irqrestore(&ep->common.lock, 
-					       ep->common.flags);
+			spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
 			return;
 		}
 		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, FALSE);
 		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
+		dapl_ib_disconnect_clean(ep);
 
 		break;
 	case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
 	case DAT_CONNECTION_EVENT_PEER_REJECTED:
 	case DAT_CONNECTION_EVENT_UNREACHABLE:
-		/*
-		 * After posting an accept the requesting node has
-		 * stopped talking.
-		 */
+	case DAT_CONNECTION_EVENT_BROKEN:
 		spin_lock_irqsave(&ep->common.lock, ep->common.flags);
 		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, FALSE);
 		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
+		dapl_ib_disconnect_clean(ep);
 		break;
-	case DAT_CONNECTION_EVENT_BROKEN:
-		spin_lock_irqsave(&ep->common.lock, ep->common.flags);
-		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, FALSE);
-		spin_unlock_irqrestore(&ep->common.lock,
-				       ep->common.flags);
-
-		break;
 	default:
 		evd = NULL;
 		dapl_os_assert(0);	/* shouldn't happen */
Index: dapl_evd.c
===================================================================
--- dapl_evd.c	(revision 2665)
+++ dapl_evd.c	(working copy)
@@ -760,7 +760,6 @@
 
 	switch (event) {
 	case DAT_CONNECTION_EVENT_ESTABLISHED:
-	{
 		/* 
 		 * If we don't have an EP at this point we are very screwed up 
 		 */
@@ -784,65 +783,28 @@
 				      private_data_size);
 		}
 		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
 		break;
-	}
 	case DAT_CONNECTION_EVENT_DISCONNECTED:
-	{
-		/*
-		 * EP is now fully disconnected; initiate any post processing
-		 * to reset the underlying QP and get the EP ready for
-		 * another connection
-		 */
 		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, TRUE);
-		spin_unlock_irqrestore(&ep->common.lock,
-				       ep->common.flags);
-
+		spin_unlock_irqrestore(&ep->common.lock,ep->common.flags);
+		dapl_ib_disconnect_clean(ep);
 		break;
-	}
 	case DAT_CONNECTION_EVENT_PEER_REJECTED:
-	{
-		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, TRUE);
-		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
-		break;
-	}
 	case DAT_CONNECTION_EVENT_UNREACHABLE:
-	{
-		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, TRUE);
-		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
-		break;
-	}
 	case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
-	{
-		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, TRUE);
-		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
-		break;
-	}
 	case DAT_CONNECTION_EVENT_BROKEN:
-	{
 		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
-		dapl_ib_disconnect_clean(ep, FALSE);
 		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
-
+		dapl_ib_disconnect_clean(ep);
 		break;
-	}
 	case DAT_CONNECTION_REQUEST_EVENT:
 	default:
-	{
 		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
 		evd = NULL;
 
 		dapl_os_assert(0);	/* shouldn't happen */
 		break;
 	}
-	}
 
 	/*
 	 * Post the event
Index: dapl_openib_cm.c
===================================================================
--- dapl_openib_cm.c	(revision 2665)
+++ dapl_openib_cm.c	(working copy)
@@ -562,13 +562,12 @@
  *      void
  *
  */
-void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr, boolean_t active)
+void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr)
 {
 	int status;
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     "  >>> dapl_ib_disconnect_clean: EP: %p active %d\n",
-		     ep_ptr, active);
+		     "  >>> dapl_ib_disconnect_clean: EP: %p \n", ep_ptr);
 
 	/*
 	 * Clean up outstanding connection state
-- 
Itamar



More information about the general mailing list