[openib-general] kdapl locking problem

James Lentini jlentini at netapp.com
Thu Jun 23 07:15:29 PDT 2005


In this instance, the code path getting us in trouble is:

 ib_modify_qp() calls 
   mthca_modify_qp() calls
     mthca_MODIFY_QP() calls
       mthca_cmd_box() calls
         mthca_cmd_wait() calls
           wait_for_completion() calls
             schedule()

but there are numerous code paths that reach mthca_cmd_wait(). We may 
have made this mistake in other places.

Roland, is an OpenIB user allowed to hold a spin lock over a verbs 
call? 

On Tue, 21 Jun 2005, Itamar Rabenstein wrote:

itamar> >Jun 20 09:13:50 localhost kernel: dapl_cm_active_cb_handler 138 event = 9
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 760 event=16389
itamar> >Jun 20 09:13:50 localhost kernel: dapl_evd_connection_callback 798 ep = c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect_clean 579 ep=c0a17bf8
itamar> >Jun 20 09:13:50 localhost kernel: dapl_ib_disconnect 538 ep=c0a17bf8 flags=0
itamar> >Jun 20 09:13:50 localhost kernel: dapl_destroy_cm_id 63
itamar> >Jun 20 09:13:50 localhost kernel: dapl_modify_qp_state_to_error 300
itamar> >Jun 20 09:13:50 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_ep.c:1111: spin_lock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) already locked by drivers/infiniband/ulp/dat-provider/dapl_evd.c/759
itamar> >Jun 20 09:13:51 localhost kernel: dapl_modify_qp_state_to_error 305
itamar> >Jun 20 09:13:51 localhost kernel: dapl_evd_connection_callback 800 ep = c0a17bf8
itamar> >Jun 20 09:13:51 localhost kernel: drivers/infiniband/ulp/dat-provider/dapl_evd.c:802: spin_unlock(drivers/infiniband/ulp/dat-provider/dapl_ep.c:c0a17c08) not locked
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 434 event=16389
itamar> >Jun 20 09:13:51 localhost kernel: dapl_cr_callback 512 ep = c9821bf8
itamar> >Jun 20 09:13:51 localhost kernel: dapl_ib_disconnect_clean 579 ep=c982m: DREQ rcvd
itamar> 
itamar> Hi Hal,
itamar> I think i have found the problem. It is not legal to call ib_modify_qp()
itamar> inside spin_lock. This function can sleep.
itamar> 
itamar> Please try the patch below and if it works please tell james to ci it.
itamar> 
itamar> fix locking problem in cm callback functions
itamar> 
itamar> Signed-off-by: Itamar Rabenstein <itamar at mellanox.co.il>
itamar> 
itamar> Index: dapl_openib_util.h
itamar> ===================================================================
itamar> --- dapl_openib_util.h	(revision 2665)
itamar> +++ dapl_openib_util.h	(working copy)
itamar> @@ -125,7 +125,7 @@
itamar>  
itamar>  void dapl_ib_reinit_ep(struct dapl_ep *ep);
itamar>  
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep, boolean_t passive);
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep);
itamar>  
itamar>  u32 dapl_ib_get_async_event(struct ib_event *cause,
itamar>  			    enum dat_event_number *async_event);
itamar> Index: dapl_cr.c
itamar> ===================================================================
itamar> --- dapl_cr.c	(revision 2665)
itamar> +++ dapl_cr.c	(working copy)
itamar> @@ -479,8 +479,7 @@
itamar>  			/* If someone pulled the plug on the EP or connection,
itamar>  			 * just exit
itamar>  			 */
itamar> -			spin_unlock_irqrestore(&ep->common.lock, 
itamar> -					       ep->common.flags);
itamar> +			spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar>  			status = DAT_SUCCESS;
itamar>  			/* Set evd = NULL so we don't generate an event below */
itamar>  			evd = NULL;
itamar> @@ -504,36 +503,23 @@
itamar>  			/* The disconnect has already occurred, we are now
itamar>  			 * cleaned up and ready to exit
itamar>  			 */
itamar> -			spin_unlock_irqrestore(&ep->common.lock, 
itamar> -					       ep->common.flags);
itamar> +			spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar>  			return;
itamar>  		}
itamar>  		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, FALSE);
itamar>  		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> +		dapl_ib_disconnect_clean(ep);
itamar>  
itamar>  		break;
itamar>  	case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar>  	case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar>  	case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> -		/*
itamar> -		 * After posting an accept the requesting node has
itamar> -		 * stopped talking.
itamar> -		 */
itamar> +	case DAT_CONNECTION_EVENT_BROKEN:
itamar>  		spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar>  		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, FALSE);
itamar>  		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> +		dapl_ib_disconnect_clean(ep);
itamar>  		break;
itamar> -	case DAT_CONNECTION_EVENT_BROKEN:
itamar> -		spin_lock_irqsave(&ep->common.lock, ep->common.flags);
itamar> -		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, FALSE);
itamar> -		spin_unlock_irqrestore(&ep->common.lock,
itamar> -				       ep->common.flags);
itamar> -
itamar> -		break;
itamar>  	default:
itamar>  		evd = NULL;
itamar>  		dapl_os_assert(0);	/* shouldn't happen */
itamar> Index: dapl_evd.c
itamar> ===================================================================
itamar> --- dapl_evd.c	(revision 2665)
itamar> +++ dapl_evd.c	(working copy)
itamar> @@ -760,7 +760,6 @@
itamar>  
itamar>  	switch (event) {
itamar>  	case DAT_CONNECTION_EVENT_ESTABLISHED:
itamar> -	{
itamar>  		/* 
itamar>  		 * If we don't have an EP at this point we are very screwed up 
itamar>  		 */
itamar> @@ -784,65 +783,28 @@
itamar>  				      private_data_size);
itamar>  		}
itamar>  		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar>  		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_EVENT_DISCONNECTED:
itamar> -	{
itamar> -		/*
itamar> -		 * EP is now fully disconnected; initiate any post processing
itamar> -		 * to reset the underlying QP and get the EP ready for
itamar> -		 * another connection
itamar> -		 */
itamar>  		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, TRUE);
itamar> -		spin_unlock_irqrestore(&ep->common.lock,
itamar> -				       ep->common.flags);
itamar> -
itamar> +		spin_unlock_irqrestore(&ep->common.lock,ep->common.flags);
itamar> +		dapl_ib_disconnect_clean(ep);
itamar>  		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_EVENT_PEER_REJECTED:
itamar> -	{
itamar> -		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, TRUE);
itamar> -		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> -		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_EVENT_UNREACHABLE:
itamar> -	{
itamar> -		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, TRUE);
itamar> -		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> -		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_EVENT_NON_PEER_REJECTED:
itamar> -	{
itamar> -		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, TRUE);
itamar> -		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> -		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_EVENT_BROKEN:
itamar> -	{
itamar>  		ep->param.ep_state = DAT_EP_STATE_DISCONNECTED;
itamar> -		dapl_ib_disconnect_clean(ep, FALSE);
itamar>  		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar> -
itamar> +		dapl_ib_disconnect_clean(ep);
itamar>  		break;
itamar> -	}
itamar>  	case DAT_CONNECTION_REQUEST_EVENT:
itamar>  	default:
itamar> -	{
itamar>  		spin_unlock_irqrestore(&ep->common.lock, ep->common.flags);
itamar>  		evd = NULL;
itamar>  
itamar>  		dapl_os_assert(0);	/* shouldn't happen */
itamar>  		break;
itamar>  	}
itamar> -	}
itamar>  
itamar>  	/*
itamar>  	 * Post the event
itamar> Index: dapl_openib_cm.c
itamar> ===================================================================
itamar> --- dapl_openib_cm.c	(revision 2665)
itamar> +++ dapl_openib_cm.c	(working copy)
itamar> @@ -562,13 +562,12 @@
itamar>   *      void
itamar>   *
itamar>   */
itamar> -void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr, boolean_t active)
itamar> +void dapl_ib_disconnect_clean(struct dapl_ep *ep_ptr)
itamar>  {
itamar>  	int status;
itamar>  
itamar>  	dapl_dbg_log(DAPL_DBG_TYPE_CM,
itamar> -		     "  >>> dapl_ib_disconnect_clean: EP: %p active %d\n",
itamar> -		     ep_ptr, active);
itamar> +		     "  >>> dapl_ib_disconnect_clean: EP: %p \n", ep_ptr);
itamar>  
itamar>  	/*
itamar>  	 * Clean up outstanding connection state
itamar> -- 
itamar> Itamar
itamar> 



More information about the general mailing list