[openib-general] Re: [PATCH] [RFC] - dapl - dat_ep_free() can return without freeing the endpoint

Steve Wise swise at opengridcomputing.com
Tue Apr 4 15:30:19 PDT 2006


> resources (PZ, etc.). We must have been getting lucky on IB.
> 
> I'm worried that a callback could occur ...
> 
>   void dapli_destroy_conn(struct dapl_cm_id *conn)
>   {
>           int in_callback;
>           struct rdma_cm_id *cm_id;
>   
>           dapl_dbg_log(DAPL_DBG_TYPE_CM,
>                        " destroy_conn: conn %p id %d\n",
>                        conn,conn->cm_id);
> 
>           dapl_os_lock(&conn->lock);
>           conn->destroy = 1;
>           in_callback = conn->in_callback;
>           do {
>                   if (in_callback) {
>                           dapl_os_unlock(&conn->lock);
>                           usleep(10);
>                           dapl_os_lock(&conn->lock);
>                   }
>                   in_callback = conn->in_callback;
>           } while (in_callback);
>   
>           if (conn->ep)
>                   conn->ep->cm_handle = IB_INVALID_HANDLE;
>           cm_id = conn->cm_id;
>           conn->cm_id = NULL;
>           dapl_os_unlock(&conn->lock);
> 
> /* ... here */
> 
>           if (cm_id) {
>                   if (cm_id->qp)
>                           rdma_destroy_qp(cm_id);
>                   rdma_destroy_id(cm_id);
>           }
>           dapl_os_free(conn, sizeof(*conn));
>   }
> 
> Destroying the cm_id while in a callback would be bad.
> 
> >  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn,
> > @@ -243,11 +249,9 @@
> >  	return new_conn;
> >  }
> >  
> > -static int dapli_cm_active_cb(struct dapl_cm_id *conn,
> > +static void dapli_cm_active_cb(struct dapl_cm_id *conn,
> >  			      struct rdma_cm_event *event)
> >  {
> > -	int destroy;
> > -
> >  	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> >  		     " active_cb: conn %p id %d event %d\n",
> >  		     conn, conn->cm_id, event->event );
> > @@ -255,7 +259,7 @@
> >  	dapl_os_lock(&conn->lock);
> >  	if (conn->destroy) {
> >  		dapl_os_unlock(&conn->lock);
> > -		return 0;
> > +		return;
> >  	}
> >  	conn->in_callback = 1;
> >  	dapl_os_unlock(&conn->lock);
> > @@ -303,16 +307,14 @@
> >  	}
> >  
> >  	dapl_os_lock(&conn->lock);
> > -	destroy = conn->destroy;
> > -	conn->in_callback = conn->destroy;
> > +	conn->in_callback = 0;
> >  	dapl_os_unlock(&conn->lock);
> > -	return(destroy);
> > +	return;
> >  }
> >  
> > -static int dapli_cm_passive_cb(struct dapl_cm_id *conn,
> > +static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
> >  			       struct rdma_cm_event *event)
> >  {
> > -	int destroy;
> >  	struct dapl_cm_id *new_conn;
> >  
> >  	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> > @@ -322,7 +324,7 @@
> >  	dapl_os_lock(&conn->lock);
> >  	if (conn->destroy) {
> >  		dapl_os_unlock(&conn->lock);
> > -		return 0;
> > +		return;
> >  	}
> >  	conn->in_callback = 1;
> >  	dapl_os_unlock(&conn->lock);
> > @@ -377,10 +379,9 @@
> >  	}
> >  
> >  	dapl_os_lock(&conn->lock);
> > -	destroy = conn->destroy;
> > -	conn->in_callback = conn->destroy;
> > +	conn->in_callback = 0;
> >  	dapl_os_unlock(&conn->lock);
> > -	return(destroy);
> > +	return;
> >  }
> >  
> > 
> > @@ -1021,7 +1022,6 @@
> >  	/* process one CM event, fairness */
> >  	if(!ret) {
> >  		struct dapl_cm_id *conn;
> > -		int ret;
> >  				
> >  		/* set proper conn from cm_id context*/
> >  		if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
> > @@ -1059,24 +1059,9 @@
> >  		case RDMA_CM_EVENT_DISCONNECTED:
> >  			/* passive or active */
> >  			if (conn->sp) 
> > -				ret = dapli_cm_passive_cb(conn,event);
> > +				dapli_cm_passive_cb(conn,event);
> >  			else 
> > -				ret = dapli_cm_active_cb(conn,event);
> > -
> > -			/* destroy both qp and cm_id */
> > -			if (ret) {
> > -				dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> > -					     " cma_cb: DESTROY conn %p" 
> > -					     " cm_id %p qp %p\n",
> > -					     conn, conn->cm_id, 
> > -					     conn->cm_id->qp);
> > -	
> > -				if (conn->cm_id->qp)
> > -					rdma_destroy_qp(conn->cm_id);
> > -
> > -				rdma_destroy_id(conn->cm_id);
> > -				dapl_os_free(conn, sizeof(*conn));
> > -			}
> > +				dapli_cm_active_cb(conn,event);
> 
> If this code is removed, we'll need to update functions that set 
> conn->destroy to 1 to destroy the cm_id.
> 

I thought only dapli_destroy_conn() sets this, but now I see that
dapli_thread() can also set this so I'll go check this out.

> What happens if a consumer attempts to free the EP from a callback? 

There are no direct consumer callbacks in usermode are there?  consumers
call dat_evd_wait() or whatever and get scheduled.  Not like kernel
mode...  Or am I confused?

> With this change (or any one that blocked a callback thread from 
> attempting to free the EP), I believe we would deadlock. 
> 
> Is it possible to destroy CM IDs from within a callback? 

I'll go verify this again, but I don't think the callback dapl thread
does this.  

Stevo.





More information about the general mailing list