[openib-general] Re: SPAM: [PATCH] [RFC] - dapl - dat_ep_free() can return without freeing the endpoint

James Lentini jlentini at netapp.com
Tue Apr 4 13:57:01 PDT 2006


Comments below:

On Tue, 4 Apr 2006, Steve Wise wrote:

> Index: openib_cma/dapl_ib_cm.c
> ===================================================================
> --- openib_cma/dapl_ib_cm.c	(revision 6107)
> +++ openib_cma/dapl_ib_cm.c	(working copy)
> @@ -62,9 +62,9 @@
>  /* local prototypes */
>  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn, 
>  					  struct rdma_cm_event *event);
> -static int dapli_cm_active_cb(struct dapl_cm_id *conn, 
> +static void dapli_cm_active_cb(struct dapl_cm_id *conn, 
>  			      struct rdma_cm_event *event);
> -static int dapli_cm_passive_cb(struct dapl_cm_id *conn, 
> +static void dapli_cm_passive_cb(struct dapl_cm_id *conn, 
>  			       struct rdma_cm_event *event);
>  static void dapli_addr_resolve(struct dapl_cm_id *conn);
>  static void dapli_route_resolve(struct dapl_cm_id *conn);
> @@ -164,28 +164,34 @@
>  void dapli_destroy_conn(struct dapl_cm_id *conn)
>  {
>  	int in_callback;
> +	struct rdma_cm_id *cm_id;
>  
>  	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
>  		     " destroy_conn: conn %p id %d\n",
>  		     conn,conn->cm_id);
> -
>  	dapl_os_lock(&conn->lock);
>  	conn->destroy = 1;
>  	in_callback = conn->in_callback;
> -	dapl_os_unlock(&conn->lock);
> -
> -	if (!in_callback) {
> -		if (conn->ep)
> -			conn->ep->cm_handle = IB_INVALID_HANDLE;
> -		if (conn->cm_id) {
> -			if (conn->cm_id->qp)
> -				rdma_destroy_qp(conn->cm_id);
> -			rdma_destroy_id(conn->cm_id);
> +	do {
> +		if (in_callback) {
> +			dapl_os_unlock(&conn->lock);
> +			usleep(10);
> +			dapl_os_lock(&conn->lock);
>  		}
> -			
> -		conn->cm_id = NULL;
> -		dapl_os_free(conn, sizeof(*conn));
> +		in_callback = conn->in_callback;
> +	} while (in_callback);
> +
> +	if (conn->ep)
> +		conn->ep->cm_handle = IB_INVALID_HANDLE;
> +	cm_id = conn->cm_id;
> +	conn->cm_id = NULL;
> +	dapl_os_unlock(&conn->lock);
> +	if (cm_id) {
> +		if (cm_id->qp)
> +			rdma_destroy_qp(cm_id);
> +		rdma_destroy_id(cm_id);
>  	}
> +	dapl_os_free(conn, sizeof(*conn));
>  }

This is an improvement. If dat_ep_free() returns success, DAPL 
consumers expect to be able to free the free'd EP's 
resources (PZ, etc.). We must have been getting lucky on IB.

I'm worried that a callback could occur ...

  void dapli_destroy_conn(struct dapl_cm_id *conn)
  {
          int in_callback;
          struct rdma_cm_id *cm_id;
  
          dapl_dbg_log(DAPL_DBG_TYPE_CM,
                       " destroy_conn: conn %p id %d\n",
                       conn,conn->cm_id);

          dapl_os_lock(&conn->lock);
          conn->destroy = 1;
          in_callback = conn->in_callback;
          do {
                  if (in_callback) {
                          dapl_os_unlock(&conn->lock);
                          usleep(10);
                          dapl_os_lock(&conn->lock);
                  }
                  in_callback = conn->in_callback;
          } while (in_callback);
  
          if (conn->ep)
                  conn->ep->cm_handle = IB_INVALID_HANDLE;
          cm_id = conn->cm_id;
          conn->cm_id = NULL;
          dapl_os_unlock(&conn->lock);

/* ... here */

          if (cm_id) {
                  if (cm_id->qp)
                          rdma_destroy_qp(cm_id);
                  rdma_destroy_id(cm_id);
          }
          dapl_os_free(conn, sizeof(*conn));
  }

Destroying the cm_id while in a callback would be bad.

>  static struct dapl_cm_id * dapli_req_recv(struct dapl_cm_id *conn,
> @@ -243,11 +249,9 @@
>  	return new_conn;
>  }
>  
> -static int dapli_cm_active_cb(struct dapl_cm_id *conn,
> +static void dapli_cm_active_cb(struct dapl_cm_id *conn,
>  			      struct rdma_cm_event *event)
>  {
> -	int destroy;
> -
>  	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
>  		     " active_cb: conn %p id %d event %d\n",
>  		     conn, conn->cm_id, event->event );
> @@ -255,7 +259,7 @@
>  	dapl_os_lock(&conn->lock);
>  	if (conn->destroy) {
>  		dapl_os_unlock(&conn->lock);
> -		return 0;
> +		return;
>  	}
>  	conn->in_callback = 1;
>  	dapl_os_unlock(&conn->lock);
> @@ -303,16 +307,14 @@
>  	}
>  
>  	dapl_os_lock(&conn->lock);
> -	destroy = conn->destroy;
> -	conn->in_callback = conn->destroy;
> +	conn->in_callback = 0;
>  	dapl_os_unlock(&conn->lock);
> -	return(destroy);
> +	return;
>  }
>  
> -static int dapli_cm_passive_cb(struct dapl_cm_id *conn,
> +static void dapli_cm_passive_cb(struct dapl_cm_id *conn,
>  			       struct rdma_cm_event *event)
>  {
> -	int destroy;
>  	struct dapl_cm_id *new_conn;
>  
>  	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> @@ -322,7 +324,7 @@
>  	dapl_os_lock(&conn->lock);
>  	if (conn->destroy) {
>  		dapl_os_unlock(&conn->lock);
> -		return 0;
> +		return;
>  	}
>  	conn->in_callback = 1;
>  	dapl_os_unlock(&conn->lock);
> @@ -377,10 +379,9 @@
>  	}
>  
>  	dapl_os_lock(&conn->lock);
> -	destroy = conn->destroy;
> -	conn->in_callback = conn->destroy;
> +	conn->in_callback = 0;
>  	dapl_os_unlock(&conn->lock);
> -	return(destroy);
> +	return;
>  }
>  
> 
> @@ -1021,7 +1022,6 @@
>  	/* process one CM event, fairness */
>  	if(!ret) {
>  		struct dapl_cm_id *conn;
> -		int ret;
>  				
>  		/* set proper conn from cm_id context*/
>  		if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
> @@ -1059,24 +1059,9 @@
>  		case RDMA_CM_EVENT_DISCONNECTED:
>  			/* passive or active */
>  			if (conn->sp) 
> -				ret = dapli_cm_passive_cb(conn,event);
> +				dapli_cm_passive_cb(conn,event);
>  			else 
> -				ret = dapli_cm_active_cb(conn,event);
> -
> -			/* destroy both qp and cm_id */
> -			if (ret) {
> -				dapl_dbg_log(DAPL_DBG_TYPE_CM, 
> -					     " cma_cb: DESTROY conn %p" 
> -					     " cm_id %p qp %p\n",
> -					     conn, conn->cm_id, 
> -					     conn->cm_id->qp);
> -	
> -				if (conn->cm_id->qp)
> -					rdma_destroy_qp(conn->cm_id);
> -
> -				rdma_destroy_id(conn->cm_id);
> -				dapl_os_free(conn, sizeof(*conn));
> -			}
> +				dapli_cm_active_cb(conn,event);

If this code is removed, we'll need to update functions that set 
conn->destroy to 1 to destroy the cm_id.

What happens if a consumer attempts to free the EP from a callback? 
With this change (or any one that blocked a callback thread from 
attempting to free the EP), I believe we would deadlock. 

Is it possible to destroy CM IDs from within a callback? 



More information about the general mailing list