[openib-general][PATCH][RFC]: CMA IB implementation

Mon Sep 19 10:59:16 PDT 2005

Guy German wrote:
> #define CMA_TARGET_MAX		4
> #define CMA_INITIATOR_DEPTH	4
> #define CMA_RC_RETRY_COUNT	7
> #define CMA_RNR_RETRY_COUNT	6
> #define CMA_CM_RESPONSE_TIMEOUT	20 /* 4 sec */
> #define CMA_MAX_CM_RETRIES	0

Are these values hard-coded just for the initial implementation?  How would 
these change?

> enum cma_close_flags {
> 	CMA_CLOSE_ABRUPT = 0,
> 	CMA_CLOSE_GRACEFUL
> };

Not sure what these are for.  Why not have the user always destroy the cma_id? 
If it hasn't yet been destroyed when a disconnect comes in, callback the user. 
If a connection hasn't been disconnected when it is destroyed, automatically 
send a disconnect message.

> struct accept_callback {
> 	ib_cma_ac_handler func;
> 	void *context;
> };
> 
> struct listen_callback {
> 	ib_cma_listen_handler func;
> 	void *context;
> };

These could be eliminated if we just associated a context with a cma_id and left 
it at that.  I think we're asking for race conditions if we try to update the 
context with every callback.

> static int cma_modify_qp_state(struct ib_cm_id *cm_id, struct ib_qp *qp,
> 				      enum ib_qp_state qp_state, 
> 				      int qp_attr_mask)
> {
> 	struct ib_qp_attr qp_attr;
> 	int status = 0;
> 	printk(KERN_DEBUG PFX "%s: enter >>> modify to %d\n",
> 	       __func__, qp_state);
> 
> 	if (qp == NULL)
> 		return -EINVAL;

We shouldn't need checks like this in the kernel.

> 	memset(&qp_attr, 0, sizeof qp_attr);
> 	qp_attr.qp_state = qp_state;
> 
> 	if (cm_id && !qp_attr_mask)

Or this check...

> static int destroy_cma_ctx(struct cma_context *cma_ctx)
> {
> 	if(!IS_ERR(cma_ctx->cm_id))
> 		ib_destroy_cm_id(cma_ctx->cm_id);
> 	if (cma_ctx->cma_param.private_data)
> 		kfree(cma_ctx->cma_param.private_data);

Is this the outbound private data or inbound?  Why not tie the private data to 
an event and avoid storing it with the cma_ctx?

> 	if (cma_ctx)
> 		kfree(cma_ctx);
> 	
> 	cma_ctx = NULL;

Is there a reason to set this to NULL?

> 	return 0;
> }

> static int cma_disconnect(struct ib_qp *qp,
> 			  struct cma_context *cma_ctx, 
> 			  enum cma_close_flags cflags)
> {
> 	int status;
> 
> 	if (cma_ctx == NULL)
> 		goto modqp;

We shouldn't need this check.

> 	if (cflags == CMA_CLOSE_ABRUPT)
> 		status = destroy_cma_ctx(cma_ctx);

Why would this call fail?  What would the user do if it does?

> 	else if (cflags == CMA_CLOSE_GRACEFUL){
> 		status = ib_send_cm_dreq(cma_ctx->cm_id, NULL, 0);
> 	}

See comments above.  Eliminate the GRACEFUL/ABRUPT flags and just let the user 
either issue the disconnect or just destroy the cma_ctx.

> modqp:
> 	status = cma_modify_qp_state(0, qp, IB_QPS_ERR, IB_QP_STATE);
> 	return status;
> }
> 
> void cma_connection_callback(struct cma_context *cma_ctx,
> 				const enum ib_cma_event event,
> 				const void *private_data)
> {
> 	ib_cma_event_handler conn_cb;
> 	struct ib_qp *qp = cma_ctx->cma_conn.qp;
> 	int status;
> 
> 	conn_cb = cma_ctx->cma_conn.cma_event_handler;
> 	
> 	switch (event) {
> 	case IB_CMA_EVENT_ESTABLISHED:
> 		break;
> 	case IB_CMA_EVENT_DISCONNECTED:
> 	case IB_CMA_EVENT_REJECTED:
> 	case IB_CMA_EVENT_UNREACHABLE:
> 	case IB_CMA_EVENT_NON_PEER_REJECTED:
> 		status = cma_disconnect(qp, cma_ctx, CMA_CLOSE_ABRUPT);

This is destroying the cma_ctx without the user knowing it.  The dereference to 
cma_ctx below will crash.  We shouldn't take any action on behalf of the user. 
Simply report the error and let the user destroy the cma_id.

> 		break;
> 	default:
> 		printk(KERN_ERR PFX "%s: unknown event !!\n", __func__);
> 	}
> 
> 	printk(KERN_DEBUG PFX "%s: event=%d\n", __func__, event);
> 
> 	conn_cb(event, cma_ctx->cma_conn.context, private_data);
> }
> 
> static inline int cma_rep_recv(struct cma_context *cma_ctx,
> 			 struct ib_cm_event *rep_cm_event)
> {
> 	int status;
> 
> 	status = cma_modify_qp_state(cma_ctx->cm_id, cma_ctx->cma_conn.qp,
> 				     IB_QPS_RTR, 0);
> 	if (status) {
> 		printk(KERN_ERR PFX "%s: fail to modify QPS_RTR %d\n", 
> 		       __func__, status);
> 		return status;
> 	}
> 
> 	status = cma_modify_qp_state(cma_ctx->cm_id, cma_ctx->cma_conn.qp,
> 				     IB_QPS_RTS, 0);
> 	if (status) {
> 		printk(KERN_ERR PFX "%s: fail to modify QPS_RTR %d\n", 
> 		       __func__, status);
> 		return status;
> 	}
> 
> 	status = ib_send_cm_rtu(cma_ctx->cm_id, NULL, 0);
> 	if (status) {
> 		printk(KERN_ERR PFX "%s: fail to send cm rtu %d\n", 
> 		       __func__, status);
> 		return status;
> 	}
> 
> 	return 0;
> }
> 
> int cma_active_cb_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
> {
> 	int status = 0;
> 	enum ib_cma_event cma_event = 0;
> 	struct cma_context *cma_ctx = cm_id->context;
> 
> 	printk(KERN_DEBUG PFX "%s: enter >>> cm_id=%p cma_ctx=%p\n",__func__, 
> 		cm_id, cma_ctx);
> 
> 	switch (event->event) {
> 	case IB_CM_REQ_ERROR:
> 		cma_event = IB_CMA_EVENT_UNREACHABLE;
> 		break;
> 	case IB_CM_REJ_RECEIVED:
> 		cma_event = IB_CMA_EVENT_NON_PEER_REJECTED;
> 		break;
> 	case IB_CM_DREP_RECEIVED:
> 	case IB_CM_TIMEWAIT_EXIT:
> 		cma_event = IB_CMA_EVENT_DISCONNECTED;
> 		break;
> 	case IB_CM_REP_RECEIVED:
> 		status = cma_rep_recv(cma_ctx, event);
> 		if (!status)
> 			cma_event = IB_CMA_EVENT_ESTABLISHED;
> 		else
> 			cma_event = IB_CMA_EVENT_DISCONNECTED;
> 		break;
> 	case IB_CM_DREQ_RECEIVED:
> 		ib_send_cm_drep(cm_id, NULL, 0);
> 		cma_event = IB_CMA_EVENT_DISCONNECTED;
> 		break;
> 	case IB_CM_DREQ_ERROR:
> 		break;
> 	default:
> 		printk(KERN_WARNING PFX "%s: cm event (%d) not handled\n",
> 			__func__, event->event);
> 		break;
> 	}
> 
> 	printk(KERN_WARNING PFX "%s: cm_event=%d cma_event=%d\n",
> 		__func__, event->event, cma_event);
> 
> 	if (cma_event)
This check isn't needed.
> 		cma_connection_callback(cma_ctx, cma_event, 
> 					event->private_data);
> 
> 	return status;
Returning non-zero will destroy the underlying cm_id.  We can avoid some 
synchronization by letting it exist until the user destroys the corresponding 
cma_id.  Otherwise, there's the potential of the user trying to destroy it 
twice.  Once from the cma_connection_callback reporting an error, and then again 
here.

> }
> 
> static struct cma_context *get_cma_ctx(struct ib_cm_id *cm_id,
> 				       struct ib_cm_event *event)
> {
> 	struct cma_context *new_cma_ctx;
> 	int status;
> 
> 	
> 	if (event->event != IB_CM_REQ_RECEIVED)
> 		return cm_id->context;
> 
> 	if (((struct cma_context *)cm_id->context)->cm_id != cm_id) 
> 		printk(KERN_DEBUG PFX "%s: old_cm_id=%p new_cm_id=%p\n", 
> 		       __func__, ((struct cma_context *)cm_id->context)->cm_id,
> 		       cm_id);

This check shouldn't be needed.

> 	new_cma_ctx = kmalloc(sizeof *new_cma_ctx, GFP_KERNEL);
> 	if (!new_cma_ctx) {
> 		status = ib_send_cm_rej(cm_id,
> 					IB_CM_REJ_CONSUMER_DEFINED,
> 					NULL, 0, NULL, 0);
> 		return NULL;
> 	}
> 
> 	memset(new_cma_ctx, 0, sizeof *new_cma_ctx);
> 	new_cma_ctx->cm_id = cm_id;
> 	new_cma_ctx->creq_cma_ctx = cm_id->context;
> 	cm_id->context = new_cma_ctx;
> 
> 	return new_cma_ctx;
> }
> 
> static void cma_path_handler(u64 req_id, void *context, int rec_num)
> {
> 	struct cma_context *cma_ctx = context;
> 	enum ib_cma_event event;
> 	int status = 0;
> 
> 	if (!cma_ctx) {
This check isn't needed.
> 		printk(KERN_ERR PFX "%s: context received null\n",__func__);
> 		return;
> 	}
> 
> 	if (rec_num <= 0) {
> 		event = IB_CMA_EVENT_UNREACHABLE;
> 		goto error;
> 	}
> 
>         cma_ctx->cma_param.primary_path = &cma_ctx->cma_path;
> 	cma_ctx->cma_param.alternate_path = NULL;
> 
> 	printk(KERN_DEBUG PFX "%s: dlid=%d slid=%d pkey=%d mtu=%d sid=%llx "
> 		"qpn=%d qpt=%d psn=%d prd=%s respres=%d rcm=%d flc=%d "
> 		"cmt=%d rtrc=%d rntrtr=%d maxcm=%d \n",__func__,
> 		cma_ctx->cma_param.primary_path->dlid ,
> 		cma_ctx->cma_param.primary_path->slid ,
> 		cma_ctx->cma_param.primary_path->pkey ,
> 		cma_ctx->cma_param.primary_path->mtu ,
> 		cma_ctx->cma_param.service_id,
> 		cma_ctx->cma_param.qp_num,
> 		cma_ctx->cma_param.qp_type,
> 		cma_ctx->cma_param.starting_psn,
> 		(char *)cma_ctx->cma_param.private_data,
> 		cma_ctx->cma_param.responder_resources,
> 		cma_ctx->cma_param.remote_cm_response_timeout,
> 		cma_ctx->cma_param.flow_control,
> 		cma_ctx->cma_param.local_cm_response_timeout,
> 		cma_ctx->cma_param.retry_count,
> 		cma_ctx->cma_param.rnr_retry_count,
> 		cma_ctx->cma_param.max_cm_retries);
> 
> 	status = ib_send_cm_req(cma_ctx->cm_id, &cma_ctx->cma_param);
> 	if (status) {
> 		printk(KERN_ERR PFX "%s: cm_req failed %d\n",__func__, status);
> 		event = IB_CMA_EVENT_REJECTED;
> 		goto error;
> 	}
> 
> 	return;
> 
> error:
> 	printk(KERN_ERR PFX "%s: return error %d \n",__func__, status);
> 	cma_connection_callback(cma_ctx, event, NULL);
> }
> 
> static void cma_route_handler(u64 req_id, void *context, int rec_num)
> {
> 	struct cma_context *cma_ctx = context;
> 	enum ib_cma_event event;
> 	int status = 0;
> 	
> 	if (rec_num <= 0) {
> 		event = IB_CMA_EVENT_UNREACHABLE;
> 		goto error;
> 	}
> 	cma_ctx->ibat_comp.fn = &cma_path_handler;
> 	cma_ctx->ibat_comp.context = cma_ctx;
> 
> 	status = ib_at_paths_by_route(&cma_ctx->cma_route, 0,
> 				      &cma_ctx->cma_path, 1,
> 				      &cma_ctx->ibat_comp);
> 
> 	if (status) {
> 		event = IB_CMA_EVENT_DISCONNECTED;
> 		goto error;
> 	}
> 	return;
> 
> error:
> 	printk(KERN_ERR PFX "%s: return error %d \n",__func__, status);
> 	cma_connection_callback(cma_ctx, event ,NULL);
> }
> 
> /* API functions */
> 
> int ib_cma_create_qp(struct ib_pd *pd, u8 port, struct ib_qp **qp_in, 
> 		     struct ib_qp_init_attr *init_attr)
> {
Why not return struct ib_qp* similar to how the other APIs operate?

> 	struct ib_qp_attr qp_attr;
> 	int qp_attr_mask;
> 	struct ib_qp *qp;
> 
> 	qp = ib_create_qp(pd, init_attr);
> 	if (IS_ERR(qp))
> 		return IS_ERR(qp);
> 	*qp_in = qp;
> 	printk(KERN_DEBUG PFX "%s: qp created (%p)\n",__func__, qp);
> 	memset(&qp_attr, 0, sizeof qp_attr);
> 
> 	qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | 
> 		       IB_QP_PKEY_INDEX | IB_QP_PORT;
> 
> 	qp_attr.qp_access_flags = IB_ACCESS_REMOTE_READ | 
> 				  IB_ACCESS_REMOTE_WRITE;
> 	qp_attr.qp_state = IB_QPS_INIT;
> 	qp_attr.pkey_index = 0;
> 	qp_attr.port_num = port;
> 	printk(KERN_DEBUG PFX "%s: call ib_modify_qp (qp_attr_mask=%x)\n",
> 		__func__, qp_attr_mask);
> 	
> 	return ib_modify_qp(qp, &qp_attr, qp_attr_mask);

If modify fails, the QP should be destroyed.

> };
> EXPORT_SYMBOL(ib_cma_create_qp);
> 
> int ib_cma_connect(struct ib_cma_conn *cma_conn, void **cma_id)
> {
> 	struct cma_context *cma_ctx;
> 	int status;
> 	u32 timeout;
> 	u32 dst_ip;
> 	dst_ip = (((struct sockaddr_in *)(cma_conn->dst_ip))->sin_addr).s_addr;
> 
> 	printk(KERN_DEBUG PFX "%s: enter >>> dst_ip=%d.%d.%d.%d\n",__func__,
> 		(dst_ip & 0x000000ff), 
> 		(dst_ip & 0x0000ff00) >> 8, 
> 		(dst_ip & 0x00ff0000) >> 16, 
> 		(dst_ip & 0xff000000) >> 24); 
> 	cma_ctx = kmalloc(sizeof *cma_ctx, GFP_KERNEL);
> 	if (!cma_ctx) 
> 		return -ENOMEM;
> 	memset(cma_ctx, 0, sizeof *cma_ctx);
> 
> 	timeout = us_to_cmt(cma_conn->qp_attr->timeout);
> 
> 	cma_ctx->cm_id = ib_create_cm_id(cma_conn->device, 
> 					 cma_active_cb_handler,
> 					 (void *)cma_ctx);
> 	if (IS_ERR(cma_ctx->cm_id)) {
> 		printk(KERN_ERR PFX "%s: cm_id creation failed\n", __func__);
> 		destroy_cma_ctx(cma_ctx);
> 		return -EAGAIN;
> 	}
> 	else
> 		printk(KERN_DEBUG PFX "%s: cm_id created %p\n", __func__, 
> 		       cma_ctx->cm_id);
> 
> 	printk(KERN_DEBUG PFX "%s: cma_event_handler=%p\n", __func__, 
> 		       cma_conn->cma_event_handler );
> 	memcpy(&cma_ctx->cma_conn, cma_conn, sizeof *cma_conn);
> 	cma_ctx->cma_param.service_id = cma_conn->service_id;
> 	cma_ctx->cma_param.qp_num = cma_conn->qp->qp_num;
> 	cma_ctx->cma_param.qp_type = IB_QPT_RC;
> 	cma_ctx->cma_param.private_data = kmalloc(cma_conn->private_data_len,
> 						  GFP_KERNEL);
> 	memcpy((u8 *)cma_ctx->cma_param.private_data, 
> 	       (u8 *)cma_conn->private_data, cma_conn->private_data_len);
> 	cma_ctx->cma_param.private_data_len = cma_conn->private_data_len;
> 	cma_ctx->cma_param.responder_resources = CMA_TARGET_MAX;
> 	cma_ctx->cma_param.initiator_depth = CMA_INITIATOR_DEPTH;
> 	cma_ctx->cma_param.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; //timeout;
> 	cma_ctx->cma_param.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
> 	cma_ctx->cma_param.retry_count = CMA_RC_RETRY_COUNT;
> 	cma_ctx->cma_param.rnr_retry_count = CMA_RNR_RETRY_COUNT;
> 	cma_ctx->cma_param.max_cm_retries = CMA_MAX_CM_RETRIES;
> 	cma_ctx->ibat_comp.fn = &cma_route_handler;
> 	cma_ctx->ibat_comp.context = cma_ctx;
> 
> 	status = ib_at_route_by_ip(dst_ip, 0, 0, 0, &cma_ctx->cma_route,
> 				   &cma_ctx->ibat_comp);
> 	if (status < 0) {
> 		printk(KERN_ERR PFX " ib_at_route_by_ip failed (%d)\n", 
> 		       status);
> 		destroy_cma_ctx(cma_ctx);
> 		return -EAGAIN;
> 	}
> 
> 	if (status == 1) {
> 		printk(KERN_DEBUG PFX "%s: immidiate route - call "
> 		       "route_handler\n",__func__);
> 		cma_route_handler(cma_ctx->ibat_comp.req_id, cma_ctx, 1);
> 	}	
> 	*cma_id = (void *)cma_ctx;	
> 	return 0;
> };
> EXPORT_SYMBOL(ib_cma_connect);
> 
> int ib_cma_disconnect(struct ib_qp *qp, void *cma_id)
> {
> 	struct cma_context *cma_ctx = cma_id;
> 	int status;
> 	
> 	status = cma_disconnect(qp, cma_ctx, CMA_CLOSE_ABRUPT);
> 	        
> 	return status;
> };
> EXPORT_SYMBOL(ib_cma_disconnect);
> 
> int ib_cma_listen(struct ib_device *device, struct sockaddr *address,
> 		  __be64 service_id, void *context,
> 		  ib_cma_listen_handler cm_listen_handler,
> 		  void **cma_id)
> {
> 	struct cma_context *cma_ctx;
> 	int status;
> 
> 	printk(KERN_DEBUG PFX "%s: enter >> \n",__func__);
> 	cma_ctx = kmalloc(sizeof *cma_ctx, GFP_KERNEL);
> 	if (!cma_ctx) 
> 		return -ENOMEM;
> 	memset(cma_ctx, 0, sizeof *cma_ctx);
> 	cma_ctx->listen_cb.func = cm_listen_handler;
> 	cma_ctx->listen_cb.context = context;
> 	cma_ctx->cm_id = ib_create_cm_id(device, cma_passive_cb_handler,
> 					 (void *)cma_ctx);
> 	if (IS_ERR(cma_ctx->cm_id)) {
> 		printk(KERN_ERR PFX "%s: cm_id creation failed\n", __func__);
> 		destroy_cma_ctx(cma_ctx);
> 		return -EAGAIN;
> 	}
> 	else
> 		printk(KERN_DEBUG PFX "%s: cm_id created %p\n", __func__, 
> 		       cma_ctx->cm_id);
> 	/* `address` is ignored at the moment ... */
> 	status = ib_cm_listen(cma_ctx->cm_id, service_id, 0);
> 	if (status) {
> 		printk(KERN_ERR PFX "%s: cm_listen failed %d\n", __func__, 
> 		       status);
> 		destroy_cma_ctx(cma_ctx);
> 		return status;
> 	}
> 	printk(KERN_INFO PFX "%s:cm_id=%p cma_id=%p\n", __func__, 
> 	       cma_ctx->cm_id, cma_ctx);
> 
> 	*cma_id = (void *)cma_ctx;	
> 	return 0;
> };
> EXPORT_SYMBOL(ib_cma_listen);
> 
> int ib_cma_destroy(void *cma_id)
> {
> 	return destroy_cma_ctx((struct cma_context *)cma_id);
> };
> EXPORT_SYMBOL(ib_cma_destroy);
> 
> int ib_cma_accept(void *cma_id, struct ib_qp *qp, 
> 		  const void *private_data, u8 private_data_len,
> 		  void *context, ib_cma_ac_handler cm_accept_handler)
> {
> 	struct cma_context *cma_ctx = cma_id;
> 	struct ib_cm_rep_param passive_params;
> 	int status;
> 
> 	printk(KERN_DEBUG PFX "%s: enter >> private_data = %s (len=%d)\n",
> 		__func__, (char *)private_data, private_data_len);
> 
> 	if (private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE) {
> 		status = -EINVAL;
> 		goto reject;
> 	}
> 
> 	memset(&passive_params, 0, sizeof passive_params);
> 	passive_params.private_data = private_data;
> 	passive_params.private_data_len = private_data_len;
> 	passive_params.qp_num = qp->qp_num;
> 	passive_params.responder_resources = CMA_TARGET_MAX;
> 	passive_params.initiator_depth = CMA_INITIATOR_DEPTH;
> 	passive_params.rnr_retry_count = CMA_RNR_RETRY_COUNT;
> 
> 	status = cma_modify_qp_state(cma_ctx->cm_id, qp, IB_QPS_RTR, 0);
> 	if (status)
> 		goto reject;
> 	
> 	cma_ctx->accept_cb.func = cm_accept_handler;
> 	cma_ctx->accept_cb.context = context;
> 	status = ib_send_cm_rep(cma_ctx->cm_id, &passive_params);
> 	if (status)
> 		goto reject;
> 
> 	printk(KERN_DEBUG PFX "%s: return success\n", __func__);
> 	return 0;
> 
> reject:
> 	printk(KERN_ERR PFX "%s: error status %d\n", __func__, status);
> 	ib_send_cm_rej(cma_ctx->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
> 			NULL, 0, NULL, 0);
> 	destroy_cma_ctx(cma_ctx);
> 	return status;
> };
> EXPORT_SYMBOL(ib_cma_accept);
> 
> int ib_cma_reject(void *cma_id, const void *private_data,
> 		  u8 private_data_len)
> {
> 	struct cma_context *cma_ctx = cma_id;
> 	int status;
> 
> 	if (cma_ctx == NULL)
This check isn't needed.
> 		return -EINVAL;
> 	
> 	status = ib_send_cm_rej(cma_ctx->cm_id, IB_CM_REJ_CONSUMER_DEFINED,
> 				NULL, 0, private_data, private_data_len);
> 	destroy_cma_ctx(cma_ctx);
> 
> 	return status;
> };
> EXPORT_SYMBOL(ib_cma_reject);

As a general statement, can we reduce the number of debug print statements?

- Sean