[ofw] [PATCH 3/4] DAPL v2: ucm: For UD type QP's, return CR p_data with CONN_EST event on passive side.

Davis, Arlin R arlin.r.davis at intel.com
Wed Sep 9 15:14:44 PDT 2009


Intel MPI uses the p_data provided with CONN_EST as a reference to the
UD pair and remote rank. The ucm provider was overwriting the CR p_data
with the ACCEPT p_data. Change to save the CR p_data and also provide
storage for the user-provided ACCEPT p_data in case the REPLY is lost
and needs to be retransmitted.

The p_data size was being passed to event processing in network byte
order instead of host byte order; convert it before posting the event.

For new QPs, create new address handles instead of reusing the
existing AHs created for the CM, since a different PD is
associated with each.

Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
 dapl/openib_common/dapl_ib_dto.h |    2 +-
 dapl/openib_scm/cm.c             |    4 +-
 dapl/openib_ucm/cm.c             |   62 +++++++++++++++++++++++++------------
 dapl/openib_ucm/dapl_ib_util.h   |    2 +
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/dapl/openib_common/dapl_ib_dto.h b/dapl/openib_common/dapl_ib_dto.h
index e6c03b2..b93565c 100644
--- a/dapl/openib_common/dapl_ib_dto.h
+++ b/dapl/openib_common/dapl_ib_dto.h
@@ -346,7 +346,7 @@ dapls_ib_post_ext_send (
 		dapl_dbg_log(DAPL_DBG_TYPE_EP, 
 			     " post_ext: OP_SEND_UD ah=%p"
 			     " qp_num=0x%x\n",
-			     remote_ah, remote_ah->qpn);
+			     remote_ah->ah, remote_ah->qpn);
 		
 		wr.opcode = OP_SEND;
 		wr.wr.ud.ah = remote_ah->ah;
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 8560788..2403918 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -795,7 +795,7 @@ ud_bail:
 				(DAPL_EVD *) ep_ptr->param.connect_evd_handle,
 				event,
 				(DAT_EP_HANDLE) ep_ptr,
-				(DAT_COUNT) cm_ptr->msg.p_size,
+				(DAT_COUNT) exp,
 				(DAT_PVOID *) cm_ptr->msg.p_data,
 				(DAT_PVOID *) &xevent);
 
@@ -1213,7 +1213,7 @@ void dapli_socket_accept_rtu(dp_ib_cm_handle_t cm_ptr)
 				cm_ptr->ep->param.connect_evd_handle,
 				DAT_IB_UD_CONNECTION_EVENT_ESTABLISHED,
 				(DAT_EP_HANDLE) cm_ptr->ep,
-				(DAT_COUNT) cm_ptr->msg.p_size,
+				(DAT_COUNT) ntohs(cm_ptr->msg.p_size),
 				(DAT_PVOID *) cm_ptr->msg.p_data,
 				(DAT_PVOID *) &xevent);
 
diff --git a/dapl/openib_ucm/cm.c b/dapl/openib_ucm/cm.c
index aa6bb73..5c5287f 100644
--- a/dapl/openib_ucm/cm.c
+++ b/dapl/openib_ucm/cm.c
@@ -182,7 +182,7 @@ static int dapl_select(struct dapl_fd_set *set)
 static void ucm_accept(ib_cm_srvc_handle_t cm, ib_cm_msg_t *msg);
 static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg);
 static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg);
-static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg);
+static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg, DAT_PVOID p_data, DAT_COUNT p_size);
 DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm);
 
 #define UCM_SND_BURST	100
@@ -304,7 +304,7 @@ static int ucm_reject(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
 		     ntohs(smsg.daddr.ib.lid),
 		     ntohl(smsg.dqpn), ntohs(smsg.dport));
 
-	return (ucm_send(tp, &smsg));
+	return (ucm_send(tp, &smsg, NULL, 0));
 }
 
 static void ucm_process_recv(ib_hca_transport_t *tp, 
@@ -489,7 +489,7 @@ retry:
 }
 
 /* ACTIVE/PASSIVE: build and send CM message out of CM object */
-static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
+static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg, DAT_PVOID p_data, DAT_COUNT p_size)
 {
 	ib_cm_msg_t *smsg = NULL;
 	struct ibv_send_wr wr, *bad_wr;
@@ -502,8 +502,10 @@ static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
 	if ((smsg = ucm_get_smsg(tp)) == NULL)
 		goto bail;
 
-	len = ((sizeof(*msg) - DCM_MAX_PDATA_SIZE) + ntohs(msg->p_size));
+	len = (sizeof(*msg) - DCM_MAX_PDATA_SIZE);
 	dapl_os_memcpy(smsg, msg, len);
+	if (p_size)
+		dapl_os_memcpy(&smsg->p_data, p_data, p_size);
 
 	wr.next = NULL;
         wr.sg_list = &sge;
@@ -514,13 +516,13 @@ static int ucm_send(ib_hca_transport_t *tp, ib_cm_msg_t *msg)
 	if (len <= tp->max_inline_send)
 		wr.send_flags |= IBV_SEND_INLINE; 
 
-        sge.length = len;
+        sge.length = len + p_size;
         sge.lkey = tp->mr_sbuf->lkey;
         sge.addr = (uintptr_t)smsg;
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM, 
 		" ucm_send: op %d ln %d lid %x c_qpn %x rport %d\n", 
-		ntohs(smsg->op), len, htons(smsg->daddr.ib.lid), 
+		ntohs(smsg->op), sge.length, htons(smsg->daddr.ib.lid), 
 		htonl(smsg->dqpn), htons(smsg->dport));
 
 	/* empty slot, then create AH */
@@ -717,7 +719,7 @@ DAT_RETURN dapli_cm_disconnect(dp_ib_cm_handle_t cm)
 	} else {
 		/* send disc, schedule destroy */
 		cm->msg.op = htons(DCM_DREQ);
-		if (ucm_send(&cm->hca->ib_trans, &cm->msg)) {
+		if (ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0)) {
 			dapl_log(DAPL_DBG_TYPE_WARN, 
 				 " disc_req: ERR-> %s lid %d qpn %d"
 				 " r_psp %d \n", strerror(errno), 
@@ -788,7 +790,8 @@ dapli_cm_connect(DAPL_EP *ep, dp_ib_cm_handle_t cm)
 	dapl_os_unlock(&cm->lock);
 
 	cm->msg.op = htons(DCM_REQ);
-	if (ucm_send(&cm->hca->ib_trans, &cm->msg)) 		
+	if (ucm_send(&cm->hca->ib_trans, &cm->msg, 
+		     &cm->msg.p_data, ntohs(cm->msg.p_size))) 		
 		goto bail;
 
 	/* first time through, put on work queue */
@@ -910,10 +913,10 @@ static void ucm_connect_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 	}
 	dapl_os_unlock(&cm->ep->header.lock);
 	
-	/* Send RTU */
+	/* Send RTU, no private data */
 	cm->msg.op = htons(DCM_RTU);
 	
-	if (ucm_send(&cm->hca->ib_trans, &cm->msg)) 		
+	if (ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0)) 		
 		goto bail;
 
 	/* init cm_handle and post the event with private data */
@@ -929,11 +932,21 @@ ud_bail:
 		/* post EVENT, modify_qp, AH already created, ucm msg */
 		xevent.status = 0;
 		xevent.type = DAT_IB_UD_REMOTE_AH;
-		xevent.remote_ah.ah = cm->hca->ib_trans.ah[lid];
 		xevent.remote_ah.qpn = ntohl(cm->msg.daddr.ib.qpn);
+		xevent.remote_ah.ah = dapls_create_ah(cm->hca, 
+						      cm->ep->qp_handle->pd, 
+						      cm->ep->qp_handle, 
+						      htons(lid), 
+						      NULL);
+		if (xevent.remote_ah.ah == NULL) {
+			event = IB_CME_LOCAL_FAILURE;
+			goto bail;
+		}
+
 		dapl_os_memcpy(&xevent.remote_ah.ia_addr,
 			       &cm->msg.daddr,
 			       sizeof(union dcm_addr));
+
 		/* remote ia_addr reference includes ucm qpn, not IB qpn */
 		((union dcm_addr*)
 			&xevent.remote_ah.ia_addr)->ib.qpn = cm->msg.dqpn;
@@ -1086,10 +1099,6 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 	cm->state = DCM_CONNECTED;
 	dapl_os_unlock(&cm->lock);
 	
-	if (msg->p_size) 
-		dapl_os_memcpy(cm->msg.p_data, 
-			       msg->p_data, ntohs(msg->p_size));
-
 	/* final data exchange if remote QP state is good to go */
 	dapl_dbg_log(DAPL_DBG_TYPE_CM, " PASSIVE: connected!\n");
 
@@ -1101,11 +1110,19 @@ static void ucm_accept_rtu(dp_ib_cm_handle_t cm, ib_cm_msg_t *msg)
 		/* post EVENT, modify_qp, AH already created, ucm msg */
 		xevent.status = 0;
 		xevent.type = DAT_IB_UD_PASSIVE_REMOTE_AH;
-		xevent.remote_ah.ah = cm->hca->ib_trans.ah[lid];
 		xevent.remote_ah.qpn = ntohl(cm->msg.daddr.ib.qpn);
+		xevent.remote_ah.ah = dapls_create_ah(cm->hca, 
+						      cm->ep->qp_handle->pd, 
+						      cm->ep->qp_handle, 
+						      htons(lid), 
+						      NULL);
+		if (xevent.remote_ah.ah == NULL) 
+			goto bail;
+
 		dapl_os_memcpy(&xevent.remote_ah.ia_addr,
 			       &cm->msg.daddr,
 			        sizeof(union dcm_addr));
+
 		/* remote ia_addr reference includes ucm qpn, not IB qpn */
 		((union dcm_addr*)
 			&xevent.remote_ah.ia_addr)->ib.qpn = cm->msg.dqpn;
@@ -1238,9 +1255,14 @@ dapli_accept_usr(DAPL_EP *ep, DAPL_CR *cr, DAT_COUNT p_size, DAT_PVOID p_data)
 	cm->msg.saddr.ib.port_num = cm->hca->port_num;
 	cm->msg.saddr.ib.lid = cm->hca->ib_trans.addr.ib.lid; 
 	cm->msg.saddr.ib.gid = cm->hca->ib_trans.addr.ib.gid; 
-	dapl_os_memcpy(&cm->msg.p_data, p_data, p_size);
-		
-	if (ucm_send(&cm->hca->ib_trans, &cm->msg)) 		
+
+	/* 
+	 * UD: deliver p_data with REQ and EST event, keep REQ p_data in 
+	 * cm->msg.p_data and save REPLY accept data in cm->p_data for retries 
+	 */
+	cm->p_size = p_size;
+	dapl_os_memcpy(&cm->p_data, p_data, p_size);
+	if (ucm_send(&cm->hca->ib_trans, &cm->msg, p_data, p_size)) 		
 		goto bail;
 
 	/* save state and setup valid reference to EP, HCA */
@@ -1565,7 +1587,7 @@ dapls_ib_reject_connection(IN dp_ib_cm_handle_t cm,
 	if (psize)
 		dapl_os_memcpy(&cm->msg.p_data, pdata, psize);
 		
-	if (ucm_send(&cm->hca->ib_trans, &cm->msg)) {
+	if (ucm_send(&cm->hca->ib_trans, &cm->msg, NULL, 0)) {
 		dapl_log(DAPL_DBG_TYPE_WARN,
 			 " cm_reject: ERR: %s\n", strerror(errno));
 		return DAT_INTERNAL_ERROR;
diff --git a/dapl/openib_ucm/dapl_ib_util.h b/dapl/openib_ucm/dapl_ib_util.h
index dfee2b9..ef5358a 100644
--- a/dapl/openib_ucm/dapl_ib_util.h
+++ b/dapl/openib_ucm/dapl_ib_util.h
@@ -45,6 +45,8 @@ struct ib_cm_handle
 	struct dapl_hca		*hca;
 	struct dapl_sp		*sp;	
 	struct dapl_ep 		*ep;
+	uint16_t		p_size; /* accept p_data, for retries */
+	uint8_t			p_data[DCM_MAX_PDATA_SIZE];
 	ib_cm_msg_t		msg;
 };
 
-- 
1.5.2.5




More information about the ofw mailing list