[openib-general] [PATCH] uDAPL openib uAT retry fixes

Arlin Davis arlin.r.davis at intel.com
Wed Aug 3 13:34:46 PDT 2005


James,

Please review the following uDAPL patch; it fixes my broken uAT retry code.

Thanks,

-arlin

 
Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>


Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c	(revision 2970)
+++ dapl/openib/dapl_ib_util.c	(working copy)
@@ -128,21 +128,34 @@ int dapli_get_hca_addr( struct dapl_hca 
 	at_comp.context = &at_rec; 
 	at_rec.addr = &hca_ptr->hca_address;
 	at_rec.wait_object = &hca_ptr->ib_trans.wait_object;
+	at_rec.hca_ptr = hca_ptr;
+	at_rec.retries = 0;
 
 	/*  call with async_comp until the sync version works */
 	status = ib_at_ips_by_gid(&hca_ptr->ib_trans.gid, &ipv4_addr->sin_addr.s_addr, 1, 
 				  &at_comp, &at_rec.req_id);
 	
-	if (status < 0) 
+	if (status < 0) {
+		dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
+			      " get_hca_addr: ERR ips_by_gid %d %s \n",
+				status, strerror(errno));
 		return 1;
+	}
  
-        if (status > 0)
-                dapli_ip_comp_handler(at_rec.req_id, (void*)ipv4_addr, status);
-	
-	/* wait for answer, 5 seconds max */
-	dat_status = dapl_os_wait_object_wait (&hca_ptr->ib_trans.wait_object,5000000);
-	 
-	if ((dat_status != DAT_SUCCESS ) || (!ipv4_addr->sin_addr.s_addr)) 
+	dapl_dbg_log (DAPL_DBG_TYPE_UTIL, 
+		      " get_hca_addr: ips_by_gid ret %d at_rec %p -> id %lld\n",
+			status, &at_rec, at_rec.req_id );
+
+        if (status > 0) {
+                dapli_ip_comp_handler(at_rec.req_id, (void*)&at_rec, status);
+	} else {
+		dat_status = dapl_os_wait_object_wait(&hca_ptr->ib_trans.wait_object,500000);
+		/* wait did not succeed: cancel the outstanding request */
+		if (dat_status != DAT_SUCCESS)
+			ib_at_cancel(at_rec.req_id);
+	}
+
+	if (!ipv4_addr->sin_addr.s_addr) 
 		return 1;
 		
 	return 0;
@@ -252,6 +265,13 @@ DAT_RETURN dapls_ib_open_hca (
 			      ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
 		goto bail;
 	}
+
+	dapl_dbg_log(DAPL_DBG_TYPE_CM,
+		     " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
+		     hca_ptr->ib_trans.lid,
+		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+
 	/* get the IP address of the device */
 	if (dapli_get_hca_addr(hca_ptr)) {
 		dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
@@ -282,11 +302,6 @@ DAT_RETURN dapls_ib_open_hca (
 		      ((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
 		      hca_ptr->ib_trans.max_inline_send );
 
-	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
-		     hca_ptr->ib_trans.lid,
-		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
-		     (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
 
 	return DAT_SUCCESS;
 
Index: dapl/openib/dapl_ib_cm.c
===================================================================
--- dapl/openib/dapl_ib_cm.c	(revision 2970)
+++ dapl/openib/dapl_ib_cm.c	(working copy)
@@ -158,19 +158,49 @@ void dapli_at_thread_destroy(void)
 void dapli_ip_comp_handler(uint64_t req_id, void *context, int rec_num)
 {
 	struct dapl_at_record	*at_rec = context;
+	struct sockaddr_in      *ipv4_addr = (struct sockaddr_in*)at_rec->addr;
+	int			status;
 
 	dapl_dbg_log(DAPL_DBG_TYPE_CM,
-		     " ip_comp_handler: ctxt %p, req_id %lld rec_num %d\n",
-		     context, req_id, rec_num);
+		     " ip_comp_handler: at_rec %p ->id %lld id %lld rec_num %d %x\n",
+		     context, at_rec->req_id, req_id, rec_num,
+		     ipv4_addr->sin_addr.s_addr);
+
+        if (rec_num <= 0) {
+		struct ib_at_completion at_comp;
+
+                dapl_dbg_log(DAPL_DBG_TYPE_CM,
+			     " ip_comp_handler: resolution err %d retry %d\n",
+			     rec_num, at_rec->retries + 1);
+
+                if (++at_rec->retries > IB_MAX_AT_RETRY) 
+                        goto bail;
+
+		at_comp.fn = dapli_ip_comp_handler;
+		at_comp.context = at_rec;
+		ipv4_addr->sin_addr.s_addr = 0;
+
+		status = ib_at_ips_by_gid(&at_rec->hca_ptr->ib_trans.gid, 
+					  &ipv4_addr->sin_addr.s_addr, 1,
+					  &at_comp, &at_rec->req_id);
+		if (status < 0) 
+			goto bail;
+
+		dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
+			      " ip_comp_handler: NEW ips_by_gid ret %d at_rec %p -> id %lld\n",
+			      status, at_rec, at_rec->req_id );
+        } 
 
-	if ((at_rec) && ( at_rec->req_id == req_id)) {
+	if (ipv4_addr->sin_addr.s_addr)
 		dapl_os_wait_object_wakeup(at_rec->wait_object);
-		return;
-	}
-	
-	dapl_dbg_log(DAPL_DBG_TYPE_ERR,
-		     " ip_comp_handler: at_rec->req_id %lld != req_id %lld\n",
-		     at_rec->req_id, req_id );
+
+	return;
+bail:
+	dapl_dbg_log(DAPL_DBG_TYPE_CM,
+		     " ip_comp_handler: ERR: at_rec  %p, req_id %lld rec_num %d\n",
+		     at_rec, req_id, rec_num);
+
+	dapl_os_wait_object_wakeup(at_rec->wait_object);
 }
 
 static void dapli_path_comp_handler(uint64_t req_id, void *context, int rec_num)
@@ -622,20 +652,21 @@ void cm_thread(void *arg) 
 
 		dapl_os_unlock(&g_cm_lock);
                 ret = poll(&ufds, 1, -1); 
-		if ((ret <= 0) || (g_cm_destroy)) {
+		if (ret <= 0) {
 			dapl_dbg_log(DAPL_DBG_TYPE_CM,
 				     " cm_thread(%d): ERR %s poll\n",
 				     getpid(),strerror(errno));
                 	dapl_os_lock(&g_cm_lock);
-			break;
+			continue;
 		}
 
 		dapl_dbg_log(DAPL_DBG_TYPE_CM,
 			" cm_thread: GET EVENT fd=%d n=%d\n",
 			ib_cm_get_fd(),ret);
+
 		if (ib_cm_event_get_timed(0,&event)) { 
 			dapl_dbg_log(DAPL_DBG_TYPE_CM,
-				" cm_thread: ERR %s eventi_get on %d\n", 
+				" cm_thread: ERR %s event_get on %d\n", 
 				strerror(errno), ib_cm_get_fd() );
 			dapl_os_lock(&g_cm_lock);
 			continue;
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h	(revision 2970)
+++ dapl/openib/dapl_ib_util.h	(working copy)
@@ -97,6 +97,8 @@ struct dapl_at_record {
 	uint64_t                req_id;
 	DAT_SOCK_ADDR6		*addr;
 	DAPL_OS_WAIT_OBJECT	*wait_object;
+	struct dapl_hca		*hca_ptr;
+	int			retries;
 };
 
 /* 






More information about the general mailing list