[openib-general] [PATCH] uDAPL openib uAT retry fixes
Arlin Davis
arlin.r.davis at intel.com
Wed Aug 3 13:34:46 PDT 2005
James,
Please review the following uDAPL patch. Fixes my broken uAT retry code.
Thanks,
-arlin
Signed-off-by: Arlin Davis <ardavis at ichips.intel.com>
Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c (revision 2970)
+++ dapl/openib/dapl_ib_util.c (working copy)
@@ -128,21 +128,34 @@ int dapli_get_hca_addr( struct dapl_hca
at_comp.context = &at_rec;
at_rec.addr = &hca_ptr->hca_address;
at_rec.wait_object = &hca_ptr->ib_trans.wait_object;
+ at_rec.hca_ptr = hca_ptr;
+ at_rec.retries = 0;
/* call with async_comp until the sync version works */
status = ib_at_ips_by_gid(&hca_ptr->ib_trans.gid, &ipv4_addr->sin_addr.s_addr, 1,
&at_comp, &at_rec.req_id);
- if (status < 0)
+ if (status < 0) {
+ dapl_dbg_log (DAPL_DBG_TYPE_ERR,
+ " get_hca_addr: ERR ips_by_gid %d %s \n",
+ status, strerror(errno));
return 1;
+ }
- if (status > 0)
- dapli_ip_comp_handler(at_rec.req_id, (void*)ipv4_addr, status);
-
- /* wait for answer, 5 seconds max */
- dat_status = dapl_os_wait_object_wait (&hca_ptr->ib_trans.wait_object,5000000);
-
- if ((dat_status != DAT_SUCCESS ) || (!ipv4_addr->sin_addr.s_addr))
+ dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
+ " get_hca_addr: ips_by_gid ret %d at_rec %p -> id %lld\n",
+ status, &at_rec, at_rec.req_id );
+
+ if (status > 0) {
+ dapli_ip_comp_handler(at_rec.req_id, (void*)&at_rec, status);
+ } else {
+ /* no synchronous answer: wait for the completion handler to wake us, cancel the request if the wait fails */
+ dat_status = dapl_os_wait_object_wait(&hca_ptr->ib_trans.wait_object,500000);
+ if (dat_status != DAT_SUCCESS)
+ ib_at_cancel(at_rec.req_id);
+ }
+
+ if (!ipv4_addr->sin_addr.s_addr)
return 1;
return 0;
@@ -252,6 +265,13 @@ DAT_RETURN dapls_ib_open_hca (
ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
goto bail;
}
+
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
+ hca_ptr->ib_trans.lid,
+ (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+ (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+
/* get the IP address of the device */
if (dapli_get_hca_addr(hca_ptr)) {
dapl_dbg_log (DAPL_DBG_TYPE_ERR,
@@ -282,11 +302,6 @@ DAT_RETURN dapls_ib_open_hca (
((struct sockaddr_in *)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
hca_ptr->ib_trans.max_inline_send );
- dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
- hca_ptr->ib_trans.lid,
- (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
- (unsigned long long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
return DAT_SUCCESS;
Index: dapl/openib/dapl_ib_cm.c
===================================================================
--- dapl/openib/dapl_ib_cm.c (revision 2970)
+++ dapl/openib/dapl_ib_cm.c (working copy)
@@ -158,19 +158,49 @@ void dapli_at_thread_destroy(void)
void dapli_ip_comp_handler(uint64_t req_id, void *context, int rec_num)
{
struct dapl_at_record *at_rec = context;
+ struct sockaddr_in *ipv4_addr = (struct sockaddr_in*)at_rec->addr;
+ int status;
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " ip_comp_handler: ctxt %p, req_id %lld rec_num %d\n",
- context, req_id, rec_num);
+ " ip_comp_handler: at_rec %p ->id %lld id %lld rec_num %d %x\n",
+ context, at_rec->req_id, req_id, rec_num,
+ ipv4_addr->sin_addr.s_addr);
+
+ if (rec_num <= 0) {
+ struct ib_at_completion at_comp;
+
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " ip_comp_handler: resolution err %d retry %d\n",
+ rec_num, at_rec->retries + 1);
+
+ if (++at_rec->retries > IB_MAX_AT_RETRY)
+ goto bail;
+
+ at_comp.fn = dapli_ip_comp_handler;
+ at_comp.context = at_rec;
+ ipv4_addr->sin_addr.s_addr = 0;
+
+ status = ib_at_ips_by_gid(&at_rec->hca_ptr->ib_trans.gid,
+ &ipv4_addr->sin_addr.s_addr, 1,
+ &at_comp, &at_rec->req_id);
+ if (status < 0)
+ goto bail;
+
+ dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
+ " ip_comp_handler: NEW ips_by_gid ret %d at_rec %p -> id %lld\n",
+ status, at_rec, at_rec->req_id );
+ }
- if ((at_rec) && ( at_rec->req_id == req_id)) {
+ if (ipv4_addr->sin_addr.s_addr)
dapl_os_wait_object_wakeup(at_rec->wait_object);
- return;
- }
-
- dapl_dbg_log(DAPL_DBG_TYPE_ERR,
- " ip_comp_handler: at_rec->req_id %lld != req_id %lld\n",
- at_rec->req_id, req_id );
+
+ return;
+bail:
+ dapl_dbg_log(DAPL_DBG_TYPE_CM,
+ " ip_comp_handler: ERR: at_rec %p, req_id %lld rec_num %d\n",
+ at_rec, req_id, rec_num);
+
+ dapl_os_wait_object_wakeup(at_rec->wait_object);
}
static void dapli_path_comp_handler(uint64_t req_id, void *context, int rec_num)
@@ -622,20 +652,21 @@ void cm_thread(void *arg)
dapl_os_unlock(&g_cm_lock);
ret = poll(&ufds, 1, -1);
- if ((ret <= 0) || (g_cm_destroy)) {
+ if (ret <= 0) {
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_thread(%d): ERR %s poll\n",
getpid(),strerror(errno));
dapl_os_lock(&g_cm_lock);
- break;
+ continue;
}
dapl_dbg_log(DAPL_DBG_TYPE_CM,
" cm_thread: GET EVENT fd=%d n=%d\n",
ib_cm_get_fd(),ret);
+
if (ib_cm_event_get_timed(0,&event)) {
dapl_dbg_log(DAPL_DBG_TYPE_CM,
- " cm_thread: ERR %s eventi_get on %d\n",
+ " cm_thread: ERR %s event_get on %d\n",
strerror(errno), ib_cm_get_fd() );
dapl_os_lock(&g_cm_lock);
continue;
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h (revision 2970)
+++ dapl/openib/dapl_ib_util.h (working copy)
@@ -97,6 +97,8 @@ struct dapl_at_record {
uint64_t req_id;
DAT_SOCK_ADDR6 *addr;
DAPL_OS_WAIT_OBJECT *wait_object;
+ struct dapl_hca *hca_ptr;
+ int retries;
};
/*
More information about the general
mailing list