[ofw] [PATCH 5/9] dapl-2.0: scm: SOCKOPT ERR Connection timed out on large clusters

Davis, Arlin R arlin.r.davis at intel.com
Wed May 19 11:25:21 PDT 2010


With large-scale all-to-all connections on 1500+ cores,
the listen backlog is reached and SYNs are dropped,
which causes the connect to time out. Retry the connect
on timeout errors.

Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
 dapl/openib_scm/cm.c |   24 ++++++++++++++++++++++--
 1 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 7465190..4c8d4a1 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -60,6 +60,12 @@
 #include "dapl_ep_util.h"
 #include "dapl_osd.h"
 
+/* forward declarations */
+static DAT_RETURN
+dapli_socket_connect(DAPL_EP * ep_ptr,
+		     DAT_IA_ADDRESS_PTR r_addr,
+		     DAT_CONN_QUAL r_qual, DAT_COUNT p_size, DAT_PVOID p_data);
+
 #ifdef DAPL_DBG
 /* Check for EP linking to IA and proper connect state */
 void dapli_ep_check(DAPL_EP *ep)
@@ -494,13 +500,27 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
 
 	if (err) {
 		dapl_log(DAPL_DBG_TYPE_ERR,
-			 " CONN_PENDING: %s ERR %s -> %s %d\n",
+			 " CONN_PENDING: %s ERR %s -> %s %d - %s\n",
 			 err == -1 ? "POLL" : "SOCKOPT",
 			 err == -1 ? strerror(dapl_socket_errno()) : strerror(err), 
 			 inet_ntoa(((struct sockaddr_in *)
 				&cm_ptr->addr)->sin_addr), 
 			 ntohs(((struct sockaddr_in *)
-				&cm_ptr->addr)->sin_port));
+				&cm_ptr->addr)->sin_port),
+			 err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+
+		/* retry a timeout */
+		if (err == ETIMEDOUT) {
+			closesocket(cm_ptr->socket);
+			cm_ptr->socket = DAPL_INVALID_SOCKET;
+			dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr, 
+					     ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,
+					     ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data);
+			dapl_ep_unlink_cm(cm_ptr->ep, cm_ptr);
+			dapli_cm_free(cm_ptr);
+			return;
+		}
+
 		goto bail;
 	}
 
-- 
1.5.2.5




More information about the ofw mailing list