[ofw] [PATCH 5/9] dapl-2.0: scm: SOCKOPT ERR Connection timed out on large clusters
Davis, Arlin R
arlin.r.davis at intel.com
Wed May 19 11:25:21 PDT 2010
With large-scale all-to-all connections on 1500+ cores,
the listen backlog is reached and SYNs are dropped,
which causes the connect to time out. Retry the connect
on timeout errors.
Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
dapl/openib_scm/cm.c | 24 ++++++++++++++++++++++--
1 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index 7465190..4c8d4a1 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -60,6 +60,12 @@
#include "dapl_ep_util.h"
#include "dapl_osd.h"
+/* forward declarations */
+static DAT_RETURN
+dapli_socket_connect(DAPL_EP * ep_ptr,
+ DAT_IA_ADDRESS_PTR r_addr,
+ DAT_CONN_QUAL r_qual, DAT_COUNT p_size, DAT_PVOID p_data);
+
#ifdef DAPL_DBG
/* Check for EP linking to IA and proper connect state */
void dapli_ep_check(DAPL_EP *ep)
@@ -494,13 +500,27 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
if (err) {
dapl_log(DAPL_DBG_TYPE_ERR,
- " CONN_PENDING: %s ERR %s -> %s %d\n",
+ " CONN_PENDING: %s ERR %s -> %s %d - %s\n",
err == -1 ? "POLL" : "SOCKOPT",
err == -1 ? strerror(dapl_socket_errno()) : strerror(err),
inet_ntoa(((struct sockaddr_in *)
&cm_ptr->addr)->sin_addr),
ntohs(((struct sockaddr_in *)
- &cm_ptr->addr)->sin_port));
+ &cm_ptr->addr)->sin_port),
+ err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+
+ /* retry a timeout */
+ if (err == ETIMEDOUT) {
+ closesocket(cm_ptr->socket);
+ cm_ptr->socket = DAPL_INVALID_SOCKET;
+ dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr,
+ ntohs(((struct sockaddr_in *)&cm_ptr->addr)->sin_port) - 1000,
+ ntohs(cm_ptr->msg.p_size), &cm_ptr->msg.p_data);
+ dapl_ep_unlink_cm(cm_ptr->ep, cm_ptr);
+ dapli_cm_free(cm_ptr);
+ return;
+ }
+
goto bail;
}
--
1.5.2.5
More information about the ofw
mailing list