[ofw] [PATCH v2] uDAPL v2.0: scm: retry socket connect on ECONNREFUSED under heavy load
Davis, Arlin R
arlin.r.davis at intel.com
Tue Jan 4 20:27:40 PST 2011
>> + if ((err == ETIMEDOUT) || (ECONNREFUSED && --cm_ptr->retry)) {
>
>This is missing 'err ==' on the right-hand side.
Just making sure you are paying attention. Thanks!
---
With large-scale workloads, a Linux server starts rejecting
socket connect requests. Add retry logic for connection-refused
(ECONNREFUSED) errors.
Increasing net.ipv4.tcp_max_syn_backlog to 2048 will also reduce the
chance of these errors when scaling up.
Signed-off-by: Arlin Davis <arlin.r.davis at intel.com>
---
dapl/openib_scm/cm.c | 6 ++++--
dapl/openib_scm/dapl_ib_util.h | 2 ++
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/dapl/openib_scm/cm.c b/dapl/openib_scm/cm.c
index f82d0ff..c638663 100644
--- a/dapl/openib_scm/cm.c
+++ b/dapl/openib_scm/cm.c
@@ -390,6 +390,7 @@ static dp_ib_cm_handle_t dapli_cm_alloc(DAPL_EP *ep_ptr)
cm_ptr->msg.ver = htons(DCM_VER);
cm_ptr->socket = DAPL_INVALID_SOCKET;
+ cm_ptr->retry = SCM_CR_RETRY;
dapls_cm_acquire(cm_ptr);
/* Link EP and CM */
@@ -507,10 +508,11 @@ static void dapli_socket_connected(dp_ib_cm_handle_t cm_ptr, int err)
&cm_ptr->addr)->sin_addr),
ntohs(((struct sockaddr_in *)
&cm_ptr->addr)->sin_port),
- err == ETIMEDOUT ? "RETRYING...":"ABORTING");
+ (err == ETIMEDOUT || err == ECONNREFUSED) ?
+ "RETRYING...":"ABORTING");
/* retry a timeout */
- if (err == ETIMEDOUT) {
+ if ((err == ETIMEDOUT) || (err == ECONNREFUSED && --cm_ptr->retry)) {
closesocket(cm_ptr->socket);
cm_ptr->socket = DAPL_INVALID_SOCKET;
dapli_socket_connect(cm_ptr->ep, (DAT_IA_ADDRESS_PTR)&cm_ptr->addr,
diff --git a/dapl/openib_scm/dapl_ib_util.h b/dapl/openib_scm/dapl_ib_util.h
index 4bb1a4a..5f9fb43 100644
--- a/dapl/openib_scm/dapl_ib_util.h
+++ b/dapl/openib_scm/dapl_ib_util.h
@@ -40,6 +40,7 @@ struct ib_cm_handle
DAPL_OS_LOCK lock;
int ref_count;
int state;
+ int retry;
DAPL_SOCKET socket;
struct dapl_hca *hca;
struct dapl_sp *sp;
@@ -63,6 +64,7 @@ typedef dp_ib_cm_handle_t ib_cm_srvc_handle_t;
#define SCM_ACK_RETRY 7 /* 3 bits, 7 * 268ms = 1.8 seconds */
#define SCM_RNR_TIMER 12 /* 5 bits, 12 =.64ms, 28 =163ms, 31 =491ms */
#define SCM_RNR_RETRY 7 /* 3 bits, 7 == infinite */
+#define SCM_CR_RETRY 5 /* retries for busy server, connect refused */
#define SCM_IB_MTU 2048
/* Global routing defaults */
--
1.7.3
More information about the ofw
mailing list