[ofa-general] [PATCH] perftest Add rdma_cm retries

davem at systemfabricworks.com davem at systemfabricworks.com
Wed Jul 22 16:12:23 PDT 2009


  Timeouts in rdma_resolve_addr and rdma_resolve_route do occur on some
  fabrics, and rdma_cm does not retry on its own, so this patch adds
  process-level retries.  A combined total of 10 retries, shared between the
  two calls, is allowed.

Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
---
 rdma_bw.c  |   16 ++++++++++++++++
 rdma_lat.c |   15 +++++++++++++++
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/rdma_bw.c b/rdma_bw.c
index 2628ac4..737558a 100755
--- a/rdma_bw.c
+++ b/rdma_bw.c
@@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 	char *service;
 	int n;
 	int sockfd = -1;
+	int n_retries = 10;
 	struct rdma_cm_event *event;
 	struct sockaddr_in sin;
 	struct pingpong_context *ctx = NULL;
@@ -152,6 +153,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
 		sin.sin_family = AF_INET;
 		sin.sin_port = htons(data->port);
+retry_addr:
 		if (rdma_resolve_addr(data->cm_id, NULL,
 					 (struct sockaddr *)&sin, 2000)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
@@ -162,6 +164,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event)) 
 			goto err2;
 
+
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		 && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_addr;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 				pid, __func__, event->event);
@@ -169,6 +178,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		}
 		rdma_ack_cm_event(event);
 	
+retry_route:
 		if (rdma_resolve_route(data->cm_id, 2000)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
 						pid, __func__);
@@ -178,6 +188,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event))
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		 && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 					pid, __func__, event->event);
diff --git a/rdma_lat.c b/rdma_lat.c
index 3681b35..1f65086 100755
--- a/rdma_lat.c
+++ b/rdma_lat.c
@@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 	char *service;
 	int n;
 	int sockfd = -1;
+	int n_retries = 10;
 	struct rdma_cm_event *event;
 	struct sockaddr_in sin;
 	struct pingpong_context *ctx = NULL;
@@ -228,6 +229,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
 		sin.sin_family = AF_INET;
 		sin.sin_port = htons(data->port);
+retry_addr:
 		if (rdma_resolve_addr(data->cm_id, NULL,
 					 (struct sockaddr *)&sin, 2000)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
@@ -238,6 +240,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event)) 
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		 && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_addr;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 				pid, __func__, event->event);
@@ -245,6 +253,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		}
 		rdma_ack_cm_event(event);
 	
+retry_route:
 		if (rdma_resolve_route(data->cm_id, 2000)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
 						pid, __func__);
@@ -254,6 +263,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event))
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		 && n_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 					pid, __func__, event->event);



More information about the general mailing list