[ofa-general] [PATCH] perftest Add rdma_cm retries

davem at systemfabricworks.com davem at systemfabricworks.com
Fri Jul 24 11:01:58 PDT 2009


Here is version 3 of the patch.  Between Steve's and Sean's comments, it
seems there is no universally accepted answer, which is why it would be
nice if the underlying system could provide good defaults for user-mode
programs.  However, that isn't here yet, and I am not prepared to
try to create such a thing, so I have redone this patch to allow
command-line specification of the timeout values and retry counts.  The
default timeout values are the same as in the original code (2000 ms),
and the retry counts both default to 10.

Dave


  The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in
  some fabrics.  This adds command-line options to set the number of retries
  for each of the calls, with a default of 10; a call is retried when the CM
  reports RDMA_CM_EVENT_ADDR_ERROR or RDMA_CM_EVENT_ROUTE_ERROR.  Since there
  may be cases where larger timeouts are desired, probably along with fewer
  retries, this patch also adds the ability to specify the timeout values on
  the command line.  Even if none of the command-line options are given, the
  calls are now retried, so the tests no longer fail in larger and busier
  fabrics.

Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
---
 rdma_bw.c  |   52 ++++++++++++++++++++++++++++++++++++++++++++++++-
 rdma_lat.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 4 deletions(-)

diff --git a/rdma_bw.c b/rdma_bw.c
index 2628ac4..14ff80b 100755
--- a/rdma_bw.c
+++ b/rdma_bw.c
@@ -61,6 +61,10 @@
 #define PINGPONG_RDMA_WRID	3
 
 static int sl = 0;
+static int addr_timeout = 2000;
+static int addr_retries = 10;
+static int route_timeout = 2000;
+static int route_retries = 10;
 static int page_size;
 static pid_t pid;
 
@@ -152,8 +156,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
 		sin.sin_family = AF_INET;
 		sin.sin_port = htons(data->port);
+retry_addr:
 		if (rdma_resolve_addr(data->cm_id, NULL,
-					 (struct sockaddr *)&sin, 2000)) {
+				      (struct sockaddr *)&sin, addr_timeout)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
 					 pid, __func__ );
 			goto err2;
@@ -162,6 +167,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event)) 
 			goto err2;
 
+
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		 && addr_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_addr;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 				pid, __func__, event->event);
@@ -169,7 +181,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		}
 		rdma_ack_cm_event(event);
 	
-		if (rdma_resolve_route(data->cm_id, 2000)) {
+retry_route:
+		if (rdma_resolve_route(data->cm_id, route_timeout)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
 						pid, __func__);
 			goto err2;
@@ -178,6 +191,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event))
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		 && route_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 					pid, __func__, event->event);
@@ -863,6 +882,10 @@ static void usage(const char *argv0)
 	printf("  -S, --sl=<sl>          SL (default 0)\n");
 	printf("  -b, --bidirectional    measure bidirectional bandwidth (default unidirectional)\n");
 	printf("  -c, --cma		 use RDMA CM\n");
+	printf("  --addr-timeout=<ms>	 RDMA CM resolve_addr timeout ms (default 2000)\n");
+	printf("  --addr-retries=<num>	 RDMA CM resolve_addr retry count (default 10)\n");
+	printf("  --route-timeout=<ms>	 RDMA CM resolve_route timeout ms (default 2000)\n");
+	printf("  --route-retries=<num>	 RDMA CM resolve_route retry count (default 10)\n");
 }
 
 static void print_report(unsigned int iters, unsigned size, int duplex,
@@ -949,6 +972,10 @@ int main(int argc, char *argv[])
 			{ .name = "sl",             .has_arg = 1, .val = 'S' },
 			{ .name = "bidirectional",  .has_arg = 0, .val = 'b' },
 			{ .name = "cma", 	    .has_arg = 0, .val = 'c' },
+			{ .name = "addr-timeout",   .has_arg = 1, .val = 1 },
+			{ .name = "addr-retries",   .has_arg = 1, .val = 2 },
+			{ .name = "route-timeout",  .has_arg = 1, .val = 3 },
+			{ .name = "route-retries",  .has_arg = 1, .val = 4 },
 			{ 0 }
 		};
 
@@ -1011,6 +1038,27 @@ int main(int argc, char *argv[])
 		case 'c':
 			data.use_cma = 1;
 			break;
+
+		case 1:
+			addr_timeout = strtol(optarg, NULL, 0);
+			if (addr_timeout <= 0) { usage(argv[0]); return 1; }
+			break;
+
+		case 2:
+			addr_retries = strtol(optarg, NULL, 0);
+			if (addr_retries < 0) { usage(argv[0]); return 1; }
+			break;
+
+		case 3:
+			route_timeout = strtol(optarg, NULL, 0);
+			if (route_timeout <= 0) { usage(argv[0]); return 1; }
+			break;
+
+		case 4:
+			route_retries = strtol(optarg, NULL, 0);
+			if (route_retries < 0) { usage(argv[0]); return 1; }
+			break;
+
 		default:
 			usage(argv[0]);
 			return 1;
diff --git a/rdma_lat.c b/rdma_lat.c
index 3681b35..cb5a6e4 100755
--- a/rdma_lat.c
+++ b/rdma_lat.c
@@ -63,6 +63,10 @@
 
 static int inline_size = MAX_INLINE;
 static int sl = 0;
+static int addr_timeout = 2000;
+static int addr_retries = 10;
+static int route_timeout = 2000;
+static int route_retries = 10;
 static int page_size;
 static pid_t pid;
 
@@ -228,8 +232,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
 		sin.sin_family = AF_INET;
 		sin.sin_port = htons(data->port);
+retry_addr:
 		if (rdma_resolve_addr(data->cm_id, NULL,
-					 (struct sockaddr *)&sin, 2000)) {
+				      (struct sockaddr *)&sin, addr_timeout)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
 					 pid, __func__ );
 			goto err2;
@@ -238,6 +243,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event)) 
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
+		 && addr_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_addr;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 				pid, __func__, event->event);
@@ -245,7 +256,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		}
 		rdma_ack_cm_event(event);
 	
-		if (rdma_resolve_route(data->cm_id, 2000)) {
+retry_route:
+		if (rdma_resolve_route(data->cm_id, route_timeout)) {
 			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
 						pid, __func__);
 			goto err2;
@@ -254,6 +266,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
 		if (rdma_get_cm_event(data->cm_channel, &event))
 			goto err2;
 
+		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
+		 && route_retries-- > 0) {
+			rdma_ack_cm_event(event);
+			goto retry_route;
+		}
+
 		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
 					pid, __func__, event->event);
@@ -929,6 +947,10 @@ static void usage(const char *argv0)
 	printf("  -H, --report-histogram print out all results (default print summary only)\n");
 	printf("  -U, --report-unsorted  (implies -H) print out unsorted results (default sorted)\n");
 	printf("  -c, --cma              Use the RDMA CMA to setup the RDMA connection\n");
+	printf("  --addr-timeout=<ms>	 RDMA CM resolve_addr timeout ms (default 2000)\n");
+	printf("  --addr-retries=<num>	 RDMA CM resolve_addr retry count (default 10)\n");
+	printf("  --route-timeout=<ms>	 RDMA CM resolve_route timeout ms (default 2000)\n");
+	printf("  --route-retries=<num>	 RDMA CM resolve_route retry count (default 10)\n");
 }
 
 /*
@@ -1052,6 +1074,10 @@ int main(int argc, char *argv[])
 			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
 			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
 			{ .name = "cma", 	    .has_arg = 0, .val = 'c' },
+			{ .name = "addr-timeout",   .has_arg = 1, .val = 1 },
+			{ .name = "addr-retries",   .has_arg = 1, .val = 2 },
+			{ .name = "route-timeout",  .has_arg = 1, .val = 3 },
+			{ .name = "route-retries",  .has_arg = 1, .val = 4 },
 			{ 0 }
 		};
 
@@ -1123,6 +1149,38 @@ int main(int argc, char *argv[])
 				data.use_cma = 1;
 				break;
 
+			case 1:
+				addr_timeout = strtol(optarg, NULL, 0);
+				if (addr_timeout <= 0) {
+					usage(argv[0]);
+					return 7;
+				}
+				break;
+
+			case 2:
+				addr_retries = strtol(optarg, NULL, 0);
+				if (addr_retries < 0) {
+					usage(argv[0]);
+					return 7;
+				}
+				break;
+
+			case 3:
+				route_timeout = strtol(optarg, NULL, 0);
+				if (route_timeout <= 0) {
+					usage(argv[0]);
+					return 7;
+				}
+				break;
+
+			case 4:
+				route_retries = strtol(optarg, NULL, 0);
+				if (route_retries < 0) {
+					usage(argv[0]);
+					return 7;
+				}
+				break;
+
 			default:
 				usage(argv[0]);
 				return 7;



More information about the general mailing list