[ewg] Re: [Fwd: [Fwd: [ofa-general] [PATCH] perftest Add rdma_cm retries]]

Ido Shamai idos at dev.mellanox.co.il
Sun Oct 4 04:40:41 PDT 2009


I tried to apply it again and this time it worked.
So please consider my previous message cancelled.

Applied.

Regards ,
Ido Shamay




Ido Shamai wrote:
>
> Hey Dave ,
>
> Sorry about the late response; I'm new and it took some time to get
> things in order.
> For some reason the patch does not apply; please check it out.
> I've added the log below.
>
> Thanks Ido
>
>
>
> git am ../patches/patch_no_3.eml
>
> Applying perftest Add rdma_cm retries
>
> error: patch failed: rdma_bw.c:152
> error: rdma_bw.c: patch does not apply
> error: patch failed: rdma_lat.c:228
> error: rdma_lat.c: patch does not apply
> Patch failed at 0001.
> When you have resolved this problem run "git-am --resolved".
> If you would prefer to skip this patch, instead run "git-am --skip".
>
> ------------------------------------------------------------------------
>
> Subject:
> [Fwd: [ofa-general] [PATCH] perftest Add rdma_cm retries]
> From:
> Vladimir Sokolovsky <vlad at dev.mellanox.co.il>
> Date:
> Tue, 08 Sep 2009 16:47:05 +0300
> To:
> idos at dev.mellanox.co.il
>
> To:
> idos at dev.mellanox.co.il
>
>
>
>
> ------------------------------------------------------------------------
>
> Subject:
> [ofa-general] [PATCH] perftest Add rdma_cm retries
> From:
> davem at systemfabricworks.com
> Date:
> Fri, 24 Jul 2009 13:01:58 -0500
> To:
> general at lists.openfabrics.org, orenmeron at dev.mellanox.co.il
>
> To:
> general at lists.openfabrics.org, orenmeron at dev.mellanox.co.il
>
>
> Here is version 3 of the patch.  Between Steve and Sean's comments, it
> seems there is no universally accepted answer, which is why it would be
> nice if the underlying system could provide good defaults for the user
> mode programs.  However, that isn't here yet, and I am not prepared to
> try to create such a thing, so I have redone this patch to allow command
> line specification of the timeout values and retry counts.  The timeout
> values are the same as the original code, and the retry counts are both
> set to 10.
>
> Dave
>
>
>   The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in
>   some fabrics.  This adds command line options to set the number of retries
>   for each of the calls, with a default of 10.  Since there may be cases
>   where larger timeouts are desired, probably along with fewer retries, this
>   patch also adds the ability to specify the timeout values on the command
>   line.  If none of the command line options are chosen, it will now do the
>   retries and not fail in the larger and busier fabrics.
>
> Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
> ---
>  rdma_bw.c  |   52 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  rdma_lat.c |   62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/rdma_bw.c b/rdma_bw.c
> index 2628ac4..14ff80b 100755
> --- a/rdma_bw.c
> +++ b/rdma_bw.c
> @@ -61,6 +61,10 @@
>  #define PINGPONG_RDMA_WRID	3
>  
>  static int sl = 0;
> +static int addr_timeout = 2000;
> +static int addr_retries = 10;
> +static int route_timeout = 2000;
> +static int route_retries = 10;
>  static int page_size;
>  static pid_t pid;
>  
> @@ -152,8 +156,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>  		sin.sin_family = AF_INET;
>  		sin.sin_port = htons(data->port);
> +retry_addr:
>  		if (rdma_resolve_addr(data->cm_id, NULL,
> -					 (struct sockaddr *)&sin, 2000)) {
> +				      (struct sockaddr *)&sin, addr_retries)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
>  					 pid, __func__ );
>  			goto err2;
> @@ -162,6 +167,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event)) 
>  			goto err2;
>  
> +
> +		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> +		 && addr_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_addr;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  				pid, __func__, event->event);
> @@ -169,7 +181,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		}
>  		rdma_ack_cm_event(event);
>  	
> -		if (rdma_resolve_route(data->cm_id, 2000)) {
> +retry_route:
> +		if (rdma_resolve_route(data->cm_id, route_timeout)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
>  						pid, __func__);
>  			goto err2;
> @@ -178,6 +191,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event))
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> +		 && route_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_route;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  					pid, __func__, event->event);
> @@ -863,6 +882,10 @@ static void usage(const char *argv0)
>  	printf("  -S, --sl=<sl>          SL (default 0)\n");
>  	printf("  -b, --bidirectional    measure bidirectional bandwidth (default unidirectional)\n");
>  	printf("  -c, --cma		 use RDMA CM\n");
> +	printf("  --addr-timeout=<ms>	 RDMA CM resolve_addr timeout ms (default 2000)\n");
> +	printf("  --addr-retries=<num>	 RDMA CM resolve_addr retry count (default 10)\n");
> +	printf("  --route-timeout=<ms>	 RDMA CM resolve_route timeout ms (default 2000)\n");
> +	printf("  --route-retries=<num>	 RDMA CM resolve_route retry count (default 10)\n");
>  }
>  
>  static void print_report(unsigned int iters, unsigned size, int duplex,
> @@ -949,6 +972,10 @@ int main(int argc, char *argv[])
>  			{ .name = "sl",             .has_arg = 1, .val = 'S' },
>  			{ .name = "bidirectional",  .has_arg = 0, .val = 'b' },
>  			{ .name = "cma", 	    .has_arg = 0, .val = 'c' },
> +			{ .name = "addr-timeout",   .has_arg = 1, .val = 1 },
> +			{ .name = "addr-retries",   .has_arg = 1, .val = 2 },
> +			{ .name = "route-timeout",  .has_arg = 1, .val = 3 },
> +			{ .name = "route-retries",  .has_arg = 1, .val = 4 },
>  			{ 0 }
>  		};
>  
> @@ -1011,6 +1038,27 @@ int main(int argc, char *argv[])
>  		case 'c':
>  			data.use_cma = 1;
>  			break;
> +
> +		case 1:
> +			addr_timeout = strtol(optarg, NULL, 0);
> +			if (addr_timeout <= 0) { usage(argv[0]); return 1; }
> +			break;
> +
> +		case 2:
> +			addr_retries = strtol(optarg, NULL, 0);
> +			if (addr_retries < 0) { usage(argv[0]); return 1; }
> +			break;
> +
> +		case 3:
> +			route_timeout = strtol(optarg, NULL, 0);
> +			if (route_timeout <= 0) { usage(argv[0]); return 1; }
> +			break;
> +
> +		case 4:
> +			route_retries = strtol(optarg, NULL, 0);
> +			if (route_retries < 0) { usage(argv[0]); return 1; }
> +			break;
> +
>  		default:
>  			usage(argv[0]);
>  			return 1;
> diff --git a/rdma_lat.c b/rdma_lat.c
> index 3681b35..cb5a6e4 100755
> --- a/rdma_lat.c
> +++ b/rdma_lat.c
> @@ -63,6 +63,10 @@
>  
>  static int inline_size = MAX_INLINE;
>  static int sl = 0;
> +static int addr_timeout = 2000;
> +static int addr_retries = 10;
> +static int route_timeout = 2000;
> +static int route_retries = 10;
>  static int page_size;
>  static pid_t pid;
>  
> @@ -228,8 +232,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>  		sin.sin_family = AF_INET;
>  		sin.sin_port = htons(data->port);
> +retry_addr:
>  		if (rdma_resolve_addr(data->cm_id, NULL,
> -					 (struct sockaddr *)&sin, 2000)) {
> +				      (struct sockaddr *)&sin, addr_timeout)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
>  					 pid, __func__ );
>  			goto err2;
> @@ -238,6 +243,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event)) 
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> +		 && addr_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_addr;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  				pid, __func__, event->event);
> @@ -245,7 +256,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		}
>  		rdma_ack_cm_event(event);
>  	
> -		if (rdma_resolve_route(data->cm_id, 2000)) {
> +retry_route:
> +		if (rdma_resolve_route(data->cm_id, route_timeout)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
>  						pid, __func__);
>  			goto err2;
> @@ -254,6 +266,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event))
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> +		 && route_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_route;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  					pid, __func__, event->event);
> @@ -929,6 +947,10 @@ static void usage(const char *argv0)
>  	printf("  -H, --report-histogram print out all results (default print summary only)\n");
>  	printf("  -U, --report-unsorted  (implies -H) print out unsorted results (default sorted)\n");
>  	printf("  -c, --cma              Use the RDMA CMA to setup the RDMA connection\n");
> +	printf("  --addr-timeout=<ms>	 RDMA CM resolve_addr timeout ms (default 2000)\n");
> +	printf("  --addr-retries=<num>	 RDMA CM resolve_addr retry count (default 10)\n");
> +	printf("  --route-timeout=<ms>	 RDMA CM resolve_route timeout ms (default 2000)\n");
> +	printf("  --route-retries=<num>	 RDMA CM resolve_route retry count (default 10)\n");
>  }
>  
>  /*
> @@ -1052,6 +1074,10 @@ int main(int argc, char *argv[])
>  			{ .name = "report-histogram",.has_arg = 0, .val = 'H' },
>  			{ .name = "report-unsorted",.has_arg = 0, .val = 'U' },
>  			{ .name = "cma", 	    .has_arg = 0, .val = 'c' },
> +			{ .name = "addr-timeout",   .has_arg = 1, .val = 1 },
> +			{ .name = "addr-retries",   .has_arg = 1, .val = 2 },
> +			{ .name = "route-timeout",  .has_arg = 1, .val = 3 },
> +			{ .name = "route-retries",  .has_arg = 1, .val = 4 },
>  			{ 0 }
>  		};
>  
> @@ -1123,6 +1149,38 @@ int main(int argc, char *argv[])
>  				data.use_cma = 1;
>  				break;
>  
> +			case 1:
> +				addr_timeout = strtol(optarg, NULL, 0);
> +				if (addr_timeout <= 0) {
> +					usage(argv[0]);
> +					return 7;
> +				}
> +				break;
> +
> +			case 2:
> +				addr_retries = strtol(optarg, NULL, 0);
> +				if (addr_retries < 0) {
> +					usage(argv[0]);
> +					return 7;
> +				}
> +				break;
> +
> +			case 3:
> +				route_timeout = strtol(optarg, NULL, 0);
> +				if (route_timeout <= 0) {
> +					usage(argv[0]);
> +					return 7;
> +				}
> +				break;
> +
> +			case 4:
> +				route_retries = strtol(optarg, NULL, 0);
> +				if (route_retries < 0) {
> +					usage(argv[0]);
> +					return 7;
> +				}
> +				break;
> +
>  			default:
>  				usage(argv[0]);
>  				return 7;
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>
>   




More information about the ewg mailing list