[ewg] Re: [Fwd: [Fwd: [ofa-general] [PATCH] perftest Add rdma_cm retries]]
Ido Shamai
idos at dev.mellanox.co.il
Sun Oct 4 04:40:41 PDT 2009
I tried to apply it again, and this time it worked,
so you can consider my previous message cancelled.
Applied.
Regards ,
Ido Shamay
Ido Shamai wrote:
>
> Hey Dave,
>
> Sorry about the late response; I'm new and it took some time to get
> things in order.
> For some reason the patch doesn't apply cleanly; please check it out.
> I've added the log below.
>
> Thanks Ido
>
>
>
> git am ../patches/patch_no_3.eml
>
> Applying perftest Add rdma_cm retries
>
> error: patch failed: rdma_bw.c:152
> error: rdma_bw.c: patch does not apply
> error: patch failed: rdma_lat.c:228
> error: rdma_lat.c: patch does not apply
> Patch failed at 0001.
> When you have resolved this problem run "git-am --resolved".
> If you would prefer to skip this patch, instead run "git-am --skip".
>
> ------------------------------------------------------------------------
>
> Subject:
> [Fwd: [ofa-general] [PATCH] perftest Add rdma_cm retries]
> From:
> Vladimir Sokolovsky <vlad at dev.mellanox.co.il>
> Date:
> Tue, 08 Sep 2009 16:47:05 +0300
> To:
> idos at dev.mellanox.co.il
>
> To:
> idos at dev.mellanox.co.il
>
>
>
>
> ------------------------------------------------------------------------
>
> Subject:
> [ofa-general] [PATCH] perftest Add rdma_cm retries
> From:
> davem at systemfabricworks.com
> Date:
> Fri, 24 Jul 2009 13:01:58 -0500
> To:
> general at lists.openfabrics.org, orenmeron at dev.mellanox.co.il
>
> To:
> general at lists.openfabrics.org, orenmeron at dev.mellanox.co.il
>
>
> Here is version 3 of the patch. Between Steve and Sean's comments, it
> seems there is no universally accepted answer, which is why it would be
> nice if the underlying system could provide good defaults for the user
> mode programs. However, that isn't here yet, and I am not prepared to
> try to create such a thing, so I have redone this patch to allow command
> line specification of the timeout values and retry counts. The timeout
> values are the same as the original code, and the retry counts are both
> set to 10.
>
> Dave
>
>
> The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in
> some fabrics. This adds command line options to set the number of retries
> for each of the calls, with a default of 10. Since there may be cases
> where larger timeouts are desired, probably along with fewer retries, this
> patch also adds the ability to specify the timeout values on the command
> line. If none of the command line options are chosen, it will now do the
> retries and not fail in the larger and busier fabrics.
>
> Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
> ---
> rdma_bw.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-
> rdma_lat.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 110 insertions(+), 4 deletions(-)
>
> diff --git a/rdma_bw.c b/rdma_bw.c
> index 2628ac4..14ff80b 100755
> --- a/rdma_bw.c
> +++ b/rdma_bw.c
> @@ -61,6 +61,10 @@
> #define PINGPONG_RDMA_WRID 3
>
> static int sl = 0;
> +static int addr_timeout = 2000;
> +static int addr_retries = 10;
> +static int route_timeout = 2000;
> +static int route_retries = 10;
> static int page_size;
> static pid_t pid;
>
> @@ -152,8 +156,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
> sin.sin_family = AF_INET;
> sin.sin_port = htons(data->port);
> +retry_addr:
> if (rdma_resolve_addr(data->cm_id, NULL,
> - (struct sockaddr *)&sin, 2000)) {
> + (struct sockaddr *)&sin, addr_timeout)) {
> fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> pid, __func__ );
> goto err2;
> @@ -162,6 +167,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> +
> + if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> + && addr_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_addr;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -169,7 +181,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> }
> rdma_ack_cm_event(event);
>
> - if (rdma_resolve_route(data->cm_id, 2000)) {
> +retry_route:
> + if (rdma_resolve_route(data->cm_id, route_timeout)) {
> fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
> pid, __func__);
> goto err2;
> @@ -178,6 +191,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> + && route_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_route;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -863,6 +882,10 @@ static void usage(const char *argv0)
> printf(" -S, --sl=<sl> SL (default 0)\n");
> printf(" -b, --bidirectional measure bidirectional bandwidth (default unidirectional)\n");
> printf(" -c, --cma use RDMA CM\n");
> + printf(" --addr-timeout=<ms> RDMA CM resolve_addr timeout ms (default 2000)\n");
> + printf(" --addr-retries=<num> RDMA CM resolve_addr retry count (default 10)\n");
> + printf(" --route-timeout=<ms> RDMA CM resolve_route timeout ms (default 2000)\n");
> + printf(" --route-retries=<num> RDMA CM resolve_route retry count (default 10)\n");
> }
>
> static void print_report(unsigned int iters, unsigned size, int duplex,
> @@ -949,6 +972,10 @@ int main(int argc, char *argv[])
> { .name = "sl", .has_arg = 1, .val = 'S' },
> { .name = "bidirectional", .has_arg = 0, .val = 'b' },
> { .name = "cma", .has_arg = 0, .val = 'c' },
> + { .name = "addr-timeout", .has_arg = 1, .val = 1 },
> + { .name = "addr-retries", .has_arg = 1, .val = 2 },
> + { .name = "route-timeout", .has_arg = 1, .val = 3 },
> + { .name = "route-retries", .has_arg = 1, .val = 4 },
> { 0 }
> };
>
> @@ -1011,6 +1038,27 @@ int main(int argc, char *argv[])
> case 'c':
> data.use_cma = 1;
> break;
> +
> + case 1:
> + addr_timeout = strtol(optarg, NULL, 0);
> + if (addr_timeout <= 0) { usage(argv[0]); return 1; }
> + break;
> +
> + case 2:
> + addr_retries = strtol(optarg, NULL, 0);
> + if (addr_retries < 0) { usage(argv[0]); return 1; }
> + break;
> +
> + case 3:
> + route_timeout = strtol(optarg, NULL, 0);
> + if (route_timeout <= 0) { usage(argv[0]); return 1; }
> + break;
> +
> + case 4:
> + route_retries = strtol(optarg, NULL, 0);
> + if (route_retries < 0) { usage(argv[0]); return 1; }
> + break;
> +
> default:
> usage(argv[0]);
> return 1;
> diff --git a/rdma_lat.c b/rdma_lat.c
> index 3681b35..cb5a6e4 100755
> --- a/rdma_lat.c
> +++ b/rdma_lat.c
> @@ -63,6 +63,10 @@
>
> static int inline_size = MAX_INLINE;
> static int sl = 0;
> +static int addr_timeout = 2000;
> +static int addr_retries = 10;
> +static int route_timeout = 2000;
> +static int route_retries = 10;
> static int page_size;
> static pid_t pid;
>
> @@ -228,8 +232,9 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
> sin.sin_family = AF_INET;
> sin.sin_port = htons(data->port);
> +retry_addr:
> if (rdma_resolve_addr(data->cm_id, NULL,
> - (struct sockaddr *)&sin, 2000)) {
> + (struct sockaddr *)&sin, addr_timeout)) {
> fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> pid, __func__ );
> goto err2;
> @@ -238,6 +243,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> + && addr_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_addr;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -245,7 +256,8 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> }
> rdma_ack_cm_event(event);
>
> - if (rdma_resolve_route(data->cm_id, 2000)) {
> +retry_route:
> + if (rdma_resolve_route(data->cm_id, route_timeout)) {
> fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
> pid, __func__);
> goto err2;
> @@ -254,6 +266,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> + && route_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_route;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -929,6 +947,10 @@ static void usage(const char *argv0)
> printf(" -H, --report-histogram print out all results (default print summary only)\n");
> printf(" -U, --report-unsorted (implies -H) print out unsorted results (default sorted)\n");
> printf(" -c, --cma Use the RDMA CMA to setup the RDMA connection\n");
> + printf(" --addr-timeout=<ms> RDMA CM resolve_addr timeout ms (default 2000)\n");
> + printf(" --addr-retries=<num> RDMA CM resolve_addr retry count (default 10)\n");
> + printf(" --route-timeout=<ms> RDMA CM resolve_route timeout ms (default 2000)\n");
> + printf(" --route-retries=<num> RDMA CM resolve_route retry count (default 10)\n");
> }
>
> /*
> @@ -1052,6 +1074,10 @@ int main(int argc, char *argv[])
> { .name = "report-histogram",.has_arg = 0, .val = 'H' },
> { .name = "report-unsorted",.has_arg = 0, .val = 'U' },
> { .name = "cma", .has_arg = 0, .val = 'c' },
> + { .name = "addr-timeout", .has_arg = 1, .val = 1 },
> + { .name = "addr-retries", .has_arg = 1, .val = 2 },
> + { .name = "route-timeout", .has_arg = 1, .val = 3 },
> + { .name = "route-retries", .has_arg = 1, .val = 4 },
> { 0 }
> };
>
> @@ -1123,6 +1149,38 @@ int main(int argc, char *argv[])
> data.use_cma = 1;
> break;
>
> + case 1:
> + addr_timeout = strtol(optarg, NULL, 0);
> + if (addr_timeout <= 0) {
> + usage(argv[0]);
> + return 7;
> + }
> + break;
> +
> + case 2:
> + addr_retries = strtol(optarg, NULL, 0);
> + if (addr_retries < 0) {
> + usage(argv[0]);
> + return 7;
> + }
> + break;
> +
> + case 3:
> + route_timeout = strtol(optarg, NULL, 0);
> + if (route_timeout <= 0) {
> + usage(argv[0]);
> + return 7;
> + }
> + break;
> +
> + case 4:
> + route_retries = strtol(optarg, NULL, 0);
> + if (route_retries < 0) {
> + usage(argv[0]);
> + return 7;
> + }
> + break;
> +
> default:
> usage(argv[0]);
> return 7;
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>
>
More information about the ewg
mailing list