[ofa-general] [PATCH] perftest Add rdma_cm retries

David McMillen davem at systemfabricworks.com
Thu Jul 23 19:42:09 PDT 2009


On Thu, Jul 23, 2009 at 8:29 PM, Steve Wise <swise at opengridcomputing.com> wrote:

> Can't you just up the value passed into rdma_resolve_addr()?  Currently
> this code passes in 2000 (ms).  Did you try changing this to say 20000?


I didn't try that.  Timeouts on rdma_resolve_addr are much rarer than on
rdma_resolve_route, so test cases are harder to come by.  I did want to
offer a solution that seemed to work.

I have not looked at every code path for every possible subsystem that
rdma_cm will use.  I don't even have a good basis for knowing that any
particular timeout value is appropriate.  It would be nice if there were some
way to get that information for a particular instance of an rdma_cm_id.  The
same goes for the retry mechanism - is it worthwhile to retry, and how many
times is enough?  The values in this patch happen to work for the InfiniBand
fabrics I use, but my experience is limited.

Are you saying that one rdma_resolve_addr with a 20,000 ms timeout is as
good as (or maybe even better than) 10 repeats of failed calls using 2,000 ms
timeouts?  If that is true, and always will be for any fabric rdma_cm uses,
then it seems obvious that we should just change the timeout and not do the
retry.

Thanks for thinking about this problem.

Dave



>
>
> davem at systemfabricworks.com wrote:
>
>>  The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen
>> in
>>  some fabrics, and there is no retry in rdma_cm so this patch adds process
>>  level retries.  A combined total of 10 retries for the pair is allowed.
>>
>> Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
>> ---
>>  rdma_bw.c  |   16 ++++++++++++++++
>>  rdma_lat.c |   15 +++++++++++++++
>>  2 files changed, 31 insertions(+), 0 deletions(-)
>>
>> diff --git a/rdma_bw.c b/rdma_bw.c
>> index 2628ac4..737558a 100755
>> --- a/rdma_bw.c
>> +++ b/rdma_bw.c
>> @@ -131,6 +131,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>        char *service;
>>        int n;
>>        int sockfd = -1;
>> +       int n_retries = 10;
>>        struct rdma_cm_event *event;
>>        struct sockaddr_in sin;
>>        struct pingpong_context *ctx = NULL;
>> @@ -152,6 +153,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                sin.sin_addr.s_addr = ((struct
>> sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>>                sin.sin_family = AF_INET;
>>                sin.sin_port = htons(data->port);
>> +retry_addr:
>>                if (rdma_resolve_addr(data->cm_id, NULL,
>>                                         (struct sockaddr *)&sin, 2000)) {
>>                        fprintf(stderr, "%d:%s: rdma_resolve_addr
>> failed\n",
>> @@ -162,6 +164,13 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                if (rdma_get_cm_event(data->cm_channel, &event))
>>              goto err2;
>>  +
>> +               if (event->event == RDMA_CM_EVENT_ADDR_ERROR
>> +                && n_retries-- > 0) {
>> +                       rdma_ack_cm_event(event);
>> +                       goto retry_addr;
>> +               }
>> +
>>                if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>>                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",
>>                                pid, __func__, event->event);
>> @@ -169,6 +178,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                }
>>                rdma_ack_cm_event(event);
>>
>> +retry_route:
>>                if (rdma_resolve_route(data->cm_id, 2000)) {
>>                        fprintf(stderr, "%d:%s: rdma_resolve_route
>> failed\n",                                                pid, __func__);
>> @@ -178,6 +188,12 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                if (rdma_get_cm_event(data->cm_channel, &event))
>>                        goto err2;
>>  +               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
>> +                && n_retries-- > 0) {
>> +                       rdma_ack_cm_event(event);
>> +                       goto retry_route;
>> +               }
>> +
>>                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>>                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",
>>                                        pid, __func__, event->event);
>> diff --git a/rdma_lat.c b/rdma_lat.c
>> index 3681b35..1f65086 100755
>> --- a/rdma_lat.c
>> +++ b/rdma_lat.c
>> @@ -207,6 +207,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>        char *service;
>>        int n;
>>        int sockfd = -1;
>> +       int n_retries = 10;
>>        struct rdma_cm_event *event;
>>        struct sockaddr_in sin;
>>        struct pingpong_context *ctx = NULL;
>> @@ -228,6 +229,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                sin.sin_addr.s_addr = ((struct
>> sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>>                sin.sin_family = AF_INET;
>>                sin.sin_port = htons(data->port);
>> +retry_addr:
>>                if (rdma_resolve_addr(data->cm_id, NULL,
>>                                         (struct sockaddr *)&sin, 2000)) {
>>                        fprintf(stderr, "%d:%s: rdma_resolve_addr
>> failed\n",
>> @@ -238,6 +240,12 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                if (rdma_get_cm_event(data->cm_channel, &event))
>>              goto err2;
>>  +               if (event->event == RDMA_CM_EVENT_ADDR_ERROR
>> +                && n_retries-- > 0) {
>> +                       rdma_ack_cm_event(event);
>> +                       goto retry_addr;
>> +               }
>> +
>>                if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>>                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",
>>                                pid, __func__, event->event);
>> @@ -245,6 +253,7 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                }
>>                rdma_ack_cm_event(event);
>>
>> +retry_route:
>>                if (rdma_resolve_route(data->cm_id, 2000)) {
>>                        fprintf(stderr, "%d:%s: rdma_resolve_route
>> failed\n",                                                pid, __func__);
>> @@ -254,6 +263,12 @@ static struct pingpong_context
>> *pp_client_connect(struct pp_data *data)
>>                if (rdma_get_cm_event(data->cm_channel, &event))
>>                        goto err2;
>>  +               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
>> +                && n_retries-- > 0) {
>> +                       rdma_ack_cm_event(event);
>> +                       goto retry_route;
>> +               }
>> +
>>                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>>                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",
>>                                        pid, __func__, event->event);
>> _______________________________________________
>> general mailing list
>> general at lists.openfabrics.org
>> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>>
>> To unsubscribe, please visit
>> http://openib.org/mailman/listinfo/openib-general
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20090723/5e70bb99/attachment.html>


More information about the general mailing list