<br><br><div class="gmail_quote">On Thu, Jul 23, 2009 at 8:29 PM, Steve Wise <span dir="ltr">&lt;<a href="mailto:swise@opengridcomputing.com">swise@opengridcomputing.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
Can't you just up the value passed into rdma_resolve_addr()?  Currently this code passes in 2000 (ms).  Did you try changing this to, say, 20000?</blockquote><div><br>I didn't try that.  Timeouts on rdma_resolve_addr are much rarer than on rdma_resolve_route, so test cases are harder to come by.  I did want to offer a solution that seemed to work.<br>
<br>I have not looked at every code path for every possible subsystem that rdma_cm will use.  I don't even have a good reason to know that any particular timeout value is appropriate.  It would be nice if there were some way to get that information for a particular instance of an rdma_cm_id.  The same goes for the retry mechanism - is it worthwhile to retry, and how many times is enough?  The values in this patch happen to work for the InfiniBand fabrics I use, but my experience is limited.<br>
<br>Are you saying that one rdma_resolve_addr with a 20,000 ms timeout is as good as (or maybe even better than) 10 repeats of failed calls using 2,000 ms timeouts?  If that is true, and always will be for any fabric rdma_cm uses, then it seems obvious that we should just change the timeout and not do the retry.<br>
<br>Thanks for thinking about this problem.<br><br>Dave<br><br> <br></div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;"><br>
<br>
<a href="mailto:davem@systemfabricworks.com" target="_blank">davem@systemfabricworks.com</a> wrote:<br>
<blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;"><div><div></div><div class="h5">
  The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in<br>
  some fabrics, and there is no retry in rdma_cm, so this patch adds<br>
  process-level retries.  A combined total of 10 retries for the pair is allowed.<br>
<br>
Signed-off-by: David A. McMillen &lt;<a href="mailto:davem@systemfabricworks.com" target="_blank">davem@systemfabricworks.com</a>&gt;<br>
---<br>
 rdma_bw.c  |   16 ++++++++++++++++<br>
 rdma_lat.c |   15 +++++++++++++++<br>
 2 files changed, 31 insertions(+), 0 deletions(-)<br>
<br>
diff --git a/rdma_bw.c b/rdma_bw.c<br>
index 2628ac4..737558a 100755<br>
--- a/rdma_bw.c<br>
+++ b/rdma_bw.c<br>
@@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
        char *service;<br>
        int n;<br>
        int sockfd = -1;<br>
+       int n_retries = 10;<br>
        struct rdma_cm_event *event;<br>
        struct sockaddr_in sin;<br>
        struct pingpong_context *ctx = NULL;<br>
@@ -152,6 +153,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;<br>
                sin.sin_family = AF_INET;<br>
                sin.sin_port = htons(data->port);<br>
+retry_addr:<br>
                if (rdma_resolve_addr(data->cm_id, NULL,<br>
                                         (struct sockaddr *)&sin, 2000)) {<br>
                        fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",<br>
@@ -162,6 +164,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                if (rdma_get_cm_event(data->cm_channel, &event))                         goto err2;<br>
 +<br>
+               if (event->event == RDMA_CM_EVENT_ADDR_ERROR<br>
+                && n_retries-- > 0) {<br>
+                       rdma_ack_cm_event(event);<br>
+                       goto retry_addr;<br>
+               }<br>
+<br>
                if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {<br>
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",                                 pid, __func__, event->event);<br>
@@ -169,6 +178,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                }<br>
                rdma_ack_cm_event(event);<br>
        <br>
+retry_route:<br>
                if (rdma_resolve_route(data->cm_id, 2000)) {<br>
                        fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",                                                 pid, __func__);<br>
@@ -178,6 +188,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                if (rdma_get_cm_event(data->cm_channel, &event))<br>
                        goto err2;<br>
 +               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR<br>
+                && n_retries-- > 0) {<br>
+                       rdma_ack_cm_event(event);<br>
+                       goto retry_route;<br>
+               }<br>
+<br>
                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {<br>
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",                                         pid, __func__, event->event);<br>
diff --git a/rdma_lat.c b/rdma_lat.c<br>
index 3681b35..1f65086 100755<br>
--- a/rdma_lat.c<br>
+++ b/rdma_lat.c<br>
@@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
        char *service;<br>
        int n;<br>
        int sockfd = -1;<br>
+       int n_retries = 10;<br>
        struct rdma_cm_event *event;<br>
        struct sockaddr_in sin;<br>
        struct pingpong_context *ctx = NULL;<br>
@@ -228,6 +229,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;<br>
                sin.sin_family = AF_INET;<br>
                sin.sin_port = htons(data->port);<br>
+retry_addr:<br>
                if (rdma_resolve_addr(data->cm_id, NULL,<br>
                                         (struct sockaddr *)&sin, 2000)) {<br>
                        fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",<br>
@@ -238,6 +240,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                if (rdma_get_cm_event(data->cm_channel, &event))                         goto err2;<br>
 +               if (event->event == RDMA_CM_EVENT_ADDR_ERROR<br>
+                && n_retries-- > 0) {<br>
+                       rdma_ack_cm_event(event);<br>
+                       goto retry_addr;<br>
+               }<br>
+<br>
                if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {<br>
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",                                 pid, __func__, event->event);<br>
@@ -245,6 +253,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                }<br>
                rdma_ack_cm_event(event);<br>
        <br>
+retry_route:<br>
                if (rdma_resolve_route(data->cm_id, 2000)) {<br>
                        fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",                                                 pid, __func__);<br>
@@ -254,6 +263,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)<br>
                if (rdma_get_cm_event(data->cm_channel, &event))<br>
                        goto err2;<br>
 +               if (event->event == RDMA_CM_EVENT_ROUTE_ERROR<br>
+                && n_retries-- > 0) {<br>
+                       rdma_ack_cm_event(event);<br>
+                       goto retry_route;<br>
+               }<br>
+<br>
                if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {<br>
                        fprintf(stderr, "%d:%s: unexpected CM event %d\n",                                         pid, __func__, event->event);<br></div></div>
_______________________________________________<br>
general mailing list<br>
<a href="mailto:general@lists.openfabrics.org" target="_blank">general@lists.openfabrics.org</a><br>
<a href="http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general" target="_blank">http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general</a><br>
<br>
To unsubscribe, please visit <a href="http://openib.org/mailman/listinfo/openib-general" target="_blank">http://openib.org/mailman/listinfo/openib-general</a><br>
  <br>
</blockquote>
</blockquote></div><br>