[ofa-general] [PATCH] perftest Add rdma_cm retries
Steve Wise
swise at opengridcomputing.com
Thu Jul 23 18:29:16 PDT 2009
Can't you just increase the timeout value passed into rdma_resolve_addr()? Currently
this code passes in 2000 (ms). Did you try changing this to, say, 20000?
davem at systemfabricworks.com wrote:
> The timeouts in rdma_resolve_addr and rdma_resolve_route actually happen in
> some fabrics, and there is no retry in rdma_cm so this patch adds process
> level retries. A combined total of 10 retries for the pair is allowed.
>
> Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
> ---
> rdma_bw.c | 16 ++++++++++++++++
> rdma_lat.c | 15 +++++++++++++++
> 2 files changed, 31 insertions(+), 0 deletions(-)
>
> diff --git a/rdma_bw.c b/rdma_bw.c
> index 2628ac4..737558a 100755
> --- a/rdma_bw.c
> +++ b/rdma_bw.c
> @@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> char *service;
> int n;
> int sockfd = -1;
> + int n_retries = 10;
> struct rdma_cm_event *event;
> struct sockaddr_in sin;
> struct pingpong_context *ctx = NULL;
> @@ -152,6 +153,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
> sin.sin_family = AF_INET;
> sin.sin_port = htons(data->port);
> +retry_addr:
> if (rdma_resolve_addr(data->cm_id, NULL,
> (struct sockaddr *)&sin, 2000)) {
> fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> @@ -162,6 +164,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> +
> + if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> + && n_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_addr;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -169,6 +178,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> }
> rdma_ack_cm_event(event);
>
> +retry_route:
> if (rdma_resolve_route(data->cm_id, 2000)) {
> fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
> pid, __func__);
> @@ -178,6 +188,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> + && n_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_route;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> diff --git a/rdma_lat.c b/rdma_lat.c
> index 3681b35..1f65086 100755
> --- a/rdma_lat.c
> +++ b/rdma_lat.c
> @@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> char *service;
> int n;
> int sockfd = -1;
> + int n_retries = 10;
> struct rdma_cm_event *event;
> struct sockaddr_in sin;
> struct pingpong_context *ctx = NULL;
> @@ -228,6 +229,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
> sin.sin_family = AF_INET;
> sin.sin_port = htons(data->port);
> +retry_addr:
> if (rdma_resolve_addr(data->cm_id, NULL,
> (struct sockaddr *)&sin, 2000)) {
> fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> @@ -238,6 +240,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> + && n_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_addr;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> @@ -245,6 +253,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> }
> rdma_ack_cm_event(event);
>
> +retry_route:
> if (rdma_resolve_route(data->cm_id, 2000)) {
> fprintf(stderr, "%d:%s: rdma_resolve_route failed\n",
> pid, __func__);
> @@ -254,6 +263,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
> if (rdma_get_cm_event(data->cm_channel, &event))
> goto err2;
>
> + if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> + && n_retries-- > 0) {
> + rdma_ack_cm_event(event);
> + goto retry_route;
> + }
> +
> if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
> fprintf(stderr, "%d:%s: unexpected CM event %d\n",
> pid, __func__, event->event);
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>
More information about the general
mailing list