[ofa-general] [PATCH] perftest Add rdma_cm retries

Steve Wise swise at opengridcomputing.com
Thu Jul 23 18:29:16 PDT 2009


Can't you just increase the timeout value passed into rdma_resolve_addr()?
Currently this code passes in 2000 (ms).  Did you try changing it to, say, 20000?

davem at systemfabricworks.com wrote:
>   The timeouts in rdma_resolve_addr() and rdma_resolve_route() actually happen
>   in some fabrics, and there is no retry in rdma_cm, so this patch adds
>   process-level retries.  A combined total of 10 retries for the pair is allowed.
>
> Signed-off-by: David A. McMillen <davem at systemfabricworks.com>
> ---
>  rdma_bw.c  |   16 ++++++++++++++++
>  rdma_lat.c |   15 +++++++++++++++
>  2 files changed, 31 insertions(+), 0 deletions(-)
>
> diff --git a/rdma_bw.c b/rdma_bw.c
> index 2628ac4..737558a 100755
> --- a/rdma_bw.c
> +++ b/rdma_bw.c
> @@ -131,6 +131,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  	char *service;
>  	int n;
>  	int sockfd = -1;
> +	int n_retries = 10;
>  	struct rdma_cm_event *event;
>  	struct sockaddr_in sin;
>  	struct pingpong_context *ctx = NULL;
> @@ -152,6 +153,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>  		sin.sin_family = AF_INET;
>  		sin.sin_port = htons(data->port);
> +retry_addr:
>  		if (rdma_resolve_addr(data->cm_id, NULL,
>  					 (struct sockaddr *)&sin, 2000)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> @@ -162,6 +164,13 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event)) 
>  			goto err2;
>  
> +
> +		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> +		 && n_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_addr;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  				pid, __func__, event->event);
> @@ -169,6 +178,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		}
>  		rdma_ack_cm_event(event);
>  	
> +retry_route:
>  		if (rdma_resolve_route(data->cm_id, 2000)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
>  						pid, __func__);
> @@ -178,6 +188,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event))
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> +		 && n_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_route;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  					pid, __func__, event->event);
> diff --git a/rdma_lat.c b/rdma_lat.c
> index 3681b35..1f65086 100755
> --- a/rdma_lat.c
> +++ b/rdma_lat.c
> @@ -207,6 +207,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  	char *service;
>  	int n;
>  	int sockfd = -1;
> +	int n_retries = 10;
>  	struct rdma_cm_event *event;
>  	struct sockaddr_in sin;
>  	struct pingpong_context *ctx = NULL;
> @@ -228,6 +229,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		sin.sin_addr.s_addr = ((struct sockaddr_in*)res->ai_addr)->sin_addr.s_addr;
>  		sin.sin_family = AF_INET;
>  		sin.sin_port = htons(data->port);
> +retry_addr:
>  		if (rdma_resolve_addr(data->cm_id, NULL,
>  					 (struct sockaddr *)&sin, 2000)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_addr failed\n",
> @@ -238,6 +240,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event)) 
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ADDR_ERROR
> +		 && n_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_addr;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  				pid, __func__, event->event);
> @@ -245,6 +253,7 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		}
>  		rdma_ack_cm_event(event);
>  	
> +retry_route:
>  		if (rdma_resolve_route(data->cm_id, 2000)) {
>  			fprintf(stderr, "%d:%s: rdma_resolve_route failed\n", 
>  						pid, __func__);
> @@ -254,6 +263,12 @@ static struct pingpong_context *pp_client_connect(struct pp_data *data)
>  		if (rdma_get_cm_event(data->cm_channel, &event))
>  			goto err2;
>  
> +		if (event->event == RDMA_CM_EVENT_ROUTE_ERROR
> +		 && n_retries-- > 0) {
> +			rdma_ack_cm_event(event);
> +			goto retry_route;
> +		}
> +
>  		if (event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
>  			fprintf(stderr, "%d:%s: unexpected CM event %d\n", 
>  					pid, __func__, event->event);
> _______________________________________________
> general mailing list
> general at lists.openfabrics.org
> http://lists.openfabrics.org/cgi-bin/mailman/listinfo/general
>
> To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general
>   




More information about the general mailing list