[ofa-general] Fwd: [rds-devel] RDS: Fix a bug in RDMA signalling

Vladimir Sokolovsky vlad at dev.mellanox.co.il
Tue May 27 06:31:44 PDT 2008


Olaf Kirch wrote:
> Oops, this should have gone to ofa-general as well.
> 
> Olaf
> ----------  Forwarded Message  ----------
> 
> Subject: [rds-devel] RDS: Fix a bug in RDMA signalling
> Date: Tuesday 27 May 2008
> From: Olaf Kirch <olaf.kirch at oracle.com>
> To: rds-devel at oss.oracle.com
> 
> I found an issue with RDMA completions vs. IB connection teardown
> yesterday - the patch seems correct to me (and passed my testing),
> but I would appreciate a second set of eyeballs, and some additional
> testing (Rick, can you spin the crload wheel with this one, please?)
> 
> Vlad, is there any chance to get this into 1.3.1? I think I remember
> that rc2 (which is out already) was supposed to be the final release
> candidate for 1.3.1 - correct?
> 
> The patch is also available from my tree at
> git://git.openfabrics.org/~okir/ofed_1_3/linux-2.6.git code-drop-20080527
> 
> Olaf
> -------------------
> From: Olaf Kirch <olaf.kirch at oracle.com>
> Subject: RDS: Fix a bug in RDMA signalling
> 
> Code inspection revealed a problem in the way we signal RDMA
> completions to user space when a connection goes down.
> 
> The send CQ handler calls rds_ib_send_unmap_rm, indicating success
> no matter what the WC status code says. This means we may
> signal success for RDMAs that get flushed out with WR_FLUSH_ERR.
> 
> This patch fixes the problem by passing the wc.status value to
> rds_ib_send_unmap_rm for inspection.
> 
> While I was at it, I moved the code that translates WC status codes
> to RDMA notifications from the send CQ handler to rds_ib_send_unmap_rm
> where it belongs.
> 
> Signed-off-by: Olaf Kirch <olaf.kirch at oracle.com>
> ---
>  net/rds/ib_send.c |   39 ++++++++++++++++++++-------------------
>  net/rds/rdma.h    |    2 +-
>  net/rds/send.c    |    3 ++-
>  3 files changed, 23 insertions(+), 21 deletions(-)
> 
> Index: ofa_kernel-1.3.1/net/rds/ib_send.c
> ===================================================================
> --- ofa_kernel-1.3.1.orig/net/rds/ib_send.c
> +++ ofa_kernel-1.3.1/net/rds/ib_send.c
> @@ -41,7 +41,7 @@
>  
>  void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
>  		          struct rds_ib_send_work *send,
> -			  int success)
> +			  int wc_status)
>  {
>  	struct rds_message *rm = send->s_rm;
>  
> @@ -52,7 +52,9 @@ void rds_ib_send_unmap_rm(struct rds_ib_
>  		     DMA_TO_DEVICE);
>  
>  	/* raise rdma completion hwm */
> -	if (rm->m_rdma_op && success) {
> +	if (rm->m_rdma_op && wc_status != IB_WC_WR_FLUSH_ERR) {
> +		int notify_status;
> +
>  		/* If the user asked for a completion notification on this
>  		 * message, we can implement three different semantics:
>  		 *  1.	Notify when we received the ACK on the RDS message
> @@ -68,7 +70,20 @@ void rds_ib_send_unmap_rm(struct rds_ib_
>  		 * don't call rds_rdma_send_complete at all, and fall back to the notify
>  		 * handling in the ACK processing code.
>  		 */
> -		rds_rdma_send_complete(rm);
> +		switch (wc_status) {
> +		case IB_WC_SUCCESS:
> +			notify_status = RDS_RDMA_SUCCESS;
> +			break;
> +
> +		case IB_WC_REM_ACCESS_ERR:
> +			notify_status = RDS_RDMA_REMOTE_ERROR;
> +			break;
> +
> +		default:
> +			notify_status = RDS_RDMA_OTHER_ERROR;
> +			break;
> +		}
> +		rds_rdma_send_complete(rm, notify_status);
>  
>  		if (rm->m_rdma_op->r_write)
>  			rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
> @@ -118,7 +133,7 @@ void rds_ib_send_clear_ring(struct rds_i
>  		if (send->s_wr.opcode == 0xdead)
>  			continue;
>  		if (send->s_rm)
> -			rds_ib_send_unmap_rm(ic, send, 0);
> +			rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
>  		if (send->s_op)
>  			ib_dma_unmap_sg(ic->i_cm_id->device,
>  				send->s_op->r_sg, send->s_op->r_nents,
> @@ -174,7 +189,7 @@ void rds_ib_send_cq_comp_handler(struct 
>  			switch (send->s_wr.opcode) {
>  			case IB_WR_SEND:
>  				if (send->s_rm)
> -					rds_ib_send_unmap_rm(ic, send, 1);
> +					rds_ib_send_unmap_rm(ic, send, wc.status);
>  				break;
>  			case IB_WR_RDMA_WRITE:
>  				if (send->s_op)
> @@ -204,20 +219,6 @@ void rds_ib_send_cq_comp_handler(struct 
>  			oldest = (oldest + 1) % ic->i_send_ring.w_nr;
>  		}
>  
> -		if (unlikely(wc.status != IB_WC_SUCCESS && send->s_op && send->s_op->r_notifier)) {
> -			switch (wc.status) {
> -			default:
> -				send->s_op->r_notifier->n_status = RDS_RDMA_OTHER_ERROR;
> -				break;
> -			case IB_WC_REM_ACCESS_ERR:
> -				send->s_op->r_notifier->n_status = RDS_RDMA_REMOTE_ERROR;
> -				break;
> -			case IB_WC_WR_FLUSH_ERR:
> -				/* flushed out; not an error */
> -				break;
> -			}
> -		}
> -
>  		rds_ib_ring_free(&ic->i_send_ring, completed);
>  
>  		if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
> Index: ofa_kernel-1.3.1/net/rds/rdma.h
> ===================================================================
> --- ofa_kernel-1.3.1.orig/net/rds/rdma.h
> +++ ofa_kernel-1.3.1/net/rds/rdma.h
> @@ -71,6 +71,6 @@ int rds_cmsg_rdma_args(struct rds_sock *
>  int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
>  			  struct cmsghdr *cmsg);
>  void rds_rdma_free_op(struct rds_rdma_op *ro);
> -void rds_rdma_send_complete(struct rds_message *rm);
> +void rds_rdma_send_complete(struct rds_message *rm, int);
>  
>  #endif
> Index: ofa_kernel-1.3.1/net/rds/send.c
> ===================================================================
> --- ofa_kernel-1.3.1.orig/net/rds/send.c
> +++ ofa_kernel-1.3.1/net/rds/send.c
> @@ -361,7 +361,7 @@ int rds_send_acked_before(struct rds_con
>   * the IB send completion on the RDMA op and the accompanying
>   * message.
>   */
> -void rds_rdma_send_complete(struct rds_message *rm)
> +void rds_rdma_send_complete(struct rds_message *rm, int status)
>  {
>  	struct rds_sock *rs = NULL;
>  	struct rds_rdma_op *ro;
> @@ -376,6 +376,7 @@ void rds_rdma_send_complete(struct rds_m
>  		rs = rm->m_rs;
>  		sock_hold(rds_rs_to_sk(rs));
>  
> +		notifier->n_status = status;
>  		spin_lock(&rs->rs_lock);
>  		list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
>  		spin_unlock(&rs->rs_lock);
> 

Applied to OFED-1.3.1 kernel git tree.

Regards,
Vladimir



More information about the general mailing list