[ofa-general] [PATCH 3 OF 5 v2] libmlx4: avoid adding unneeded extra CQE when creating a cq

Jack Morgenstein jackm at dev.mellanox.co.il
Sun Oct 28 00:51:38 PDT 2007


commit c04463eb343a0f038eb7a2a877be90cd3e3e19a3
Author: Jack Morgenstein <jackm at mellanox.co.il>
Date:   Thu Oct 25 19:17:42 2007 +0200

    Do not add an extra CQE when creating a CQ.
    Sanity-check against returned device capabilities,
    to avoid breaking ABI.
    Set minimum to 2, to avoid rejection by kernel.
    
    Adjust num cqes passed to verbs layer.
    
    Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

---
Roland,
The previous patch neglected to increase the number of CQEs returned
to the verbs-layer caller by 1. If the mlx4 layer was invoked with a
power of 2, the returned value was <power-of-2> - 1 (e.g., a request
for 128 CQEs was reported back as 127), which does not conform to the
IB spec.
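
To make the spec requirement concrete from the application's side,
here is a rough sketch (illustration only -- not part of the patch --
using plain libibverbs calls; cq_size_ok is just a made-up name):

#include <stdio.h>
#include <infiniband/verbs.h>

/* The IB spec says the CQ actually created must hold at least as many
 * entries as requested, so the cqe value reported back by
 * ibv_create_cq() must satisfy cq->cqe >= requested.
 */
static int cq_size_ok(struct ibv_context *ctx, int requested)
{
	struct ibv_cq *cq;
	int ok;

	cq = ibv_create_cq(ctx, requested, NULL, NULL, 0);
	if (!cq)
		return -1;

	printf("requested %d CQEs, got %d\n", requested, cq->cqe);
	ok = (cq->cqe >= requested);

	ibv_destroy_cq(cq);
	return ok;
}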

This patch fixes that oversight.  In order to preserve the ABI, the
corresponding kernel patch still returns <power-of-2> - 1; however,
the user layer can determine whether the kernel has the no-spare-CQE
fix by checking whether the device capability max_cqes is a power of
2 -- if so, create_cq() can increment the returned cqe value by 1.
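
As an aside, the detection in the patch compares max_cqe with
align_queue_size(max_cqe); since align_queue_size() rounds up to the
next power of 2, this comparison is simply a power-of-2 test. A
hypothetical standalone helper doing the same check would be:

#include <stdint.h>

/* x is a power of 2 exactly when rounding it up to the next power of 2
 * (what align_queue_size() does) leaves it unchanged; for x > 0 this
 * reduces to the classic bit test.
 */
static inline int is_power_of_two(uint32_t x)
{
	return x && !(x & (x - 1));
}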

It's possible that this increment can be done unconditionally
(i.e., even with an older kernel driver installed) -- I have not yet
checked this.

- Jack

diff --git a/src/cq.c b/src/cq.c
index c0d7a8b..aac84da 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -114,10 +114,10 @@ static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry)
 
 static void *get_sw_cqe(struct mlx4_cq *cq, int n)
 {
-	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe);
+	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->cqe_mask);
 
 	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
-		!!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe;
+		!!(n & (cq->cqe_mask + 1))) ? NULL : cqe;
 }
 
 static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq)
@@ -417,7 +417,7 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 	 * from our QP and therefore don't need to be checked.
 	 */
 	for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
-		if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
+		if (prod_index == cq->cons_index + cq->cqe_mask)
 			break;
 
 	/*
@@ -425,7 +425,7 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 	 * that match our QP by copying older entries on top of them.
 	 */
 	while ((int) --prod_index - (int) cq->cons_index >= 0) {
-		cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
+		cqe = get_cqe(cq, prod_index & cq->cqe_mask);
 		if (is_xrc_srq &&
 		    (ntohl(cqe->g_mlpath_rqpn & 0xffffff) == srq->srqn) &&
 		    !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK)) {
@@ -436,7 +436,7 @@ void mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq)
 				mlx4_free_srq_wqe(srq, ntohs(cqe->wqe_index));
 			++nfreed;
 		} else if (nfreed) {
-			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe);
+			dest = get_cqe(cq, (prod_index + nfreed) & cq->cqe_mask);
 			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
 			memcpy(dest, cqe, sizeof *cqe);
 			dest->owner_sr_opcode = owner_bit |
diff --git a/src/mlx4.h b/src/mlx4.h
index 09e2bdd..707061b 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -216,6 +216,7 @@ struct mlx4_cq {
 	uint32_t		       *set_ci_db;
 	uint32_t		       *arm_db;
 	int				arm_sn;
+	uint32_t			cqe_mask;
 };
 
 struct mlx4_srq {
diff --git a/src/verbs.c b/src/verbs.c
index 059b534..d2a15d5 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -168,11 +168,22 @@ struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 	struct mlx4_create_cq_resp resp;
 	struct mlx4_cq		  *cq;
 	int			   ret;
+	struct mlx4_context	  *mctx = to_mctx(context);
+	int			   no_spare_cqe = 0;
 
 	/* Sanity check CQ size before proceeding */
-	if (cqe > 0x3fffff)
+	if (cqe < 1 || cqe > mctx->max_cqe)
 		return NULL;
 
+	/* if max allowable cqes is a power-of-2, no spare cqe fix is in
+	 * the kernel
+	 */
+	if (mctx->max_cqe == align_queue_size(mctx->max_cqe))
+		no_spare_cqe = 1;
+
+	/* raise minimum, to avoid breaking ABI */
+	cqe = (cqe == 1) ? 2 : cqe;
+
 	cq = malloc(sizeof *cq);
 	if (!cq)
 		return NULL;
@@ -182,7 +193,7 @@ struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 	if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE))
 		goto err;
 
-	cqe = align_queue_size(cqe + 1);
+	cqe = align_queue_size(cqe);
 
 	if (mlx4_alloc_buf(&cq->buf, cqe * MLX4_CQ_ENTRY_SIZE,
 			   to_mdev(context->device)->page_size))
@@ -209,6 +220,9 @@ struct ibv_cq *mlx4_create_cq(struct ibv_context *context, int cqe,
 		goto err_db;
 
 	cq->cqn = resp.cqn;
+	cq->cqe_mask = cq->ibv_cq.cqe;
+	if (no_spare_cqe)
+		cq->ibv_cq.cqe++;
 
 	return &cq->ibv_cq;
 


