[ofa-general] [PATCH 5 of 5 v2] mlx4: Do not allocate an extra (unneeded) CQE when creating a CQ

Jack Morgenstein jackm at dev.mellanox.co.il
Sun Oct 28 00:59:57 PDT 2007


mlx4: Do not allocate an extra (unneeded) CQE when creating a CQ.

The extra CQE can waste a large amount of memory when a power-of-2
number of CQEs is requested: the allocation is then rounded up to the
next power of 2, doubling the buffer for the sake of one spare entry.

The number of CQEs in the CQ that is returned to a kernel caller
is now a power of 2.  The value returned to userspace callers is
the same as before, in order to preserve the ABI.
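
For illustration, here is a minimal sketch of the old and new
kernel-space sizing (roundup_pow_of_two() is the helper the code
already uses; the request size is illustrative):

	int requested = 32768;	/* illustrative power-of-2 request */

	/* Old sizing: the spare CQE forces the next power of 2. */
	entries      = roundup_pow_of_two(requested + 1);	/* 65536 */
	cq->ibcq.cqe = entries - 1;				/* 65535 */

	/* New kernel-space sizing: no spare CQE, no doubling. */
	entries      = roundup_pow_of_two(requested);		/* 32768 */
	cq->ibcq.cqe = entries;					/* 32768 */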

Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

---

Roland,
The previous patch neglected to increase the number of CQEs returned
to the verbs-layer caller by 1.  If the mlx4 layer was invoked with a
power-of-2 request, the returned value was <power-of-2> - 1, which
does not conform to the IB spec.

This patch fixes that oversight.
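
Concretely (reconstructing the v1 logic from the description above;
the request size is illustrative):

	/* v1: a kernel caller requests 256 CQEs (already a power of 2) */
	entries      = roundup_pow_of_two(256);	/* 256 */
	cq->ibcq.cqe = entries - 1;		/* 255 < 256: too small */

	/* v2 (this patch): */
	cq->ibcq.cqe = entries;			/* 256: covers the request */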

In order to preserve the ABI, the kernel still returns <power-of-2> - 1
CQEs to a userspace caller; the adjustment is made in userspace by libmlx4.

- Jack
Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c	2007-10-28 09:34:17.055937000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/cq.c	2007-10-28 09:36:22.457431000 +0200
@@ -80,10 +80,10 @@ static void *get_cqe(struct mlx4_ib_cq *
 
 static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
 {
-	struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx4_cqe *cqe = get_cqe(cq, n & (cq->ibcq.cqe - 1));
 
 	return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
-		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+		!!(n & cq->ibcq.cqe)) ? NULL : cqe;
 }
 
 static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
@@ -108,8 +108,16 @@ struct ib_cq *mlx4_ib_create_cq(struct i
 	if (!cq)
 		return ERR_PTR(-ENOMEM);
 
-	entries      = roundup_pow_of_two(entries + 1);
-	cq->ibcq.cqe = entries - 1;
+	/* Eliminate the extra CQE for kernel-space callers.  For
+	 * userspace, this is done in libmlx4, so the ABI is not broken.
+	 */
+	if (context) {
+		entries      = roundup_pow_of_two(entries + 1);
+		cq->ibcq.cqe = entries - 1;
+	} else {
+		entries      = roundup_pow_of_two(entries);
+		cq->ibcq.cqe = entries;
+	}
 	buf_size     = entries * sizeof (struct mlx4_cqe);
 	spin_lock_init(&cq->lock);
 
@@ -222,7 +230,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
 		mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
 		ib_umem_release(mcq->umem);
 	} else {
-		mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe),
+		mlx4_buf_free(dev->dev, cq->cqe * sizeof (struct mlx4_cqe),
 			      &mcq->buf.buf);
 		mlx4_ib_db_free(dev, &mcq->db);
 	}
@@ -489,7 +497,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
 	 * from our QP and therefore don't need to be checked.
 	 */
 	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
-		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe - 1)
 			break;
 
 	/*
@@ -497,13 +505,13 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
 	 * that match our QP by copying older entries on top of them.
 	 */
 	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
-		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		cqe = get_cqe(cq, prod_index & (cq->ibcq.cqe - 1));
 		if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
 			if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
 				mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
 			++nfreed;
 		} else if (nfreed) {
-			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest = get_cqe(cq, (prod_index + nfreed) & (cq->ibcq.cqe - 1));
 			owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
 			memcpy(dest, cqe, sizeof *cqe);
 			dest->owner_sr_opcode = owner_bit |
Index: infiniband/drivers/net/mlx4/main.c
===================================================================
--- infiniband.orig/drivers/net/mlx4/main.c	2007-10-28 09:34:17.077932000 +0200
+++ infiniband/drivers/net/mlx4/main.c	2007-10-28 09:36:22.465430000 +0200
@@ -141,12 +141,7 @@ static int mlx4_dev_cap(struct mlx4_dev 
 	dev->caps.max_sq_desc_sz     = dev_cap->max_sq_desc_sz;
 	dev->caps.max_rq_desc_sz     = dev_cap->max_rq_desc_sz;
 	dev->caps.num_qp_per_mgm     = MLX4_QP_PER_MGM;
-	/*
-	 * Subtract 1 from the limit because we need to allocate a
-	 * spare CQE so the HCA HW can tell the difference between an
-	 * empty CQ and a full CQ.
-	 */
-	dev->caps.max_cqes	     = dev_cap->max_cq_sz - 1;
+	dev->caps.max_cqes	     = dev_cap->max_cq_sz;
 	dev->caps.reserved_cqs	     = dev_cap->reserved_cqs;
 	dev->caps.reserved_eqs	     = dev_cap->reserved_eqs;
 	dev->caps.reserved_mtts	     = DIV_ROUND_UP(dev_cap->reserved_mtts,
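
For reference, the reworked get_sw_cqe() depends on the kernel-side CQ
size (cq->ibcq.cqe) being a power of 2: n & (cq->ibcq.cqe - 1) selects
the ring slot, and n & cq->ibcq.cqe extracts the wrap-parity bit that
is compared against the hardware ownership bit.  This is how an empty
ring is told apart from a full one without the spare CQE.  A standalone
sketch of the same check (simplified types and names; OWNER_MASK stands
in for MLX4_CQE_OWNER_MASK):

	#include <stdbool.h>
	#include <stdint.h>

	#define OWNER_MASK 0x80	/* stands in for MLX4_CQE_OWNER_MASK */

	/*
	 * 'owner' is the ownership byte of the CQE at ring slot
	 * n & (size - 1); 'size' must be a power of 2.  The slot holds
	 * a valid software-owned completion iff its ownership bit
	 * matches the wrap parity of the consumer index n.
	 */
	static bool cqe_is_sw_owned(uint8_t owner, uint32_t n, uint32_t size)
	{
		uint32_t parity = !!(n & size);	/* flips on every wrap */

		return !!(owner & OWNER_MASK) == parity;
	}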


