[ofa-general] [PATCH 5 of 5 v2] mlx4: Do not allocate an extra (unneeded) CQE when creating a CQ
Jack Morgenstein
jackm at dev.mellanox.co.il
Sun Oct 28 00:59:57 PDT 2007
mlx4: Do not allocate an extra (unneeded) CQE when creating a CQ.
The extra CQE can cause a huge waste of memory if requesting
a power-of-2 number of CQEs.
The number of CQEs in the cq that is returned to the kernel caller
is now a power-of-2. The value returned to userspace callers
is the same as before, in order to preserve the ABI.
Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>
---
Roland,
The previous patch neglected to increase the number of CQEs returned
to the verbs-layer caller by 1. If the mlx4 layer was invoked with a
power of 2, the returned value was <power-of-2> - 1, which is not in
conformance with the IB spec.
This patch fixes that oversight.
In order to preserve the ABI, the kernel still returns <power-of-2> - 1 cqes
to a userspace caller; adjustments are made in userspace by libmlx4.
- Jack
Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c 2007-10-28 09:34:17.055937000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/cq.c 2007-10-28 09:36:22.457431000 +0200
@@ -80,10 +80,10 @@ static void *get_cqe(struct mlx4_ib_cq *
static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
{
- struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+ struct mlx4_cqe *cqe = get_cqe(cq, n & (cq->ibcq.cqe - 1));
return (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
- !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+ !!(n & cq->ibcq.cqe)) ? NULL : cqe;
}
static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
@@ -108,8 +108,16 @@ struct ib_cq *mlx4_ib_create_cq(struct i
if (!cq)
return ERR_PTR(-ENOMEM);
- entries = roundup_pow_of_two(entries + 1);
- cq->ibcq.cqe = entries - 1;
+ /* eliminate using extra CQE (for kernel space).
+ * For userspace, do in libmlx4, so that don't break ABI.
+ */
+ if (context) {
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ } else {
+ entries = roundup_pow_of_two(entries);
+ cq->ibcq.cqe = entries;
+ }
buf_size = entries * sizeof (struct mlx4_cqe);
spin_lock_init(&cq->lock);
@@ -222,7 +230,7 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
ib_umem_release(mcq->umem);
} else {
- mlx4_buf_free(dev->dev, (cq->cqe + 1) * sizeof (struct mlx4_cqe),
+ mlx4_buf_free(dev->dev, (cq->cqe) * sizeof (struct mlx4_cqe),
&mcq->buf.buf);
mlx4_ib_db_free(dev, &mcq->db);
}
@@ -489,7 +497,7 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
* from our QP and therefore don't need to be checked.
*/
for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
- if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe - 1)
break;
/*
@@ -497,13 +505,13 @@ void __mlx4_ib_cq_clean(struct mlx4_ib_c
* that match our QP by copying older entries on top of them.
*/
while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
- cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ cqe = get_cqe(cq, prod_index & (cq->ibcq.cqe - 1));
if ((be32_to_cpu(cqe->my_qpn) & 0xffffff) == qpn) {
if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
++nfreed;
} else if (nfreed) {
- dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ dest = get_cqe(cq, (prod_index + nfreed) & (cq->ibcq.cqe - 1));
owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
memcpy(dest, cqe, sizeof *cqe);
dest->owner_sr_opcode = owner_bit |
Index: infiniband/drivers/net/mlx4/main.c
===================================================================
--- infiniband.orig/drivers/net/mlx4/main.c 2007-10-28 09:34:17.077932000 +0200
+++ infiniband/drivers/net/mlx4/main.c 2007-10-28 09:36:22.465430000 +0200
@@ -141,12 +141,7 @@ static int mlx4_dev_cap(struct mlx4_dev
dev->caps.max_sq_desc_sz = dev_cap->max_sq_desc_sz;
dev->caps.max_rq_desc_sz = dev_cap->max_rq_desc_sz;
dev->caps.num_qp_per_mgm = MLX4_QP_PER_MGM;
- /*
- * Subtract 1 from the limit because we need to allocate a
- * spare CQE so the HCA HW can tell the difference between an
- * empty CQ and a full CQ.
- */
- dev->caps.max_cqes = dev_cap->max_cq_sz - 1;
+ dev->caps.max_cqes = dev_cap->max_cq_sz;
dev->caps.reserved_cqs = dev_cap->reserved_cqs;
dev->caps.reserved_eqs = dev_cap->reserved_eqs;
dev->caps.reserved_mtts = DIV_ROUND_UP(dev_cap->reserved_mtts,
More information about the general
mailing list