[ofa-general] [PATCH] IB/mlx4: shrinking WQE
Michael S. Tsirkin
mst at dev.mellanox.co.il
Thu Aug 30 06:00:04 PDT 2007
ConnectX supports shrinking WQEs, such that a single WR can include
multiple units of wqe_shift.  This way, WRs can differ in size, and
do not have to be a power of 2 in size, saving memory and speeding up
send WR posting.  Unfortunately, if we do this, the wqe_index field in
the CQE can't be used to look up the WR ID anymore, so we do this only
if selective signalling is off.

Further, on 32-bit platforms, we can't use vmap() to make
the QP buffer virtually contiguous.  Thus we have to use
constant-sized WRs to make sure a WR is always fully within
a single page-sized chunk.

Finally, we use the NOP opcode to avoid wrap-around in the middle of
a WR.  Since MLX QPs only support SEND, we use constant-sized WRs in
this case.  We look for the smallest value of wqe_shift such that the
resulting number of WQEs does not exceed device capabilities.
Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
---
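[Note, not part of the patch: the following standalone sketch illustrates
the search in set_kernel_sq_size() below for the smallest wqe_shift.  The
device limits (max_wqes, max_desc) and the per-WR size s are made-up
example values, not real ConnectX caps.]

/* Standalone sketch (not kernel code): find the smallest wqe_shift such
 * that max_send_wr WRs of s bytes each, plus prefetch headroom, fit in a
 * power-of-2 number of WQEs within the device limit. */
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned roundup_pow_of_two(unsigned x)
{
	unsigned r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned s = 200;		/* bytes needed per WR (example) */
	unsigned max_send_wr = 1000;	/* requested by the consumer */
	unsigned max_wqes = 16384;	/* device cap (example value) */
	unsigned max_desc = 1008;	/* max SQ desc size (example value) */
	unsigned wqe_shift = 6;		/* start from the 64-byte NOP unit */

	for (;;) {
		unsigned wqes_per_wr, spare, cnt;

		if ((1u << wqe_shift) > max_desc)
			return 1;	/* cannot satisfy the request */

		wqes_per_wr = DIV_ROUND_UP(s, 1u << wqe_shift);
		/* 2 KB + 1 WR of headroom for HW prefetch */
		spare = (2048 >> wqe_shift) + wqes_per_wr;
		cnt = roundup_pow_of_two(max_send_wr * wqes_per_wr + spare);

		if (cnt <= max_wqes)
			break;
		if (wqes_per_wr <= 1)
			return 1;	/* already constant-sized; give up */
		++wqe_shift;
	}

	printf("wqe_shift = %u (%u WQEs per WR)\n",
	       wqe_shift, DIV_ROUND_UP(s, 1u << wqe_shift));
	return 0;
}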
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 8bf44da..0981f3c 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -331,6 +331,11 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
MLX4_CQE_OPCODE_ERROR;
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP)) {
+ printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+ return -EINVAL;
+ }
+
if (!*cur_qp ||
(be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
/*
@@ -353,8 +358,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
if (is_send) {
wq = &(*cur_qp)->sq;
- wqe_ctr = be16_to_cpu(cqe->wqe_index);
- wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ if (!(*cur_qp)->sq_signal_bits) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ }
wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
++wq->tail;
} else if ((*cur_qp)->ibqp.srq) {
@@ -403,6 +410,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
case MLX4_OPCODE_BIND_MW:
wc->opcode = IB_WC_BIND_MW;
break;
+ default:
+ printk("Unrecognized send opcode 0x%x!\n",
+ cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK);
+ return -EINVAL;
}
} else {
wc->byte_len = be32_to_cpu(cqe->byte_cnt);
@@ -422,6 +433,10 @@ static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
wc->wc_flags = IB_WC_WITH_IMM;
wc->imm_data = cqe->immed_rss_invalid;
break;
+ default:
+ printk("Unrecognized recv opcode 0x%x!\n",
+ cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK);
+ return -EINVAL;
}
wc->slid = be16_to_cpu(cqe->rlid);
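[Note, not part of the patch: with selective signalling (sq_signal_bits
== 0), the wqe_index reported in the CQE is used above to fast-forward
the SQ tail past unsignalled WQEs.  The catch-up relies on 16-bit
wrap-around arithmetic; a minimal standalone sketch with example values:]

/* Standalone sketch (not kernel code): advance a free-running tail to
 * the 16-bit WQE counter reported by hardware.  The subtraction is done
 * modulo 2^16, so the tail can be fast-forwarded even after the counter
 * wraps. */
#include <assert.h>
#include <stdint.h>

static void catch_up(uint32_t *tail, uint16_t wqe_ctr)
{
	*tail += (uint16_t)(wqe_ctr - (uint16_t)*tail);
}

int main(void)
{
	uint32_t tail = 0x1fffe;	/* low 16 bits about to wrap */

	catch_up(&tail, 3);		/* hardware is already at index 3 */
	assert(tail == 0x20003);
	return 0;
}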
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 705ff2f..a72ecb9 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -115,6 +115,8 @@ struct mlx4_ib_qp {
u32 doorbell_qpn;
__be32 sq_signal_bits;
+ unsigned sq_next_wqe;
+ int sq_max_wqes_per_wr;
int sq_spare_wqes;
struct mlx4_ib_wq sq;
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index ba0428d..fd88d99 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -30,6 +30,7 @@
* SOFTWARE.
*/
+#include <linux/log2.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_pack.h>
@@ -92,7 +93,7 @@ static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
{
- if (qp->buf.nbufs == 1)
+ if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
return qp->buf.u.direct.buf + offset;
else
return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
@@ -111,16 +112,70 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
/*
* Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
- * the very first chunk of the WQE.
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max WR size is less than or equal to the WQE size,
+ * as an optimization, we can stamp the WQE with 0xffffffff,
+ * and skip the very first chunk of the WQE.
*/
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
{
u32 *wqe = get_send_wqe(qp, n);
int i;
+ int s;
+ __be32 stamp;
+
+ s = roundup(size, 1 << qp->sq.wqe_shift) / sizeof *wqe;
+ if (qp->sq_max_wqes_per_wr > 1) {
+ stamp = cpu_to_be32(0x7fffffff | (n & qp->sq.wqe_cnt ? 0 : 1 << 31));
+ for (i = 0; i < s; i += 16)
+ wqe[i] = stamp;
+ } else {
+ for (i = 16; i < s; i += 16)
+ wqe[i] = 0xffffffff;
+ }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_inline_seg *inl;
+ void *wqe;
+ int s;
+
+ stamp_send_wqe(qp, (n + qp->sq_spare_wqes) & (qp->sq.wqe_cnt - 1), size);
+
+ ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = qp->ibqp.qp_type == IB_QPT_UD ? sizeof(struct mlx4_wqe_datagram_seg) : 0;
+
+ /* Pad the remainder of the WQE with an inline data segment. */
+ if (size > s) {
+ inl = wqe + s;
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ }
+ ctrl->srcrb_flags = 0;
+ ctrl->fence_size = size / 16;
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+}
- for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
- wqe[i] = 0xffffffff;
+/* Post NOP WQE to prevent wrap-around in the middle of a WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+ unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+ if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+ post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+ ind += s;
+ }
+ return ind;
}
static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
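[Note, not part of the patch: pad_wraparound() depends on wqe_cnt being a
power of 2, so the number of slots left before the ring wraps is simply
wqe_cnt - (ind & (wqe_cnt - 1)).  A standalone sketch of the padding
decision with example numbers:]

/* Standalone sketch (not kernel code): decide whether the next WR must
 * be preceded by a NOP that consumes the tail of the ring.  wqe_cnt must
 * be a power of 2. */
#include <stdio.h>

static unsigned slots_to_wrap(unsigned ind, unsigned wqe_cnt)
{
	return wqe_cnt - (ind & (wqe_cnt - 1));
}

int main(void)
{
	/* 64 slots, a WR may need up to 4; from index 62 only 2 slots
	 * remain, so a 2-slot NOP is posted and the WR starts at 64. */
	unsigned ind = 62, wqe_cnt = 64, max_wqes_per_wr = 4;
	unsigned s = slots_to_wrap(ind, wqe_cnt);

	if (s < max_wqes_per_wr)
		ind += s;	/* the NOP fills slots 62 and 63 */

	printf("next WR starts at slot %u\n", ind & (wqe_cnt - 1));
	return 0;
}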
@@ -234,9 +289,35 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
return 0;
}
+static int nop_wqe_shift(enum ib_qp_type type)
+{
+ /*
+ * WQE size is at least 0x20.
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs must have a control segment.
+ * MLX WQEs do not support NOP.
+ */
+ switch (type) {
+ case IB_QPT_UD:
+ return ilog2(roundup_pow_of_two(max(sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg),
+ (size_t)0x20)));
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return -EINVAL;
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ default:
+ return ilog2(roundup_pow_of_two(max(sizeof (struct mlx4_wqe_ctrl_seg),
+ (size_t)0x20)));
+ }
+}
+
static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
enum ib_qp_type type, struct mlx4_ib_qp *qp)
{
+ int s;
+
/* Sanity check SQ size before proceeding */
if (cap->max_send_wr > dev->dev->caps.max_wqes ||
cap->max_send_sge > dev->dev->caps.max_sq_sg ||
@@ -252,20 +333,60 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
return -EINVAL;
- qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
- sizeof (struct mlx4_wqe_data_seg),
- cap->max_inline_data +
- sizeof (struct mlx4_wqe_inline_seg)) +
- send_wqe_overhead(type)));
- qp->sq.max_gs = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
- sizeof (struct mlx4_wqe_data_seg);
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type);
/*
- * We need to leave 2 KB + 1 WQE of headroom in the SQ to
- * allow HW to prefetch.
+ * Hermon supports shrinking WQEs, such that a single WR can include
+ * multiple units of wqe_shift.  This way, WRs can differ in size, and
+ * do not have to be a power of 2 in size, saving memory and speeding up
+ * send WR posting.  Unfortunately, if we do this, the wqe_index field
+ * in the CQE can't be used to look up the WR ID anymore, so do this
+ * only if selective signalling is off.
+ *
+ * Further, on 32-bit platforms, we can't use vmap() to make
+ * the QP buffer virtually contiguous.  Thus we have to use
+ * constant-sized WRs to make sure a WR is always fully within
+ * a single page-sized chunk.
+ *
+ * Finally, we use the NOP opcode to avoid wrap-around in the middle
+ * of a WR.  Since MLX QPs only support SEND, we use constant-sized
+ * WRs in this case.
+ *
+ * We look for the smallest value of wqe_shift such that the resulting
+ * number of wqes does not exceed device capabilities.
*/
- qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
- qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+ qp->sq.wqe_shift = nop_wqe_shift(type);
+ if (!qp->sq_signal_bits || BITS_PER_LONG != 64 || qp->sq.wqe_shift < 0)
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+ for (;;) {
+ if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+ qp->sq_max_wqes_per_wr +
+ qp->sq_spare_wqes);
+
+ if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+ break;
+
+ if (qp->sq_max_wqes_per_wr <= 1)
+ return -EINVAL;
+
+ ++qp->sq.wqe_shift;
+ }
+
+ qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+ send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -277,7 +398,8 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
qp->sq.offset = 0;
}
- cap->max_send_wr = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+ cap->max_send_wr = qp->sq.max_post =
+ (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
cap->max_send_sge = qp->sq.max_gs;
/* We don't support inline sends for kernel QPs (yet) */
cap->max_inline_data = 0;
@@ -315,6 +437,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
qp->rq.tail = 0;
qp->sq.head = 0;
qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
+
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+ else
+ qp->sq_signal_bits = 0;
err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
if (err)
@@ -405,11 +533,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
*/
qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
- if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
- qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
- else
- qp->sq_signal_bits = 0;
-
qp->mqp.event = mlx4_ib_qp_event;
return 0;
@@ -904,7 +1027,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
ctrl = get_send_wqe(qp, i);
ctrl->owner_opcode = cpu_to_be32(1 << 31);
- stamp_send_wqe(qp, i);
+ stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
}
}
@@ -1228,14 +1351,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
unsigned long flags;
int nreq;
int err = 0;
- int ind;
- int size;
+ unsigned ind;
+ int uninitialized_var(stamp);
+ int uninitialized_var(size);
int i;
spin_lock_irqsave(&qp->rq.lock, flags);
- ind = qp->sq.head;
-
+ ind = qp->sq_next_wqe;
for (nreq = 0; wr; ++nreq, wr = wr->next) {
if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
err = -ENOMEM;
@@ -1250,7 +1373,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
}
ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
- qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
ctrl->srcrb_flags =
(wr->send_flags & IB_SEND_SIGNALED ?
@@ -1266,7 +1389,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
ctrl->imm = 0;
wqe += sizeof *ctrl;
- size = sizeof *ctrl / 16;
+ size = sizeof *ctrl;
switch (ibqp->qp_type) {
case IB_QPT_RC:
@@ -1281,8 +1404,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
set_atomic_seg(wqe, wr);
wqe += sizeof (struct mlx4_wqe_atomic_seg);
- size += (sizeof (struct mlx4_wqe_raddr_seg) +
- sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+ size += sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg);
break;
@@ -1292,7 +1415,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
wr->wr.rdma.rkey);
wqe += sizeof (struct mlx4_wqe_raddr_seg);
- size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+ size += sizeof (struct mlx4_wqe_raddr_seg);
break;
default:
@@ -1304,7 +1427,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
case IB_QPT_UD:
set_datagram_seg(wqe, wr);
wqe += sizeof (struct mlx4_wqe_datagram_seg);
- size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ size += sizeof (struct mlx4_wqe_datagram_seg);
break;
case IB_QPT_SMI:
@@ -1315,7 +1438,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
goto out;
}
wqe += err;
- size += err / 16;
+ size += err;
err = 0;
break;
@@ -1328,7 +1451,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
set_data_seg(wqe, wr->sg_list + i);
wqe += sizeof (struct mlx4_wqe_data_seg);
- size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ size += sizeof (struct mlx4_wqe_data_seg);
}
/* Add one more inline data segment for ICRC for MLX sends */
@@ -1337,11 +1460,11 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
cpu_to_be32((1 << 31) | 4);
((u32 *) wqe)[1] = 0;
wqe += sizeof (struct mlx4_wqe_data_seg);
- size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ size += sizeof (struct mlx4_wqe_data_seg);
}
ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
- MLX4_WQE_CTRL_FENCE : 0) | size;
+ MLX4_WQE_CTRL_FENCE : 0) | (size / 16);
/*
* Make sure descriptor is fully written before
@@ -1358,16 +1481,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+ stamp = (ind + qp->sq_spare_wqes) & (qp->sq.wqe_cnt - 1);
+ ind += DIV_ROUND_UP(size, 1 << qp->sq.wqe_shift);
+
/*
* We can improve latency by not stamping the last
* send queue WQE until after ringing the doorbell, so
* only stamp here if there are still more WQEs to post.
+ *
+ * The same optimization applies to padding with a NOP WQE
+ * in the case of WQE shrinking (used to prevent wrap-around
+ * in the middle of a WR).
*/
- if (wr->next)
- stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
- (qp->sq.wqe_cnt - 1));
+ if (wr->next) {
+ stamp_send_wqe(qp, stamp, size);
+ ind = pad_wraparound(qp, ind);
+ }
- ++ind;
}
out:
@@ -1389,8 +1519,10 @@ out:
*/
mmiowb();
- stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
- (qp->sq.wqe_cnt - 1));
+ stamp_send_wqe(qp, stamp, size);
+
+ ind = pad_wraparound(qp, ind);
+ qp->sq_next_wqe = ind;
}
spin_unlock_irqrestore(&qp->rq.lock, flags);
--
MST