[ofa-general] [PATCH v9] IB/mlx4: shrinking WQE

Jack Morgenstein jackm at dev.mellanox.co.il
Tue Dec 4 00:32:38 PST 2007


IB/mlx4: shrinking WQE

ConnectX supports shrinking WQEs, so that a single WR can include
multiple units of wqe_shift.  This way, WRs can differ in size, and
do not have to be a power of 2 in size, which saves memory and speeds
up send WR posting.  Unfortunately, if we do this, the wqe_index
field in the CQE can't be used to look up the WR ID anymore, so we do
this only if selective signalling is off.
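
As a rough standalone sketch of the arithmetic (not part of the patch;
the helper name is made up): a WR occupies a variable number of
wqe_shift-sized basic blocks, and the send queue index advances by that
count instead of by one.  Here "size" is in 16-byte units, as kept in
ctrl->fence_size:

/* Illustrative sketch only, not part of the patch. */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/*
 * Number of wqe_shift-sized basic blocks a WR occupies; "size" is in
 * 16-byte units.
 */
static unsigned wr_basic_blocks(unsigned size, unsigned wqe_shift)
{
	return DIV_ROUND_UP(size * 16, 1u << wqe_shift);
}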

Further, on 32-bit platforms, we can't use vmap to make
the QP buffer virtually contiguous.  Thus we have to use
constant-sized WRs to make sure a WR is always fully within
a single page-sized chunk.
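
A standalone sketch (made-up names, not the driver's code) of why this
matters: when the buffer stays split into page-sized chunks, a WQE
offset is resolved in two steps, chunk index and then offset within the
chunk, so a WR straddling a chunk boundary could not be addressed as
one contiguous piece:

/* Illustrative sketch only; 4 KB pages are assumed. */
#define EXAMPLE_PAGE_SHIFT	12
#define EXAMPLE_PAGE_SIZE	(1UL << EXAMPLE_PAGE_SHIFT)

static void *wqe_addr(void **page_list, unsigned long offset)
{
	return (char *)page_list[offset >> EXAMPLE_PAGE_SHIFT] +
	       (offset & (EXAMPLE_PAGE_SIZE - 1));
}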

Finally, we use WRs with the NOP opcode to avoid wrap-around
in the middle of a WR.  We set the NoErrorCompletion (NEC) bit to
avoid getting completions with error for NOP WRs.  Since NEC is only
supported starting with firmware 2.2.232, we use constant-sized WRs
for older firmware.  And, since MLX QPs only support SEND, we use
constant-sized WRs in this case as well.
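
Putting the conditions together, a hypothetical helper (descriptive
names only, not the driver's) returning whether variable-sized WRs may
be used:

/* Illustrative sketch only; names are descriptive, not the driver's. */
static int can_use_shrinking_wqes(unsigned long long fw_ver,
				  int all_wrs_signaled,
				  int is_64bit, int is_mlx_qp)
{
	/* NEC (NoErrorCompletion) first appears in firmware 2.2.232 */
	const unsigned long long fw_nec =
		(2ULL << 32) | (2ULL << 16) | 232;

	return fw_ver >= fw_nec &&	/* NOP WRs won't complete with error */
	       all_wrs_signaled &&	/* i.e. selective signalling is off */
	       is_64bit &&		/* QP buffer can be vmap'ed contiguously */
	       !is_mlx_qp;		/* MLX (SMI/GSI) QPs only support SEND */
}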

When stamping during NOP posting, do the stamping only after setting
the NOP WQE's valid bit.
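
As an ordering sketch only (made-up helpers; write_barrier() stands in
for the kernel's wmb() and the descriptor layout is elided), the
sequence is: finish the descriptor body, issue a write barrier, set the
ownership/valid bit, and only then stamp the blocks the NOP covers:

#include <stdint.h>

static void write_barrier(void)		/* stands in for wmb() */
{
	__sync_synchronize();
}

static void post_nop_sketch(volatile uint32_t *owner_word,
			    uint32_t nop_with_nec,
			    volatile uint32_t *stamp_words, unsigned nwords)
{
	unsigned i;

	/* ...the rest of the NOP descriptor is written here... */
	write_barrier();		/* body visible before the valid bit */
	*owner_word = nop_with_nec;	/* hand the NOP to the hardware */
	for (i = 0; i < nwords; ++i)	/* stamp only after the valid bit */
		stamp_words[i] = 0xffffffff;
}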

Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>
Signed-off-by: Jack Morgenstein <jackm at dev.mellanox.co.il>

---

changes since v8:
	fixed a bug in __mlx4_ib_modify_qp: qp->sq_next_wqe was not reset
	in the transition to RESET.
	Found by Mellanox regression testing.

Index: infiniband/drivers/infiniband/hw/mlx4/cq.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/cq.c	2007-10-29 11:09:24.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/cq.c	2007-12-04 09:29:39.391728000 +0200
@@ -331,6 +331,12 @@ static int mlx4_ib_poll_one(struct mlx4_
 	is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
 		MLX4_CQE_OPCODE_ERROR;
 
+	if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+		     is_send)) {
+		printk(KERN_WARNING "Completion for NOP opcode detected!\n");
+		return -EINVAL;
+	}
+
 	if (!*cur_qp ||
 	    (be32_to_cpu(cqe->my_qpn) & 0xffffff) != (*cur_qp)->mqp.qpn) {
 		/*
@@ -353,8 +359,10 @@ static int mlx4_ib_poll_one(struct mlx4_
 
 	if (is_send) {
 		wq = &(*cur_qp)->sq;
-		wqe_ctr = be16_to_cpu(cqe->wqe_index);
-		wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+		if (!(*cur_qp)->sq_signal_bits) {
+			wqe_ctr = be16_to_cpu(cqe->wqe_index);
+			wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+		}
 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
 	} else if ((*cur_qp)->ibqp.srq) {
Index: infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-10-24 18:19:56.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/mlx4_ib.h	2007-12-04 09:29:39.394732000 +0200
@@ -120,6 +120,8 @@ struct mlx4_ib_qp {
 
 	u32			doorbell_qpn;
 	__be32			sq_signal_bits;
+	unsigned		sq_next_wqe;
+	int			sq_max_wqes_per_wr;
 	int			sq_spare_wqes;
 	struct mlx4_ib_wq	sq;
 
Index: infiniband/drivers/infiniband/hw/mlx4/qp.c
===================================================================
--- infiniband.orig/drivers/infiniband/hw/mlx4/qp.c	2007-11-01 09:17:50.000000000 +0200
+++ infiniband/drivers/infiniband/hw/mlx4/qp.c	2007-12-04 09:29:39.404731000 +0200
@@ -30,6 +30,7 @@
  * SOFTWARE.
  */
 
+#include <linux/log2.h>
 #include <rdma/ib_cache.h>
 #include <rdma/ib_pack.h>
 
@@ -96,7 +97,7 @@ static int is_qp0(struct mlx4_ib_dev *de
 
 static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
 {
-	if (qp->buf.nbufs == 1)
+	if (BITS_PER_LONG == 64 || qp->buf.nbufs == 1)
 		return qp->buf.u.direct.buf + offset;
 	else
 		return qp->buf.u.page_list[offset >> PAGE_SHIFT].buf +
@@ -115,16 +116,88 @@ static void *get_send_wqe(struct mlx4_ib
 
 /*
  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
- * first four bytes of every 64 byte chunk with 0xffffffff, except for
- * the very first chunk of the WQE.
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max WR size is less than or equal to the WQE size,
+ * as an optimization, we can stamp the WQE with 0xffffffff,
+ * and skip the very first chunk of the WQE.
  */
-static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n)
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
 {
-	u32 *wqe = get_send_wqe(qp, n);
+	u32 *wqe;
 	int i;
+	int s;
+	int ind;
+	void *buf;
+	__be32 stamp;
+
+	s = roundup(size, 1 << qp->sq.wqe_shift);
+	if (qp->sq_max_wqes_per_wr > 1) {
+		for (i = 0; i < s; i += 64) {
+			ind = (i >> qp->sq.wqe_shift) + n;
+			stamp = ind & qp->sq.wqe_cnt ?  cpu_to_be32(0x7fffffff) :
+							cpu_to_be32(0xffffffff);
+			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+			*wqe = stamp;
+		}
+	} else {
+		buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+		for (i = 64; i < s; i += 64) {
+			wqe = buf + i;
+			*wqe = 0xffffffff;
+		}
+	}
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+	struct mlx4_wqe_ctrl_seg *ctrl;
+	struct mlx4_wqe_inline_seg *inl;
+	void *wqe;
+	int s;
+
+	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+	s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+	if (qp->ibqp.qp_type == IB_QPT_UD) {
+		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+		struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+		memset(dgram, 0, sizeof *dgram);
+		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+		s += sizeof(struct mlx4_wqe_datagram_seg);
+	}
+
+	/* Pad the remainder of the WQE with an inline data segment. */
+	if (size > s) {
+		inl = wqe + s;
+		inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+	}
+	ctrl->srcrb_flags = 0;
+	ctrl->fence_size = size / 16;
+	/*
+	 * Make sure descriptor is fully written before
+	 * setting ownership bit (because HW can start
+	 * executing as soon as we do).
+	 */
+	wmb();
 
-	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
-		wqe[i] = 0xffffffff;
+	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+		(n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+		ind += s;
+	}
+	return ind;
 }
 
 static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
@@ -241,6 +314,8 @@ static int set_rq_size(struct mlx4_ib_de
 static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
 			      enum ib_qp_type type, struct mlx4_ib_qp *qp)
 {
+	int s;
+
 	/* Sanity check SQ size before proceeding */
 	if (cap->max_send_wr	 > dev->dev->caps.max_wqes  ||
 	    cap->max_send_sge	 > dev->dev->caps.max_sq_sg ||
@@ -256,20 +331,69 @@ static int set_kernel_sq_size(struct mlx
 	    cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
 		return -EINVAL;
 
-	qp->sq.wqe_shift = ilog2(roundup_pow_of_two(max(cap->max_send_sge *
-							sizeof (struct mlx4_wqe_data_seg),
-							cap->max_inline_data +
-							sizeof (struct mlx4_wqe_inline_seg)) +
-						    send_wqe_overhead(type)));
-	qp->sq.max_gs    = ((1 << qp->sq.wqe_shift) - send_wqe_overhead(type)) /
-		sizeof (struct mlx4_wqe_data_seg);
+	s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+		cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+		send_wqe_overhead(type);
 
 	/*
-	 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
-	 * allow HW to prefetch.
+	 * Hermon supports shrinking wqe, such that a single WR can include
+	 * multiple units of wqe_shift.  This way, WRs can differ in size, and
+	 * do not have to be a power of 2 in size, saving memory and speeding up
+	 * send WR posting.  Unfortunately, if we do this, the wqe_index field in
+	 * the CQE can't be used to look up the WR ID anymore, so do this only if
+	 * selective signalling is off.
+	 *
+	 * Further, on 32-bit platforms, we can't use vmap to make
+	 * the QP buffer virtually contiguous. Thus we have to use
+	 * constant-sized WRs to make sure a WR is always fully within
+	 * a single page-sized chunk.
+	 *
+	 * Finally, we use NOP opcode to avoid wrap-around in the middle of WR.
+	 * We set NEC bit to avoid getting completions with error for NOP WRs.
+	 * Since NEC is only supported starting with firmware 2.2.232,
+	 * we use constant-sized WRs for older firmware.
+	 *
+	 * And, since MLX QPs only support SEND, we use constant-sized WRs in this
+	 * case.
+	 *
+	 * We look for the smallest value of wqe_shift such that the resulting
+	 * number of wqes does not exceed device capabilities.
+	 *
+	 * We set the WQE size to at least 64 bytes, so that stamping invalidates each WQE.
 	 */
-	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
-	qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + qp->sq_spare_wqes);
+	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+	    qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+	    type != IB_QPT_SMI && type != IB_QPT_GSI)
+		qp->sq.wqe_shift = ilog2(64);
+	else
+		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+	for (;;) {
+		if (1 << qp->sq.wqe_shift > dev->dev->caps.max_sq_desc_sz)
+			return -EINVAL;
+
+		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1 << qp->sq.wqe_shift);
+
+		/*
+		 * We need to leave 2 KB + 1 WR of headroom in the SQ to
+		 * allow HW to prefetch.
+		 */
+		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+		qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+						    qp->sq_max_wqes_per_wr +
+						    qp->sq_spare_wqes);
+
+		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+			break;
+
+		if (qp->sq_max_wqes_per_wr <= 1)
+			return -EINVAL;
+
+		++qp->sq.wqe_shift;
+	}
+
+	qp->sq.max_gs = ((qp->sq_max_wqes_per_wr << qp->sq.wqe_shift) -
+			 send_wqe_overhead(type)) / sizeof (struct mlx4_wqe_data_seg);
 
 	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
@@ -281,7 +405,8 @@ static int set_kernel_sq_size(struct mlx
 		qp->sq.offset = 0;
 	}
 
-	cap->max_send_wr  = qp->sq.max_post = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+	cap->max_send_wr  = qp->sq.max_post =
+		(qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
 	cap->max_send_sge = qp->sq.max_gs;
 	/* We don't support inline sends for kernel QPs (yet) */
 	cap->max_inline_data = 0;
@@ -327,6 +452,12 @@ static int create_qp_common(struct mlx4_
 	qp->rq.tail	    = 0;
 	qp->sq.head	    = 0;
 	qp->sq.tail	    = 0;
+	qp->sq_next_wqe     = 0;
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+	else
+		qp->sq_signal_bits = 0;
 
 	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, !!init_attr->srq, qp);
 	if (err)
@@ -417,11 +548,6 @@ static int create_qp_common(struct mlx4_
 	 */
 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
 
-	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
-	else
-		qp->sq_signal_bits = 0;
-
 	qp->mqp.event = mlx4_ib_qp_event;
 
 	return 0;
@@ -916,7 +1042,7 @@ static int __mlx4_ib_modify_qp(struct ib
 			ctrl = get_send_wqe(qp, i);
 			ctrl->owner_opcode = cpu_to_be32(1 << 31);
 
-			stamp_send_wqe(qp, i);
+			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
 		}
 	}
 
@@ -969,6 +1095,7 @@ static int __mlx4_ib_modify_qp(struct ib
 		qp->rq.tail = 0;
 		qp->sq.head = 0;
 		qp->sq.tail = 0;
+		qp->sq_next_wqe = 0;
 		if (!ibqp->srq)
 			*qp->db.db  = 0;
 	}
@@ -1278,13 +1405,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
 	unsigned long flags;
 	int nreq;
 	int err = 0;
-	int ind;
-	int size;
+	unsigned ind;
+	int uninitialized_var(stamp);
+	int uninitialized_var(size);
 	int i;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
 
-	ind = qp->sq.head;
+	ind = qp->sq_next_wqe;
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
@@ -1300,7 +1428,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
 		}
 
 		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
-		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
 		ctrl->srcrb_flags =
 			(wr->send_flags & IB_SEND_SIGNALED ?
@@ -1413,16 +1541,23 @@ int mlx4_ib_post_send(struct ib_qp *ibqp
 		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
 			(ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
 
+		stamp = ind + qp->sq_spare_wqes;
+		ind += DIV_ROUND_UP(size * 16, 1 << qp->sq.wqe_shift);
+
 		/*
 		 * We can improve latency by not stamping the last
 		 * send queue WQE until after ringing the doorbell, so
 		 * only stamp here if there are still more WQEs to post.
+		 *
+		 * Same optimization applies to padding with NOP wqe
+		 * in case of WQE shrinking (used to prevent wrap-around
+		 * in the middle of WR).
 		 */
-		if (wr->next)
-			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
-				       (qp->sq.wqe_cnt - 1));
+		if (wr->next) {
+			stamp_send_wqe(qp, stamp, size * 16);
+			ind = pad_wraparound(qp, ind);
+		}
 
-		++ind;
 	}
 
 out:
@@ -1444,8 +1579,10 @@ out:
 		 */
 		mmiowb();
 
-		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
-			       (qp->sq.wqe_cnt - 1));
+		stamp_send_wqe(qp, stamp, size * 16);
+
+		ind = pad_wraparound(qp, ind);
+		qp->sq_next_wqe = ind;
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
Index: infiniband/drivers/net/mlx4/alloc.c
===================================================================
--- infiniband.orig/drivers/net/mlx4/alloc.c	2007-11-08 10:01:19.000000000 +0200
+++ infiniband/drivers/net/mlx4/alloc.c	2007-12-04 09:29:39.410730000 +0200
@@ -151,6 +151,19 @@ int mlx4_buf_alloc(struct mlx4_dev *dev,
 
 			memset(buf->u.page_list[i].buf, 0, PAGE_SIZE);
 		}
+
+		if (BITS_PER_LONG == 64) {
+			struct page **pages;
+			pages = kmalloc(sizeof *pages * buf->nbufs, GFP_KERNEL);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; ++i)
+				pages[i] = virt_to_page(buf->u.page_list[i].buf);
+			buf->u.direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->u.direct.buf)
+				goto err_free;
+		}
 	}
 
 	return 0;
@@ -170,6 +183,9 @@ void mlx4_buf_free(struct mlx4_dev *dev,
 		dma_free_coherent(&dev->pdev->dev, size, buf->u.direct.buf,
 				  buf->u.direct.map);
 	else {
+		if (BITS_PER_LONG == 64)
+			vunmap(buf->u.direct.buf);
+
 		for (i = 0; i < buf->nbufs; ++i)
 			if (buf->u.page_list[i].buf)
 				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
Index: infiniband/include/linux/mlx4/device.h
===================================================================
--- infiniband.orig/include/linux/mlx4/device.h	2007-10-16 15:50:08.000000000 +0200
+++ infiniband/include/linux/mlx4/device.h	2007-12-04 09:29:39.417729000 +0200
@@ -133,6 +133,11 @@ enum {
 	MLX4_STAT_RATE_OFFSET	= 5
 };
 
+static inline u64 mlx4_fw_ver(u64 major, u64 minor, u64 subminor)
+{
+	return (major << 32) | (minor << 16) | subminor;
+}
+
 struct mlx4_caps {
 	u64			fw_ver;
 	int			num_ports;
@@ -189,7 +194,7 @@ struct mlx4_buf_list {
 };
 
 struct mlx4_buf {
-	union {
+	struct {
 		struct mlx4_buf_list	direct;
 		struct mlx4_buf_list   *page_list;
 	} u;
Index: infiniband/include/linux/mlx4/qp.h
===================================================================
--- infiniband.orig/include/linux/mlx4/qp.h	2007-10-16 15:50:08.000000000 +0200
+++ infiniband/include/linux/mlx4/qp.h	2007-12-04 09:29:39.420729000 +0200
@@ -154,7 +154,11 @@ struct mlx4_qp_context {
 	u32			reserved5[10];
 };
 
+/* Which firmware version adds support for NEC (NoErrorCompletion) bit */
+#define MLX4_FW_VER_WQE_CTRL_NEC mlx4_fw_ver(2, 2, 232)
+
 enum {
+	MLX4_WQE_CTRL_NEC	= 1 << 29,
 	MLX4_WQE_CTRL_FENCE	= 1 << 6,
 	MLX4_WQE_CTRL_CQ_UPDATE	= 3 << 2,
 	MLX4_WQE_CTRL_SOLICITED	= 1 << 1,


