[ofa-general] [PATCH/RFC] libmlx4: Handle new FW requirement for send request prefetching

Roland Dreier rdreier at cisco.com
Wed Jun 13 10:34:39 PDT 2007


Similarly, I've just added this to libmlx4.  The change to handle alignment
for inline send segments will be a separate patch, which I'm still cleaning
up.  Anyway, let me know if you see any problems with this one.

BTW, with FW 2.0.158 I am seeing the HCA FW crash after running
ibv_srq_pingpong with default parameters.  I'm not sure whether this is a
driver bug (I am using my latest kernel driver and libmlx4) or a firmware
problem.

commit 561da8d10e419ffb333fe6faf05004d9a3670e7a
Author: Roland Dreier <rolandd at cisco.com>
Date:   Wed Jun 13 10:31:16 2007 -0700

    Handle new FW requirement for send request prefetching
    
    New ConnectX firmware introduces FW command interface revision 2,
    which requires that for each QP, a chunk of send queue entries (the
    "headroom") is kept marked as invalid, so that the HCA doesn't get
    confused if it prefetches entries that haven't been posted yet.  Add
    code to libmlx4 to do this.
    
    Also, handle the new kernel ABI that adds the sq_no_prefetch parameter
    to the create QP operation.  We just hard-code sq_no_prefetch to 0 and
    always provide the full SQ headroom for now.
    
    Based on a patch from Jack Morgenstein <jackm at dev.mellanox.co.il>.
    
    Signed-off-by: Roland Dreier <rolandd at cisco.com>

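For anyone reviewing the headroom logic, here is a minimal standalone sketch
of how the numbers work out.  It assumes a 256-byte send WQE purely for
illustration; the authoritative code is the stamp_send_wqe() and
sq_spare_wqes hunks in qp.c and verbs.c below.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Invalidate a send WQE the way stamp_send_wqe() does: write
     * 0xffffffff into the first dword of every 64-byte chunk except
     * the very first one, so a prefetched-but-unposted entry can
     * never look like a valid WQE to the HCA. */
    static void stamp_wqe(uint32_t *wqe, int wqe_shift)
    {
            int i;

            for (i = 16; i < 1 << (wqe_shift - 2); i += 16)
                    wqe[i] = 0xffffffff;
    }

    int main(void)
    {
            int wqe_shift = 8;              /* 256-byte WQEs, illustration only */
            uint32_t wqe[1 << (8 - 2)];     /* one WQE = 64 dwords */
            int i;

            /* 2 KB + 1 WQE of headroom, as computed in mlx4_create_qp() */
            printf("sq_spare_wqes = %d\n", (2048 >> wqe_shift) + 1);  /* 9 */

            memset(wqe, 0, sizeof wqe);
            stamp_wqe(wqe, wqe_shift);

            /* First dword of each 64-byte chunk: 0 for the first chunk,
             * 0xffffffff for the remaining three. */
            for (i = 0; i < 1 << (wqe_shift - 2); i += 16)
                    printf("dword %2d: 0x%08x\n", i, wqe[i]);

            return 0;
    }
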
diff --git a/src/cq.c b/src/cq.c
index a1831ff..f3e3e3c 100644
--- a/src/cq.c
+++ b/src/cq.c
@@ -239,7 +239,7 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 		wq = &(*cur_qp)->sq;
 		wqe_index = ntohs(cqe->wqe_index);
 		wq->tail += (uint16_t) (wqe_index - (uint16_t) wq->tail);
-		wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
 	} else if ((*cur_qp)->ibv_qp.srq) {
 		srq = to_msrq((*cur_qp)->ibv_qp.srq);
@@ -248,7 +248,7 @@ static int mlx4_poll_one(struct mlx4_cq *cq,
 		mlx4_free_srq_wqe(srq, wqe_index);
 	} else {
 		wq = &(*cur_qp)->rq;
-		wc->wr_id = wq->wrid[wq->tail & (wq->max - 1)];
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
 		++wq->tail;
 	}
 
diff --git a/src/mlx4-abi.h b/src/mlx4-abi.h
index 97f5dcd..20a40c9 100644
--- a/src/mlx4-abi.h
+++ b/src/mlx4-abi.h
@@ -36,7 +36,7 @@
 #include <infiniband/kern-abi.h>
 
 #define MLX4_UVERBS_MIN_ABI_VERSION	2
-#define MLX4_UVERBS_MAX_ABI_VERSION	2
+#define MLX4_UVERBS_MAX_ABI_VERSION	3
 
 struct mlx4_alloc_ucontext_resp {
 	struct ibv_get_context_resp	ibv_resp;
@@ -86,7 +86,8 @@ struct mlx4_create_qp {
 	__u64				db_addr;
 	__u8				log_sq_bb_count;
 	__u8				log_sq_stride;
-	__u8				reserved[6];
+	__u8				sq_no_prefetch;	/* was reserved in ABI 2 */
+	__u8				reserved[5];
 };
 
 #endif /* MLX4_ABI_H */
diff --git a/src/mlx4.h b/src/mlx4.h
index e29f456..3710a17 100644
--- a/src/mlx4.h
+++ b/src/mlx4.h
@@ -200,7 +200,8 @@ struct mlx4_srq {
 struct mlx4_wq {
 	uint64_t		       *wrid;
 	pthread_spinlock_t		lock;
-	int				max;
+	int				wqe_cnt;
+	int				max_post;
 	unsigned			head;
 	unsigned			tail;
 	int				max_gs;
@@ -216,6 +217,7 @@ struct mlx4_qp {
 
 	uint32_t			doorbell_qpn;
 	uint32_t			sq_signal_bits;
+	int				sq_spare_wqes;
 	struct mlx4_wq			sq;
 
 	uint32_t		       *db;
@@ -342,6 +344,8 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			  struct ibv_send_wr **bad_wr);
 int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			  struct ibv_recv_wr **bad_wr);
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+			   struct mlx4_qp *qp);
 int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 		       enum ibv_qp_type type, struct mlx4_qp *qp);
 void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
diff --git a/src/qp.c b/src/qp.c
index 7df3311..301f7cb 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -65,6 +65,20 @@ static void *get_send_wqe(struct mlx4_qp *qp, int n)
 	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
 }
 
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with 0xffffffff, except for
+ * the very first chunk of the WQE.
+ */
+static void stamp_send_wqe(struct mlx4_qp *qp, int n)
+{
+	uint32_t *wqe = get_send_wqe(qp, n);
+	int i;
+
+	for (i = 16; i < 1 << (qp->sq.wqe_shift - 2); i += 16)
+		wqe[i] = 0xffffffff;
+}
+
 void mlx4_init_qp_indices(struct mlx4_qp *qp)
 {
 	qp->sq.head	 = 0;
@@ -78,9 +92,11 @@ void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
 	struct mlx4_wqe_ctrl_seg *ctrl;
 	int i;
 
-	for (i = 0; i < qp->sq.max; ++i) {
+	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
 		ctrl = get_send_wqe(qp, i);
 		ctrl->owner_opcode = htonl(1 << 31);
+
+		stamp_send_wqe(qp, i);
 	}
 }
 
@@ -89,14 +105,14 @@ static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
 	unsigned cur;
 
 	cur = wq->head - wq->tail;
-	if (cur + nreq < wq->max)
+	if (cur + nreq < wq->max_post)
 		return 0;
 
 	pthread_spin_lock(&cq->lock);
 	cur = wq->head - wq->tail;
 	pthread_spin_unlock(&cq->lock);
 
-	return cur + nreq >= wq->max;
+	return cur + nreq >= wq->max_post;
 }
 
 int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
@@ -138,8 +154,8 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			goto out;
 		}
 
-		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.max - 1));
-		qp->sq.wrid[ind & (qp->sq.max - 1)] = wr->wr_id;
+		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
 
 		ctrl->srcrb_flags =
 			(wr->send_flags & IBV_SEND_SIGNALED ?
@@ -274,7 +290,16 @@ int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 		wmb();
 
 		ctrl->owner_opcode = htonl(mlx4_ib_opcode[wr->opcode]) |
-			(ind & qp->sq.max ? htonl(1 << 31) : 0);
+			(ind & qp->sq.wqe_cnt ? htonl(1 << 31) : 0);
+
+		/*
+		 * We can improve latency by not stamping the last
+		 * send queue WQE until after ringing the doorbell, so
+		 * only stamp here if there are still more WQEs to post.
+		 */
+		if (wr->next)
+			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
+				       (qp->sq.wqe_cnt - 1));
 
 		++ind;
 	}
@@ -313,6 +338,10 @@ out:
 		*(uint32_t *) (ctx->uar + MLX4_SEND_DOORBELL) = qp->doorbell_qpn;
 	}
 
+	if (nreq)
+		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
+			       (qp->sq.wqe_cnt - 1));
+
 	pthread_spin_unlock(&qp->sq.lock);
 
 	return ret;
@@ -332,7 +361,7 @@ int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 
 	/* XXX check that state is OK to post receive */
 
-	ind = qp->rq.head & (qp->rq.max - 1);
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (wq_overflow(&qp->rq, nreq, to_mcq(qp->ibv_qp.recv_cq))) {
@@ -363,7 +392,7 @@ int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 
 		qp->rq.wrid[ind] = wr->wr_id;
 
-		ind = (ind + 1) & (qp->rq.max - 1);
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
 	}
 
 out:
@@ -384,36 +413,17 @@ out:
 	return ret;
 }
 
-int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
-		       enum ibv_qp_type type, struct mlx4_qp *qp)
+void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
+			   struct mlx4_qp *qp)
 {
 	int size;
 	int max_sq_sge;
 
-	qp->rq.max_gs	 = cap->max_recv_sge;
 	max_sq_sge	 = align(cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg),
 				 sizeof (struct mlx4_wqe_data_seg)) / sizeof (struct mlx4_wqe_data_seg);
 	if (max_sq_sge < cap->max_send_sge)
 		max_sq_sge = cap->max_send_sge;
 
-	qp->sq.wrid = malloc(qp->sq.max * sizeof (uint64_t));
-	if (!qp->sq.wrid)
-		return -1;
-
-	if (qp->rq.max) {
-		qp->rq.wrid = malloc(qp->rq.max * sizeof (uint64_t));
-		if (!qp->rq.wrid) {
-			free(qp->sq.wrid);
-			return -1;
-		}
-	}
-
-	size = qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
-
-	for (qp->rq.wqe_shift = 4; 1 << qp->rq.wqe_shift < size;
-	     qp->rq.wqe_shift++)
-		; /* nothing */
-
 	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
 	switch (type) {
 	case IBV_QPT_UD:
@@ -451,14 +461,37 @@ int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
 	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
 	     qp->sq.wqe_shift++)
 		; /* nothing */
+}
+
+int mlx4_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
+		       enum ibv_qp_type type, struct mlx4_qp *qp)
+{
+	qp->rq.max_gs	 = cap->max_recv_sge;
+
+	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
+	if (!qp->sq.wrid)
+		return -1;
+
+	if (qp->rq.wqe_cnt) {
+		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
+		if (!qp->rq.wrid) {
+			free(qp->sq.wrid);
+			return -1;
+		}
+	}
+
+	for (qp->rq.wqe_shift = 4;
+	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
+	     qp->rq.wqe_shift++)
+		; /* nothing */
 
-	qp->buf_size = (qp->rq.max << qp->rq.wqe_shift) +
-		(qp->sq.max << qp->sq.wqe_shift);
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
 	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
 		qp->rq.offset = 0;
-		qp->sq.offset = qp->rq.max << qp->rq.wqe_shift;
+		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 	} else {
-		qp->rq.offset = qp->sq.max << qp->sq.wqe_shift;
+		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
 		qp->sq.offset = 0;
 	}
 
@@ -499,6 +532,8 @@ void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
 	cap->max_send_sge    = qp->sq.max_gs;
 	qp->max_inline_data  = wqe_size - sizeof (struct mlx4_wqe_inline_seg);
 	cap->max_inline_data = qp->max_inline_data;
+	qp->sq.max_post	     = qp->sq.wqe_cnt - qp->sq_spare_wqes;
+	cap->max_send_wr     = qp->sq.max_post;
 }
 
 struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
diff --git a/src/verbs.c b/src/verbs.c
index 52ca0c8..2243b6c 100644
--- a/src/verbs.c
+++ b/src/verbs.c
@@ -355,11 +355,18 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 	if (!qp)
 		return NULL;
 
-	qp->sq.max = align_queue_size(attr->cap.max_send_wr);
-	qp->rq.max = align_queue_size(attr->cap.max_recv_wr);
+	mlx4_calc_sq_wqe_size(&attr->cap, attr->qp_type, qp);
+
+	/*
+	 * We need to leave 2 KB + 1 WQE of headroom in the SQ to
+	 * allow HW to prefetch.
+	 */
+	qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1;
+	qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr + qp->sq_spare_wqes);
+	qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr);
 
 	if (attr->srq)
-		attr->cap.max_recv_wr = qp->rq.max = 0;
+		attr->cap.max_recv_wr = qp->rq.wqe_cnt = 0;
 	else if (attr->cap.max_recv_sge < 1)
 		attr->cap.max_recv_sge = 1;
 
@@ -387,9 +394,10 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 		cmd.db_addr = (uintptr_t) qp->db;
 	cmd.log_sq_stride   = qp->sq.wqe_shift;
 	for (cmd.log_sq_bb_count = 0;
-	     qp->sq.max > 1 << cmd.log_sq_bb_count;
+	     qp->sq.wqe_cnt > 1 << cmd.log_sq_bb_count;
 	     ++cmd.log_sq_bb_count)
 		; /* nothing */
+	cmd.sq_no_prefetch = 0;	/* OK for ABI 2: just a reserved field */
 	memset(cmd.reserved, 0, sizeof cmd.reserved);
 
 	ret = ibv_cmd_create_qp(pd, &qp->ibv_qp, attr, &cmd.ibv_cmd, sizeof cmd,
@@ -401,8 +409,8 @@ struct ibv_qp *mlx4_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
 	if (ret)
 		goto err_destroy;
 
-	qp->rq.max    = attr->cap.max_recv_wr;
-	qp->rq.max_gs = attr->cap.max_recv_sge;
+	qp->rq.wqe_cnt = qp->rq.max_post = attr->cap.max_recv_wr;
+	qp->rq.max_gs  = attr->cap.max_recv_sge;
 	mlx4_set_sq_sizes(qp, &attr->cap, attr->qp_type);
 
 	qp->doorbell_qpn    = htonl(qp->ibv_qp.qp_num << 8);
@@ -422,7 +430,7 @@ err_rq_db:
 
 err_free:
 	free(qp->sq.wrid);
-	if (qp->rq.max)
+	if (qp->rq.wqe_cnt)
 		free(qp->rq.wrid);
 	mlx4_free_buf(&qp->buf);
 
@@ -527,7 +535,7 @@ int mlx4_destroy_qp(struct ibv_qp *ibqp)
 	if (!ibqp->srq)
 		mlx4_free_db(to_mctx(ibqp->context), MLX4_DB_TYPE_RQ, qp->db);
 	free(qp->sq.wrid);
-	if (qp->rq.max)
+	if (qp->rq.wqe_cnt)
 		free(qp->rq.wrid);
 	mlx4_free_buf(&qp->buf);
 	free(qp);


