[ofa-general] [PATCH] userspace/libmthca: Support forwarding of packets in router mode

swelch at systemfabricworks.com
Fri Aug 31 17:07:27 PDT 2007



   This patch enables forwarding of complete packets (all headers and
   payload) from user space when the mthca device is operating in router
   mode.  Since a given device either operates in router mode or it does
   not, the library determines the mode once at context allocation and
   overrides the default post-send routines when router mode is active.
   Overriding the routines was chosen over checking the mode on every post
   because the majority of users will never operate in router mode and
   should not pay that per-post overhead.  Selection of the routines
   depends on the previous libibverbs patch that initializes the existing
   ibv_device->node_type field.

   The post-send routines expect the first entry of the sg list to point
   to the packet's LRH and the last 4 bytes of the list to be the packet's
   existing ICRC.
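
   For illustration only, a router-mode consumer might hand such a packet
   back to the hardware roughly as sketched below.  This is a minimal
   sketch against the standard libibverbs API, not part of this patch; the
   helper name forward_packet and the pkt_buf/pkt_len/lkey/qp arguments
   are assumptions about the caller's setup:

	#include <stdint.h>
	#include <stdio.h>
	#include <infiniband/verbs.h>

	/* Post one previously received raw packet on a router-mode UD QP.
	 * pkt_buf holds the complete packet, LRH first, with the 4-byte
	 * ICRC in the last 4 bytes; it lies inside a memory region
	 * registered with local key lkey. */
	static int forward_packet(struct ibv_qp *qp, void *pkt_buf,
				  uint32_t pkt_len, uint32_t lkey)
	{
		struct ibv_sge sge = {
			.addr   = (uintptr_t) pkt_buf,
			.length = pkt_len,		/* LRH through ICRC */
			.lkey   = lkey,
		};
		struct ibv_send_wr wr = {
			.wr_id      = (uintptr_t) pkt_buf,
			.sg_list    = &sge,
			.num_sge    = 1,
			.opcode     = IBV_WR_SEND,
			.send_flags = IBV_SEND_SIGNALED,
		};
		struct ibv_send_wr *bad_wr;

		if (ibv_post_send(qp, &wr, &bad_wr)) {
			fprintf(stderr, "raw packet post failed\n");
			return -1;
		}
		return 0;
	}

   Note that the router post-send routines build the MLX segment from the
   packet's own LRH, so no address handle is attached to the work request.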

   These routines could also be ported to the mthca kernel driver if a
   kernel module consumer is desirable; that did not seem like a good fit
   at this time.

Signed-off-by: Steve Welch <swelch at systemfabricworks.com>
---
 src/mthca.c |   10 ++-
 src/mthca.h |    4 +
 src/qp.c    |  393 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/wqe.h   |   20 +++
 4 files changed, 425 insertions(+), 2 deletions(-)

diff --git a/src/mthca.c b/src/mthca.c
index 0f7e953..6acd78e 100644
--- a/src/mthca.c
+++ b/src/mthca.c
@@ -187,13 +187,19 @@ static struct ibv_context *mthca_alloc_context(struct ibv_device *ibdev, int cmd
 	if (mthca_is_memfree(&context->ibv_ctx)) {
 		context->ibv_ctx.ops.req_notify_cq = mthca_arbel_arm_cq;
 		context->ibv_ctx.ops.cq_event      = mthca_arbel_cq_event;
-		context->ibv_ctx.ops.post_send     = mthca_arbel_post_send;
+		context->ibv_ctx.ops.post_send     =
+				ibdev->node_type == IBV_NODE_ROUTER ?
+					mthca_arbel_router_post_send :
+					mthca_arbel_post_send;
 		context->ibv_ctx.ops.post_recv     = mthca_arbel_post_recv;
 		context->ibv_ctx.ops.post_srq_recv = mthca_arbel_post_srq_recv;
 	} else {
 		context->ibv_ctx.ops.req_notify_cq = mthca_tavor_arm_cq;
 		context->ibv_ctx.ops.cq_event      = NULL;
-		context->ibv_ctx.ops.post_send     = mthca_tavor_post_send;
+		context->ibv_ctx.ops.post_send     =
+				ibdev->node_type == IBV_NODE_ROUTER ?
+					mthca_tavor_router_post_send :
+					mthca_tavor_post_send;
 		context->ibv_ctx.ops.post_recv     = mthca_tavor_post_recv;
 		context->ibv_ctx.ops.post_srq_recv = mthca_tavor_post_srq_recv;
 	}
diff --git a/src/mthca.h b/src/mthca.h
index 1f31bc3..eb264d1 100644
--- a/src/mthca.h
+++ b/src/mthca.h
@@ -354,10 +354,14 @@ int mthca_destroy_qp(struct ibv_qp *qp);
 void mthca_init_qp_indices(struct mthca_qp *qp);
 int mthca_tavor_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			  struct ibv_send_wr **bad_wr);
+int mthca_tavor_router_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+				 struct ibv_send_wr **bad_wr);
 int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			  struct ibv_recv_wr **bad_wr);
 int mthca_arbel_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
 			  struct ibv_send_wr **bad_wr);
+int mthca_arbel_router_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+				 struct ibv_send_wr **bad_wr);
 int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			  struct ibv_recv_wr **bad_wr);
 int mthca_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap,
diff --git a/src/qp.c b/src/qp.c
index 2ea9dc0..1617c08 100644
--- a/src/qp.c
+++ b/src/qp.c
@@ -331,6 +331,184 @@ out:
 	return ret;
 }
 
+int mthca_tavor_router_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+				 struct ibv_send_wr **bad_wr)
+{
+	struct mthca_qp *qp = to_mqp(ibqp);
+	void *wqe, *prev_wqe;
+	int ind;
+	int nreq;
+	int ret = 0;
+	int size;
+	int size0 = 0;
+	int i;
+	struct mthca_mlx_seg *mlx;
+	struct mthca_lrh *lrh;
+
+	/*
+	 * f0 and op0 cannot be used unless nreq > 0, which means this
+	 * function makes it through the loop at least once.  So the
+	 * code inside the if (!size0) will be executed, and f0 and
+	 * op0 will be initialized.  So any gcc warning about "may be
+	 * used uninitialized" is bogus.
+	 */
+	uint32_t f0;
+	uint32_t op0;
+
+	pthread_spin_lock(&qp->sq.lock);
+
+	ind = qp->sq.next_ind;
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->nda_op = 0;
+		((struct mthca_next_seg *) wqe)->ee_nds = 0;
+		((struct mthca_next_seg *) wqe)->flags =
+			((wr->send_flags & IBV_SEND_SIGNALED) ?
+			htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
+			((wr->send_flags & IBV_SEND_SOLICITED) ?
+			htonl(MTHCA_NEXT_SOLICIT) : 0)   |
+			htonl(1);
+
+		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+			((struct mthca_next_seg *) wqe)->imm = wr->imm_data;
+
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		/*
+		 * In router mode all QPs appear to the user as UD; the actual
+		 * implementation uses the MTHCA-specific MLX transport.
+		 */
+		if (ibqp->qp_type != IBV_QPT_UD) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		/*
+		 * The first sge address must point to a contiguous LRH; the
+		 * last 4 bytes of the packet must be the existing ICRC.
+		 */
+		mlx = wqe - sizeof(struct mthca_next_seg);
+		lrh = (struct mthca_lrh *)wr->sg_list[0].addr;
+
+		/* CQE, do not calculate ICRC, use the packet's SLID */
+		mlx->flags = htonl((1 << 16) | (3 << 4) | (1 << 3));
+
+		if (MTHCA_LRH_GET_VL(lrh) == 15)
+			mlx->flags |= htonl(1 << 17);
+
+		mlx->flags |= htonl(MTHCA_LRH_GET_SL(lrh) << 8);
+		mlx->flags2 = htonl(MTHCA_LRH_GET_DLID(lrh) << 16);
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (wr->send_flags & IBV_SEND_INLINE) {
+			if (wr->num_sge) {
+				struct mthca_inline_seg *seg = wqe;
+				int s = 0;
+
+				wqe += sizeof *seg;
+				for (i = 0; i < wr->num_sge; ++i) {
+					struct ibv_sge *sge = &wr->sg_list[i];
+
+					s += sge->length;
+
+					if (s > qp->max_inline_data) {
+						ret = -1;
+						*bad_wr = wr;
+						goto out;
+					}
+
+					memcpy(wqe, (void *) (uintptr_t) sge->addr,
+					       sge->length);
+					wqe += sge->length;
+				}
+
+				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
+				size += align(s + sizeof *seg, 16) / 16;
+			}
+		} else {
+			struct mthca_data_seg *seg;
+
+			for (i = 0; i < wr->num_sge; ++i) {
+				seg = wqe;
+				seg->byte_count = htonl(wr->sg_list[i].length);
+				seg->lkey = htonl(wr->sg_list[i].lkey);
+				seg->addr = htonll(wr->sg_list[i].addr);
+				wqe += sizeof *seg;
+			}
+
+			size += wr->num_sge * (sizeof *seg / 16);
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= sizeof mthca_opcode / 
+		    sizeof mthca_opcode[0]) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			htonl(((ind << qp->sq.wqe_shift) +
+				qp->send_wqe_offset) |
+				mthca_opcode[wr->opcode]);
+		/*
+		 * Make sure that nda_op is written before setting ee_nds.
+		 */
+		wmb();
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			htonl((size0 ? 0 : MTHCA_NEXT_DBD) | size |
+			((wr->send_flags & IBV_SEND_FENCE) ?
+			 MTHCA_NEXT_FENCE : 0));
+
+		if (!size0) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IBV_SEND_FENCE ?
+				MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (ind >= qp->sq.max)
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (nreq) {
+		uint32_t doorbell[2];
+
+		doorbell[0] = htonl(((qp->sq.next_ind << qp->sq.wqe_shift) +
+				      qp->send_wqe_offset) | f0 | op0);
+		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
+
+		mthca_write64(doorbell, to_mctx(ibqp->context), MTHCA_SEND_DOORBELL);
+	}
+
+	qp->sq.next_ind = ind;
+	qp->sq.head    += nreq;
+
+	pthread_spin_unlock(&qp->sq.lock);
+	return ret;
+}
+
 int mthca_tavor_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			  struct ibv_recv_wr **bad_wr)
 {
@@ -707,6 +885,221 @@ out:
 	return ret;
 }
 
+int mthca_arbel_router_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
+				 struct ibv_send_wr **bad_wr)
+{
+	struct mthca_qp *qp = to_mqp(ibqp);
+	uint32_t doorbell[2];
+	void *wqe, *prev_wqe;
+	int ind;
+	int nreq;
+	int ret = 0;
+	int size;
+	int size0 = 0;
+	int i;
+	struct mthca_mlx_seg *mlx;
+	struct mthca_lrh *lrh;
+
+	/*
+	 * f0 and op0 cannot be used unless nreq > 0, which means this
+	 * function makes it through the loop at least once.  So the
+	 * code inside the if (!size0) will be executed, and f0 and
+	 * op0 will be initialized.  So any gcc warning about "may be
+	 * used uninitialized" is bogus.
+	 */
+	uint32_t f0;
+	uint32_t op0;
+
+	pthread_spin_lock(&qp->sq.lock);
+
+	/* XXX check that state is OK to post send */
+
+	ind = qp->sq.head & (qp->sq.max - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB) {
+			nreq = 0;
+
+			doorbell[0] = 
+				htonl((MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+				    ((qp->sq.head & 0xffff) << 8) | f0 | op0);
+			doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
+
+			qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
+
+			/*
+			 * Make sure that descriptors are written before
+			 * doorbell record.
+			 */
+			wmb();
+			*qp->sq.db = htonl(qp->sq.head & 0xffff);
+
+			/*
+			 * Make sure doorbell record is written before we
+			 * write MMIO send doorbell.
+			 */
+			wmb();
+			mthca_write64(doorbell, to_mctx(ibqp->context), 
+				      MTHCA_SEND_DOORBELL);
+
+			size0 = 0;
+		}
+
+		if (wq_overflow(&qp->sq, nreq, to_mcq(qp->ibv_qp.send_cq))) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		wqe = get_send_wqe(qp, ind);
+		prev_wqe = qp->sq.last;
+		qp->sq.last = wqe;
+
+		((struct mthca_next_seg *) wqe)->flags =
+				((wr->send_flags & IBV_SEND_SIGNALED) ?
+				 htonl(MTHCA_NEXT_CQ_UPDATE) : 0) |
+				((wr->send_flags & IBV_SEND_SOLICITED) ?
+				 htonl(MTHCA_NEXT_SOLICIT) : 0)   |
+				htonl(1);
+		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
+		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
+				((struct mthca_next_seg *) wqe)->imm = 
+						wr->imm_data;
+		wqe += sizeof (struct mthca_next_seg);
+		size = sizeof (struct mthca_next_seg) / 16;
+
+		/*
+		 * In router mode all QPs appear to the user as UD; the actual
+		 * implementation uses the MTHCA-specific MLX transport.
+		 */
+		if (ibqp->qp_type != IBV_QPT_UD) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		/*
+		 * The first sge address must point to a contiguous LRH; the
+		 * last 4 bytes of the packet must be the existing ICRC.
+		 */
+		mlx = wqe - sizeof(struct mthca_next_seg);
+		lrh = (struct mthca_lrh *)wr->sg_list[0].addr;
+
+		/* CQE, do not calculate ICRC, use the packet's SLID */
+		mlx->flags = htonl((1 << 16) | (3 << 4) | (1 << 3));
+
+		if (MTHCA_LRH_GET_VL(lrh) == 15)
+			mlx->flags |= htonl(1 << 17);
+
+		mlx->flags |= htonl(MTHCA_LRH_GET_SL(lrh) << 8);	/* SL */
+		mlx->flags2 = htonl(MTHCA_LRH_GET_DLID(lrh) << 16);
+
+		if (wr->num_sge > qp->sq.max_gs) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (wr->send_flags & IBV_SEND_INLINE) {
+			if (wr->num_sge) {
+				struct mthca_inline_seg *seg = wqe;
+				int s = 0;
+
+				wqe += sizeof *seg;
+				for (i = 0; i < wr->num_sge; ++i) {
+					struct ibv_sge *sge = &wr->sg_list[i];
+
+					s += sge->length;
+
+					if (s > qp->max_inline_data) {
+						ret = -1;
+						*bad_wr = wr;
+						goto out;
+					}
+
+					memcpy(wqe, 
+					       (void *) (uintptr_t) sge->addr,
+					       sge->length);
+					wqe += sge->length;
+				}
+
+				seg->byte_count = htonl(MTHCA_INLINE_SEG | s);
+				size += align(s + sizeof *seg, 16) / 16;
+			}
+		} else {
+			struct mthca_data_seg *seg;
+
+			for (i = 0; i < wr->num_sge; ++i) {
+				seg = wqe;
+				seg->byte_count = htonl(wr->sg_list[i].length);
+				seg->lkey = htonl(wr->sg_list[i].lkey);
+				seg->addr = htonll(wr->sg_list[i].addr);
+				wqe += sizeof *seg;
+			}
+
+			size += wr->num_sge * (sizeof *seg / 16);
+		}
+
+		qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+		if (wr->opcode >= sizeof mthca_opcode / 
+		    sizeof mthca_opcode[0]) {
+			ret = -1;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		((struct mthca_next_seg *) prev_wqe)->nda_op =
+			htonl(((ind << qp->sq.wqe_shift) +
+			       qp->send_wqe_offset) |
+			      mthca_opcode[wr->opcode]);
+		wmb();
+		((struct mthca_next_seg *) prev_wqe)->ee_nds =
+			htonl(MTHCA_NEXT_DBD | size |
+			      ((wr->send_flags & IBV_SEND_FENCE) ?
+			       MTHCA_NEXT_FENCE : 0));
+
+		if (!size0) {
+			size0 = size;
+			op0   = mthca_opcode[wr->opcode];
+			f0    = wr->send_flags & IBV_SEND_FENCE ?
+			        MTHCA_SEND_DOORBELL_FENCE : 0;
+		}
+
+		++ind;
+		if (ind >= qp->sq.max)
+			ind -= qp->sq.max;
+	}
+
+out:
+	if (nreq) {
+		doorbell[0] = htonl((nreq << 24)                  |
+				    ((qp->sq.head & 0xffff) << 8) |
+				    f0 | op0);
+		doorbell[1] = htonl((ibqp->qp_num << 8) | size0);
+
+		qp->sq.head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+		*qp->sq.db = htonl(qp->sq.head & 0xffff);
+
+		/*
+		 * Make sure doorbell record is written before we
+		 * write MMIO send doorbell.
+		 */
+		wmb();
+		mthca_write64(doorbell, to_mctx(ibqp->context), 
+			      MTHCA_SEND_DOORBELL);
+	}
+
+	pthread_spin_unlock(&qp->sq.lock);
+	return ret;
+}
+
 int mthca_arbel_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
 			  struct ibv_recv_wr **bad_wr)
 {
diff --git a/src/wqe.h b/src/wqe.h
index 602f512..4db7226 100644
--- a/src/wqe.h
+++ b/src/wqe.h
@@ -111,4 +111,24 @@ struct mthca_inline_seg {
 	uint32_t	byte_count;
 };
 
+struct mthca_mlx_seg {
+	uint32_t        nda_op;
+	uint32_t        nds;
+	uint32_t        flags;   /* [17]vl15, [16]slr, [14:12] static rate */
+	                         /* [8] sl, [5:4] icrc, [3] c, [2] e */
+	uint32_t        flags2;  /* [31:16]DLID, [15:0] vcrc */
+};
+
+struct mthca_lrh {
+	uint32_t        data[2];
+} __attribute__((packed));
+
+#define MTHCA_LRH_GET_VL(x)       (ntohl((x)->data[0]) >> 28)
+#define MTHCA_LRH_GET_LVER(x)     ((ntohl((x)->data[0]) >> 24) & 0x000f)
+#define MTHCA_LRH_GET_SL(x)       ((ntohl((x)->data[0]) >> 20) & 0x000f)
+#define MTHCA_LRH_GET_LNH(x)      ((ntohl((x)->data[0]) >> 16) & 0x0003)
+#define MTHCA_LRH_GET_DLID(x)     (ntohl((x)->data[0]) & 0x0000FFFF)
+#define MTHCA_LRH_GET_PKTLEN(x)   ((ntohl((x)->data[1]) >> 16) & 0x000007FF)
+#define MTHCA_LRH_GET_SLID(x)     (ntohl((x)->data[1]) & 0x0000FFFF)
+
 #endif /* WQE_H */


