[ofa-general] [PATCH draft, untested] ehca srq emulation (for IPoIB CM)

Michael S. Tsirkin mst at dev.mellanox.co.il
Wed Jun 13 09:38:21 PDT 2007


Here's how I would go about emulating SRQ in ehca in software.  I knocked this
out in several hours, so it is completely untested (not even compiled,
which is why there are no Makefile bits), but it seemed the easiest way
to get across what I consider the right way to do it.
Note how this both has no overhead for HCAs with hardware srq
support and is smaller than nosrq patches.

The idea here is that you can emulate enough of the SRQ
interface in ehca to make IPoIB CM work without changes:
keep QPs on a list, and distribute posted WRs between them evenly.

This naturally does not solve the scalability problems
that IPoIB CM without SRQ would have, but at least it contains
them within ehca.

Another advantage of this approach: noSRQ issues are separated out, so we'll be
able to continue working on IPoIB CM without maintaining two code paths.

There are obvious optimizations that can be done (e.g. each wr is copied
twice on the data path, and we only need a unidirectional list of cqes ...).
Hopefully someone at IBM will look into this: I wanted to avoid touching
low-level code I don't understand and can't test, as much as possible.

Known bugs:
Last wqe reached event is missing in this implementation:
I've run out of time, and it's pretty trivial to add anyway,
by adding a per-QP counter of outstanding WRs.
We'll need a tasklet or a thread for the callback though:
is there a tasklet/thread that can be reused for this?

Caveats:
As an optimization, I used a bit in qp_token to signal SRQ presence.
No idea whether this works in practice in your hardware. If not,
another way to detect SRQ WC will have to be found.

Again, hopefully someone at IBM will look into this.

Signed-off-by: Michael S. Tsirkin <mst at dev.mellanox.co.il>

---

ehca_classes.h |    6 +
ehca_irq.c     |    2
ehca_iverbs.h  |    6 +
ehca_main.c    |    3
ehca_qp.c      |   14 ++-
ehca_reqs.c    |    3
ehca_srq.c     |  237 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ehca_uverbs.c  |    2
8 files changed, 269 insertions(+), 4 deletions(-)


diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index 1d286d3..e54bb82 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -281,6 +281,9 @@ extern spinlock_t hcall_lock;
 extern struct idr ehca_qp_idr;
 extern struct idr ehca_cq_idr;
 
+#define EHCA_QP_TOKEN_SRQ (1U << 31) /* flag bit: QP uses an SRQ */
+#define EHCA_QP_TOKEN(token) ((token) & ~EHCA_QP_TOKEN_SRQ) /* idr handle */
+
 extern int ehca_static_rate;
 extern int ehca_port_act_time;
 extern int ehca_use_hp_mr;
@@ -344,4 +347,7 @@ int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
 int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
 struct ehca_qp* ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
 
+int ehca_srq_handle_wc(struct ib_wc *wc, unsigned token);
+int ehca_srq_attach(struct ib_srq *srq, struct ib_qp *qp);
+
 #endif
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index 100329b..f3b078c 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -182,7 +182,7 @@ static void qp_event_callback(struct ehca_shca *shca,
 	u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
 
 	spin_lock_irqsave(&ehca_qp_idr_lock, flags);
-	qp = idr_find(&ehca_qp_idr, token);
+	qp = idr_find(&ehca_qp_idr, EHCA_QP_TOKEN(token));
 	spin_unlock_irqrestore(&ehca_qp_idr_lock, flags);
 
 
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
index 37e7fe0..0f530cc 100644
--- a/drivers/infiniband/hw/ehca/ehca_iverbs.h
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -178,4 +178,10 @@ void ehca_free_fw_ctrlblock(void *ptr);
 #define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
 #endif
 
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr);
+int ehca_destroy_srq(struct ib_srq *srq);
+int ehca_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *recv_wr,
+		       struct ib_recv_wr **bad_recv_wr);
+
 #endif
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index c3f99f3..bfab202 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -330,6 +330,9 @@ int ehca_init_device(struct ehca_shca *shca)
 	/* shca->ib_device.modify_ah	    = ehca_modify_ah;	    */
 	shca->ib_device.query_ah	    = ehca_query_ah;
 	shca->ib_device.destroy_ah	    = ehca_destroy_ah;
+	shca->ib_device.create_srq	    = ehca_create_srq;
+	shca->ib_device.destroy_srq	    = ehca_destroy_srq;
+	shca->ib_device.post_srq_recv	    = ehca_post_srq_recv;
 	shca->ib_device.create_qp	    = ehca_create_qp;
 	shca->ib_device.modify_qp	    = ehca_modify_qp;
 	shca->ib_device.query_qp	    = ehca_query_qp;
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index b5bc787..9a14e90 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -486,6 +486,9 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
 		goto create_qp_exit0;
 	}
 
+	if (init_attr->srq)
+		my_qp->token |= EHCA_QP_TOKEN_SRQ;
+
 	parms.servicetype = ibqptype2servicetype(init_attr->qp_type);
 	if (parms.servicetype < 0) {
 		ret = -EINVAL;
@@ -663,6 +666,13 @@ struct ib_qp *ehca_create_qp(struct ib_pd *pd,
 		}
 	}
 
+	/* ib_qp.srq is presumably not set yet here, so test init_attr->srq */
+	if (init_attr->srq) {
+		ret = ehca_srq_attach(init_attr->srq, &my_qp->ib_qp);
+		if (ret)
+			goto create_qp_exit3;
+	}
+
 	return &my_qp->ib_qp;
 
 create_qp_exit3:
@@ -674,7 +684,7 @@ create_qp_exit2:
 
 create_qp_exit1:
 	spin_lock_irqsave(&ehca_qp_idr_lock, flags);
-	idr_remove(&ehca_qp_idr, my_qp->token);
+	idr_remove(&ehca_qp_idr, EHCA_QP_TOKEN(my_qp->token));
 	spin_unlock_irqrestore(&ehca_qp_idr_lock, flags);
 
 create_qp_exit0:
@@ -1408,7 +1418,7 @@ int ehca_destroy_qp(struct ib_qp *ibqp)
 	}
 
 	spin_lock_irqsave(&ehca_qp_idr_lock, flags);
-	idr_remove(&ehca_qp_idr, my_qp->token);
+	idr_remove(&ehca_qp_idr, EHCA_QP_TOKEN(my_qp->token));
 	spin_unlock_irqrestore(&ehca_qp_idr_lock, flags);
 
 	h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index caec9de..b151c67 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -601,6 +601,9 @@ poll_cq_one_exit0:
 	if (cqe_count > 0)
 		hipz_update_feca(my_cq, cqe_count);
 
+	if ((wc->opcode & IB_WC_RECV) && (cqe->qp_token & EHCA_QP_TOKEN_SRQ))
+		ret = ehca_srq_handle_wc(wc, cqe->qp_token);
+
 	return ret;
 }
 
diff --git a/drivers/infiniband/hw/ehca/ehca_srq.c b/drivers/infiniband/hw/ehca/ehca_srq.c
new file mode 100644
index 0000000..1e1574a
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_srq.c
@@ -0,0 +1,237 @@
+/*
+ *  SRQ emulation for ehca.
+ *
+ *  Author: Michael S. Tsirkin <mst at mellanox.co.il>
+ *
+ *  Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ *  This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ *  BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <rdma/ib_verbs.h>
+#include "ehca_classes.h"
+
+/*
+ * Divisor applied to the SRQ size in ehca_srq_attach(): each QP that
+ * attaches claims at most (SRQ size / EHCA_QPS_PER_SRQ) slots.
+ */
+#define EHCA_QPS_PER_SRQ 16
+
+/*
+ * Bookkeeping slot that remembers which QP a receive belongs to.  Slots are
+ * moved between the free_cqes and polled_cqes lists of struct ehca_srq.
+ */
+struct ehca_srq_cqe {
+	struct list_head list;	/* link in one of the ehca_srq slot lists */
+	struct ib_qp *qp;	/* set by ehca_srq_attach()/ehca_srq_handle_wc() */
+};
+
+/*
+ * Software-emulated shared receive queue.
+ *
+ * Receive WRs posted by the consumer are copied into privately owned
+ * ib_recv_wr objects and chained on first_posted until a QP slot becomes
+ * available; ehca_srq_repost() then posts them on a real QP.  Completed or
+ * never-used WR objects are chained on first_polled for reuse.
+ */
+struct ehca_srq {
+	struct ib_srq ib_srq;		/* embedded verbs object (container_of anchor) */
+	struct ib_srq_attr attr;	/* limits requested at creation time */
+	spinlock_t lock;		/* guards the WR chains and slot lists below */
+
+	struct ib_recv_wr **wrs;	/* array of pointers to preallocated WRs */
+	struct ehca_srq_cqe *cqes;	/* array of slots, same length as wrs */
+
+	struct ib_recv_wr *first_polled; /* Polled or unused */
+	struct ib_recv_wr *first_posted; /* Posted on SRQ but not on QP */
+
+	struct list_head polled_cqes; /* Polled */
+	struct list_head free_cqes; /* Posted or unused */
+};
+
+/*
+ * ehca_srq_repost - push queued SRQ receives down to the attached QPs
+ * @srq: emulated SRQ
+ *
+ * For every slot on polled_cqes (a QP that can take one more receive) the
+ * head of the first_posted chain is posted on that slot's QP.  The wr_id
+ * handed to the QP is the address of the SRQ-owned WR so that
+ * ehca_srq_handle_wc() can find it again; the consumer's wr_id stays stored
+ * inside that WR.
+ *
+ * Stops at the first ib_post_recv() failure, leaving the WR queued.
+ * Returns 0 or the error from ib_post_recv().
+ */
+static int ehca_srq_repost(struct ehca_srq *srq)
+{
+	struct ib_recv_wr wr, *wrp, *bad_recv_wr;
+	struct ehca_srq_cqe *c, *n;	/* both must be pointers for the iterator */
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	list_for_each_entry_safe(c, n, &srq->polled_cqes, list) {
+		wrp = srq->first_posted;
+		if (!wrp)
+			break;
+
+		/* Post a single-WR copy; sg_list still points at our buffer. */
+		wr = *wrp;
+		wr.next = NULL;
+		wr.wr_id = (u64)(unsigned long)wrp;
+		rc = ib_post_recv(c->qp, &wr, &bad_recv_wr);
+		if (rc)
+			break;
+
+		srq->first_posted = wrp->next;
+		wrp->next = NULL;
+		/*
+		 * The slot is now "posted": return it to free_cqes so that
+		 * ehca_srq_handle_wc() can pick it up again.  A plain
+		 * list_del() would orphan it and eventually drain free_cqes.
+		 */
+		list_move(&c->list, &srq->free_cqes);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return rc;
+}
+
+/*
+ * ehca_srq_handle_wc - fix up a receive completion of an SRQ-attached QP
+ * @wc: work completion being returned to the consumer
+ * @token: qp_token taken from the hardware CQE (may carry the SRQ flag bit)
+ *
+ * Looks up the QP by token, restores the consumer's wr_id (the QP-level
+ * wr_id is the address of the SRQ-owned WR, see ehca_srq_repost()), puts
+ * that WR back on the first_polled chain for reuse and moves one slot from
+ * free_cqes to polled_cqes so that the QP gets a new receive on the next
+ * repost.
+ *
+ * Returns 0, or -EINVAL if no QP is registered under @token.
+ */
+int ehca_srq_handle_wc(struct ib_wc *wc, unsigned token)
+{
+	struct ehca_qp *qp;
+	struct ehca_srq *srq;
+	struct ehca_srq_cqe *cqe;
+	struct ib_recv_wr *wr;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ehca_qp_idr_lock, flags);
+	qp = idr_find(&ehca_qp_idr, EHCA_QP_TOKEN(token));
+	spin_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+	if (!qp)
+		return -EINVAL;
+
+	wc->qp = &qp->ib_qp;
+	srq = container_of(qp->ib_qp.srq, struct ehca_srq, ib_srq);
+
+	spin_lock_irqsave(&srq->lock, flags);
+	BUG_ON(list_empty(&srq->free_cqes));
+	cqe = list_entry(srq->free_cqes.next, struct ehca_srq_cqe, list);
+	cqe->qp = &qp->ib_qp;
+	list_move(&cqe->list, &srq->polled_cqes);
+
+	/* Translate the internal wr_id back to the one the consumer posted. */
+	wr = (struct ib_recv_wr *)(unsigned long)wc->wr_id;
+	wc->wr_id = wr->wr_id;
+
+	/* The WR object is free for reuse by ehca_post_srq_recv(). */
+	wr->next = srq->first_polled;
+	srq->first_polled = wr;
+	spin_unlock_irqrestore(&srq->lock, flags);
+	return 0;
+}
+
+/*
+ * ehca_post_srq_recv - queue receive WRs on the emulated SRQ
+ * @ib_srq: SRQ to post to
+ * @recv_wr: chain of receive work requests
+ * @bad_recv_wr: set to the first WR that could not be queued
+ *
+ * Each WR is copied (including its scatter list) into an SRQ-owned WR taken
+ * from the first_polled chain and pushed on first_posted; the WRs are then
+ * handed to QPs by ehca_srq_repost().
+ *
+ * Returns 0 once all WRs are queued, -ENOMEM when the SRQ has no unused WR
+ * left, or -EINVAL when a WR has more SGEs than the SRQ was created for.
+ * WRs preceding *bad_recv_wr stay queued.  As before, a failure to push
+ * queued WRs down to a QP is not reported here; they remain queued for the
+ * next repost.
+ */
+int ehca_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *recv_wr,
+		       struct ib_recv_wr **bad_recv_wr)
+{
+	struct ehca_srq *srq = container_of(ib_srq, struct ehca_srq, ib_srq);
+	struct ib_recv_wr *wr, *copy;
+	struct ib_sge *sg_list;
+	unsigned long flags;
+	int rc = 0;
+
+	/* The chains are also touched from ehca_srq_handle_wc(). */
+	spin_lock_irqsave(&srq->lock, flags);
+	for (wr = recv_wr; wr; wr = wr->next) {
+		/* The cast also rejects a negative num_sge. */
+		if ((u32)wr->num_sge > srq->attr.max_sge) {
+			rc = -EINVAL;
+			break;
+		}
+
+		copy = srq->first_polled;
+		if (!copy) {
+			rc = -ENOMEM;
+			break;
+		}
+		srq->first_polled = copy->next;
+
+		/*
+		 * Copy the WR but keep our own sg_list buffer: a blind
+		 * struct copy would leave us pointing at the caller's
+		 * scatter list, which may be gone once we return.
+		 */
+		sg_list = copy->sg_list;
+		*copy = *wr;
+		copy->sg_list = sg_list;
+		if (wr->num_sge)
+			memcpy(sg_list, wr->sg_list,
+			       wr->num_sge * sizeof *sg_list);
+
+		copy->next = srq->first_posted;
+		srq->first_posted = copy;
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	if (rc)
+		*bad_recv_wr = wr;
+
+	ehca_srq_repost(srq);
+	return rc;
+}
+
+/*
+ * ehca_srq_attach - reserve receive slots on an emulated SRQ for a QP
+ * @ib_srq: SRQ the QP is created with
+ * @qp: the new QP
+ *
+ * Moves up to max_wr / EHCA_QPS_PER_SRQ slots from free_cqes to the tail of
+ * polled_cqes, tagged with @qp, then tries to fill them with already queued
+ * receives.
+ *
+ * Returns -ENOMEM if not a single slot could be reserved, otherwise the
+ * result of ehca_srq_repost().
+ */
+int ehca_srq_attach(struct ib_srq *ib_srq, struct ib_qp *qp)
+{
+	struct ehca_srq *srq = container_of(ib_srq, struct ehca_srq, ib_srq);
+	struct ehca_srq_cqe *cqe;
+	int i;
+
+	spin_lock_irq(&srq->lock);
+	/* struct ib_srq_attr names the queue depth max_wr */
+	for (i = 0; i < srq->attr.max_wr / EHCA_QPS_PER_SRQ; ++i) {
+		if (list_empty(&srq->free_cqes))
+			break;
+		cqe = list_entry(srq->free_cqes.next, struct ehca_srq_cqe,
+				 list);
+		cqe->qp = qp;
+		list_move_tail(&cqe->list, &srq->polled_cqes);
+	}
+	spin_unlock_irq(&srq->lock);
+	if (!i)
+		return -ENOMEM;
+
+	return ehca_srq_repost(srq);
+}
+
+/*
+ * ehca_create_srq - allocate an emulated SRQ
+ * @pd: protection domain (not used by the emulation itself)
+ * @srq_init_attr: requested attributes; attr.max_wr and attr.max_sge size
+ *                 the preallocated WR pool
+ *
+ * Preallocates max_wr receive WRs, each with room for max_sge scatter
+ * entries, and max_wr slots.  All WRs start on the first_polled chain and
+ * all slots on free_cqes.
+ *
+ * Returns the embedded ib_srq, or ERR_PTR(-ENOMEM) with nothing left
+ * allocated.
+ */
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+			       struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ehca_srq *srq;
+	int i;
+
+	srq = kmalloc(sizeof *srq, GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	/* attr is a member of the init struct, not its first bytes */
+	srq->attr = srq_init_attr->attr;
+	spin_lock_init(&srq->lock);
+	INIT_LIST_HEAD(&srq->polled_cqes);
+	INIT_LIST_HEAD(&srq->free_cqes);
+	srq->first_posted = NULL;
+	srq->first_polled = NULL;
+
+	srq->wrs = kmalloc(sizeof *srq->wrs * srq->attr.max_wr, GFP_KERNEL);
+	srq->cqes = kmalloc(sizeof *srq->cqes * srq->attr.max_wr, GFP_KERNEL);
+	if (!srq->wrs || !srq->cqes)
+		goto err_arrays;
+
+	for (i = 0; i < srq->attr.max_wr; ++i) {
+		/* size of the pointee, not of the pointer */
+		srq->wrs[i] = kmalloc(sizeof *srq->wrs[i], GFP_KERNEL);
+		if (!srq->wrs[i])
+			goto err_wr;
+		srq->wrs[i]->sg_list = kmalloc(sizeof *srq->wrs[i]->sg_list *
+					       srq->attr.max_sge, GFP_KERNEL);
+		if (!srq->wrs[i]->sg_list) {
+			kfree(srq->wrs[i]);
+			goto err_wr;
+		}
+		list_add(&srq->cqes[i].list, &srq->free_cqes);
+		srq->wrs[i]->next = srq->first_polled;
+		srq->first_polled = srq->wrs[i];
+	}
+
+	return &srq->ib_srq;
+
+err_wr:
+	/* entries [0, i) are fully set up; entry i was cleaned up above */
+	while (--i >= 0) {
+		kfree(srq->wrs[i]->sg_list);
+		kfree(srq->wrs[i]);
+	}
+
+err_arrays:
+	kfree(srq->wrs);
+	kfree(srq->cqes);
+	kfree(srq);
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * ehca_destroy_srq - free an emulated SRQ
+ * @ib_srq: SRQ returned by ehca_create_srq()
+ *
+ * Releases every preallocated WR with its scatter list, the WR and slot
+ * arrays, and the SRQ object itself.  Always returns 0.
+ */
+int ehca_destroy_srq(struct ib_srq *ib_srq)
+{
+	struct ehca_srq *srq = container_of(ib_srq, struct ehca_srq, ib_srq);
+	int i;
+
+	for (i = 0; i < srq->attr.max_wr; ++i) {
+		kfree(srq->wrs[i]->sg_list);
+		kfree(srq->wrs[i]);
+	}
+	kfree(srq->wrs);
+	kfree(srq->cqes);
+	kfree(srq);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
index 73db920..a44354c 100644
--- a/drivers/infiniband/hw/ehca/ehca_uverbs.c
+++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c
@@ -289,7 +289,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 
 	case 2: /* QP */
 		spin_lock_irqsave(&ehca_qp_idr_lock, flags);
-		qp = idr_find(&ehca_qp_idr, idr_handle);
+		qp = idr_find(&ehca_qp_idr, EHCA_QP_TOKEN(idr_handle));
 		spin_unlock_irqrestore(&ehca_qp_idr_lock, flags);
 
 		/* make sure this mmap really belongs to the authorized user */
-- 
MST



More information about the general mailing list