[ofa-general] Draft patch to address bugzilla bug#728
Pradeep Satyanarayana
pradeeps at linux.vnet.ibm.com
Thu Oct 11 16:41:46 PDT 2007
This is a draft patch to address the following bug:
https://bugs.openfabrics.org/show_bug.cgi?id=728
There are still a few debug prints and the like that need
to be cleaned up, and a few error conditions still need to be
addressed; please ignore those for now. I have done some minimal
test runs on both mthca and ehca, and it seems to work.
I have not yet updated the non-SRQ code path in this patch.
I wanted to post this patch for comments before proceeding
further.
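In rough terms, the sizing logic the patch introduces boils down to
something like the sketch below (simplified; ipoib_cm_compute_frag_sizes()
is only an illustrative helper, not part of the patch, and it relies on
the num_frags/fragment_size/order globals and the CM_PACKET_SIZE macro
the patch adds):

/*
 * Clamp the SGE count to the largest power of two the SRQ supports,
 * then derive the per-fragment size, the number of fragments per CM
 * receive, and the page order to hand to alloc_pages().
 */
static void ipoib_cm_compute_frag_sizes(int device_max_srq_sge, int srq_max_sge)
{
	int max_sge_supported;

	/* largest power of two <= the max_sge the SRQ reports */
	max_sge_supported = roundup_pow_of_two(min(device_max_srq_sge, srq_max_sge));
	if (max_sge_supported != srq_max_sge)
		max_sge_supported >>= 1;

	if (IPOIB_CM_RX_SG >= max_sge_supported) {
		/* device supports fewer SGEs: use fewer but larger fragments */
		fragment_size = CM_PACKET_SIZE / max_sge_supported;
		num_frags = CM_PACKET_SIZE / fragment_size;
	} else {
		/* device supports more SGEs than we need: keep the old count */
		fragment_size = CM_PACKET_SIZE / IPOIB_CM_RX_SG;
		num_frags = IPOIB_CM_RX_SG;
	}
	order = get_order(fragment_size);
}

For example, assuming 4 KB pages and a 64 KB CM packet size, a device
limited to 16 SRQ SGEs ends up with 4 KB fragments (order 0), while one
limited to 8 ends up with 8 KB fragments (order 1).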
While working on this, I observed that for mthca the max_srq_sge
returned by ib_query_device() is not equal to the max_sge returned
by ib_query_srq(). Why is that?
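For reference, the mismatch shows up with nothing more than the two
queries back to back; a minimal sketch (ipoib_cm_dump_srq_sge() is only
an illustrative name, and it assumes priv->ca and priv->pd are already
set up as in ipoib_cm_dev_init()):

static void ipoib_cm_dump_srq_sge(struct ipoib_dev_priv *priv)
{
	struct ib_device_attr dev_attr;
	struct ib_srq_attr srq_attr;
	struct ib_srq_init_attr init_attr = {
		.attr = {
			.max_wr = ipoib_recvq_size,
		}
	};
	struct ib_srq *srq;

	if (ib_query_device(priv->ca, &dev_attr))
		return;

	/* ask the SRQ for everything the device claims to support */
	init_attr.attr.max_sge = dev_attr.max_srq_sge;

	srq = ib_create_srq(priv->pd, &init_attr);
	if (IS_ERR(srq))
		return;

	/* on mthca the two values were observed to differ */
	if (!ib_query_srq(srq, &srq_attr))
		printk(KERN_INFO "device max_srq_sge=%d, srq max_sge=%u\n",
		       dev_attr.max_srq_sge, srq_attr.max_sge);

	ib_destroy_srq(srq);
}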
Signed-off-by: Pradeep Satyanarayana <pradeeps at linux.vnet.ibm.com>
---
--- a/linux-2.6.23-rc7/drivers/infiniband/ulp/ipoib/ipoib.h 2007-10-03 12:01:58.000000000 -0500
+++ b/linux-2.6.23-rc7/drivers/infiniband/ulp/ipoib/ipoib.h 2007-10-10 20:31:43.000000000 -0500
@@ -212,7 +212,7 @@ struct ipoib_cm_tx {
struct ipoib_cm_rx_buf {
struct sk_buff *skb;
- u64 mapping[IPOIB_CM_RX_SG];
+ u64 *mapping;
};
struct ipoib_cm_dev_priv {
--- a/linux-2.6.23-rc7/drivers/infiniband/ulp/ipoib/ipoib_cm.c 2007-07-31 12:14:30.000000000 -0500
+++ b/linux-2.6.23-rc7/drivers/infiniband/ulp/ipoib/ipoib_cm.c 2007-10-11 17:55:35.000000000 -0500
@@ -61,24 +61,27 @@ static struct ib_qp_attr ipoib_cm_err_at
};
#define IPOIB_CM_RX_DRAIN_WRID 0x7fffffff
+#define CM_PACKET_SIZE (ALIGN(IPOIB_CM_MTU, PAGE_SIZE))
static struct ib_send_wr ipoib_cm_rx_drain_wr = {
.wr_id = IPOIB_CM_RX_DRAIN_WRID,
.opcode = IB_WR_SEND,
};
+static int num_frags, order, fragment_size;
+
static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
struct ib_cm_event *event);
static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
- u64 mapping[IPOIB_CM_RX_SG])
+ u64 *mapping)
{
int i;
ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
for (i = 0; i < frags; ++i)
- ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+ ib_dma_unmap_single(priv->ca, mapping[i + 1], fragment_size, DMA_FROM_DEVICE);
}
static int ipoib_cm_post_receive(struct net_device *dev, int id)
@@ -89,13 +92,13 @@ static int ipoib_cm_post_receive(struct
priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
- for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+ for (i = 0; i < num_frags; ++i)
priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
if (unlikely(ret)) {
ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
- ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+ ipoib_cm_dma_unmap_rx(priv, num_frags - 1,
priv->cm.srq_ring[id].mapping);
dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
priv->cm.srq_ring[id].skb = NULL;
@@ -105,7 +108,7 @@ static int ipoib_cm_post_receive(struct
}
static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, int id, int frags,
- u64 mapping[IPOIB_CM_RX_SG])
+ u64 *mapping)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct sk_buff *skb;
@@ -129,7 +132,7 @@ static struct sk_buff *ipoib_cm_alloc_rx
}
for (i = 0; i < frags; i++) {
- struct page *page = alloc_page(GFP_ATOMIC);
+ struct page *page = alloc_pages(GFP_ATOMIC, order);
if (!page)
goto partial_error;
@@ -405,12 +408,15 @@ void ipoib_cm_handle_rx_wc(struct net_de
struct sk_buff *skb, *newskb;
struct ipoib_cm_rx *p;
unsigned long flags;
- u64 mapping[IPOIB_CM_RX_SG];
+ u64 *mapping;
int frags;
ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
wr_id, wc->status);
+ /* What happens if this fails, look at the kfree() also */
+ mapping = kzalloc(num_frags * sizeof(u64), GFP_ATOMIC);
+
if (unlikely(wr_id >= ipoib_recvq_size)) {
if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
spin_lock_irqsave(&priv->lock, flags);
@@ -448,7 +454,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
}
frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
- (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
+ (unsigned)IPOIB_CM_HEAD_SIZE)) / fragment_size;
newskb = ipoib_cm_alloc_rx_skb(dev, wr_id, frags, mapping);
if (unlikely(!newskb)) {
@@ -486,6 +492,7 @@ repost:
if (unlikely(ipoib_cm_post_receive(dev, wr_id)))
ipoib_warn(priv, "ipoib_cm_post_receive failed "
"for buf %d\n", wr_id);
+ kfree(mapping);
}
static inline int post_send(struct ipoib_dev_priv *priv,
@@ -1281,10 +1288,11 @@ int ipoib_cm_dev_init(struct net_device
struct ib_srq_init_attr srq_init_attr = {
.attr = {
.max_wr = ipoib_recvq_size,
- .max_sge = IPOIB_CM_RX_SG
}
};
- int ret, i;
+ int ret, i, j, max_sge_supported;
+ struct ib_srq_attr srq_attr;
+ struct ib_device_attr attr;
INIT_LIST_HEAD(&priv->cm.passive_ids);
INIT_LIST_HEAD(&priv->cm.reap_list);
@@ -1301,13 +1309,48 @@ int ipoib_cm_dev_init(struct net_device
skb_queue_head_init(&priv->cm.skb_queue);
+ ret = ib_query_device(priv->ca, &attr);
+ if (ret)
+ return ret;
+
+ printk(KERN_WARNING "max_srq_sge=%d\n", attr.max_srq_sge);
+
+ srq_init_attr.attr.max_sge = attr.max_srq_sge;
+
priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
if (IS_ERR(priv->cm.srq)) {
+ printk(KERN_WARNING "ib_create_srq() failed!!!\n");
ret = PTR_ERR(priv->cm.srq);
priv->cm.srq = NULL;
return ret;
}
+ ret = ib_query_srq(priv->cm.srq, &srq_attr);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_srq() failed with %d\n", ret);
+ return -EINVAL;
+ }
+
+ /* We want max_sge_supported to be a power of 2, but
+ * <= srq_attr.max_sge
+ */
+ printk(KERN_WARNING "srq_attr.max_sge=%d\n", srq_attr.max_sge);
+ max_sge_supported = roundup_pow_of_two(min((u32)(attr.max_srq_sge), srq_attr.max_sge));
+ if (max_sge_supported != srq_attr.max_sge)
+ max_sge_supported = max_sge_supported >> 1;
+
+ if (IPOIB_CM_RX_SG >= max_sge_supported) {
+ fragment_size = CM_PACKET_SIZE/max_sge_supported;
+ num_frags = CM_PACKET_SIZE/fragment_size;
+ } else {
+ fragment_size = CM_PACKET_SIZE/IPOIB_CM_RX_SG;
+ num_frags = IPOIB_CM_RX_SG;
+ }
+ order = get_order(fragment_size);
+ printk(KERN_WARNING "Computed values of order=%d, max_sge_supported=%d,"
+ " fragment_size=0x%x, num_frags=%d\n", order, max_sge_supported,
+ fragment_size, num_frags);
+
priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring,
GFP_KERNEL);
if (!priv->cm.srq_ring) {
@@ -1317,18 +1360,32 @@ int ipoib_cm_dev_init(struct net_device
return -ENOMEM;
}
- for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ priv->cm.srq_ring[i].mapping = kzalloc(num_frags * sizeof(u64),
+ GFP_KERNEL);
+ if (!priv->cm.srq_ring[i].mapping) {
+ printk(KERN_WARNING "%s: failed to allocate mapping for srq_ring\n",
+ priv->ca->name);
+ for (j = 0; j < i; j++)
+ kfree(priv->cm.srq_ring[j].mapping);
+ kfree(priv->cm.srq_ring);
+ ipoib_cm_dev_cleanup(dev);
+ return -ENOMEM;
+ }
+ }
+
+ for (i = 0; i < num_frags; ++i)
priv->cm.rx_sge[i].lkey = priv->mr->lkey;
priv->cm.rx_sge[0].length = IPOIB_CM_HEAD_SIZE;
- for (i = 1; i < IPOIB_CM_RX_SG; ++i)
- priv->cm.rx_sge[i].length = PAGE_SIZE;
+ for (i = 1; i < num_frags; ++i)
+ priv->cm.rx_sge[i].length = fragment_size;
priv->cm.rx_wr.next = NULL;
priv->cm.rx_wr.sg_list = priv->cm.rx_sge;
- priv->cm.rx_wr.num_sge = IPOIB_CM_RX_SG;
+ priv->cm.rx_wr.num_sge = num_frags;
for (i = 0; i < ipoib_recvq_size; ++i) {
- if (!ipoib_cm_alloc_rx_skb(dev, i, IPOIB_CM_RX_SG - 1,
+ if (!ipoib_cm_alloc_rx_skb(dev, i, num_frags - 1,
priv->cm.srq_ring[i].mapping)) {
ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
ipoib_cm_dev_cleanup(dev);
@@ -1362,13 +1419,15 @@ void ipoib_cm_dev_cleanup(struct net_dev
priv->cm.srq = NULL;
if (!priv->cm.srq_ring)
return;
- for (i = 0; i < ipoib_recvq_size; ++i)
+ for (i = 0; i < ipoib_recvq_size; ++i) {
if (priv->cm.srq_ring[i].skb) {
ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
priv->cm.srq_ring[i].mapping);
dev_kfree_skb_any(priv->cm.srq_ring[i].skb);
priv->cm.srq_ring[i].skb = NULL;
}
+ kfree(priv->cm.srq_ring[i].mapping);
+ }
kfree(priv->cm.srq_ring);
priv->cm.srq_ring = NULL;
}