[ofa-general] [PATCH][RFC] Handle packet received on RQ of tx_qp with NOSRQ

Pradeep Satyanarayana pradeeps at linux.vnet.ibm.com
Fri Aug 3 17:34:07 PDT 2007


This patch fixes the issue raised by Michael and Roland about receiving
a packet on the receive queue (RQ) of the tx_qp in the NOSRQ case. It is
handled by posting a single WR on the RQ and creating a separate CQ,
used only for NOSRQ, to service it. This situation can occur when IPoIB
CM interoperates with a non-Linux system.

This patch is to be applied on top of the previous series of NOSRQ
patches.

The bulk of this code is not exercised in the normal case and is hard
to test, so careful code review would help.
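
Note for reviewers: the hunks below use two fields of struct ipoib_cm_tx
(rcq and rx_buf) that no hunk in this diff adds; presumably they come
from the earlier NOSRQ series. A sketch of the assumed ipoib.h
declaration, with types taken from their use below:

	struct ipoib_cm_tx {
		...
		struct ib_cq        *rcq;    /* NOSRQ only: CQ for the RQ of tx_qp */
		struct ipoib_rx_buf  rx_buf; /* NOSRQ only: the single posted RQ buffer */
		...
	};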


Signed-off-by: Pradeep Satyanarayana <pradeeps at linux.vnet.ibm.com>
---

--- a/linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-08-03 19:00:23.000000000 -0400
+++ b/linux-2.6.23-rc1/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-08-03 19:34:02.000000000 -0400
@@ -149,6 +149,55 @@ static int post_receive_nosrq(struct net
 	return ret;
 }
 
+static int alloc_skb_and_post_rq(struct ipoib_cm_tx *tx, u64 id,
+				  struct ipoib_rx_buf *rx_buf)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+	struct ib_recv_wr rx_wr, *bad_wr;
+	struct ib_sge sg_list;
+	struct sk_buff *skb;
+	u64 addr;
+	int ret;
+
+	skb = dev_alloc_skb(CM_PACKET_SIZE + 12);
+	if (unlikely(!skb))
+		return -ENOMEM;
+
+	/*
+	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+	 * IP header to a multiple of 16.
+	 */
+	skb_reserve(skb, 12);
+
+	addr = ib_dma_map_single(priv->ca, skb->data, CM_PACKET_SIZE,
+				 DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+		dev_kfree_skb_any(skb);
+		return -EIO;
+	}
+
+	rx_buf->skb	= skb;
+	rx_buf->mapping	= addr;
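+
+	/* Build a one-WR, single-SGE receive covering the whole packet buffer */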
+	sg_list.addr	= addr;
+	sg_list.length	= CM_PACKET_SIZE;
+	sg_list.lkey	= priv->mr->lkey;
+
+	rx_wr.next	= NULL;
+	rx_wr.wr_id	= id | IPOIB_CM_OP_RECV;
+	rx_wr.sg_list	= &sg_list;
+	rx_wr.num_sge	= 1;
+	ret = ib_post_recv(tx->qp, &rx_wr, &bad_wr);
+	if (unlikely(ret)) {
+		ipoib_warn(priv, "post receive failed for RQ of tx_qp\n");
+		ib_dma_unmap_single(priv->ca, rx_buf->mapping,
+				    CM_PACKET_SIZE, DMA_FROM_DEVICE);
+		dev_kfree_skb_any(rx_buf->skb);
+		rx_buf->skb = NULL;
+	}
+
+	return ret;
+}
+
 static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, u64 id,
 					     int frags,
 					     u64 mapping[IPOIB_CM_RX_SG])
@@ -348,8 +397,8 @@ static int ipoib_cm_send_rep(struct net_
 }
 
 static void init_context_and_add_list(struct ib_cm_id *cm_id,
-				    struct ipoib_cm_rx *p,
-				    struct ipoib_dev_priv *priv)
+				      struct ipoib_cm_rx *p,
+				      struct ipoib_dev_priv *priv)
 {
 	cm_id->context = p;
 	p->jiffies = jiffies;
@@ -791,6 +840,50 @@ repost_nosrq:
 			   (unsigned long long)wr_id);
 }
 
+static void handle_rx_wc_rq(struct net_device *dev, struct ipoib_cm_tx *tx,
+			    struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	u64 addr;
+	int ret;
+	struct sk_buff *skb;
+
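+	/*
+	 * Save the currently posted buffer first: alloc_skb_and_post_rq()
+	 * below overwrites tx->rx_buf with the replacement skb and mapping.
+	 */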
+	skb	= tx->rx_buf.skb;
+	addr	= tx->rx_buf.mapping;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
+		ipoib_dbg(priv, "receive error status=%d on RQ of tx_qp\n",
+			  wc->status);
+		++priv->stats.rx_dropped;
+		goto err_rq;
+	}
+
+	ret = alloc_skb_and_post_rq(tx, 0, &tx->rx_buf);
+	if (unlikely(ret)) {
+		ipoib_dbg(priv, "failed to allocate and post receive buffer "
+			  "for RQ of tx_qp\n");
+		++priv->stats.rx_dropped;
+		goto err_rq;
+	}
+
+	ib_dma_unmap_single(priv->ca, addr, CM_PACKET_SIZE, DMA_FROM_DEVICE);
+	skb_put(skb, wc->byte_len);
+	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+	skb_reset_mac_header(skb);
+	skb_pull(skb, IPOIB_ENCAP_LEN);
+
+	dev->last_rx = jiffies;
+	++priv->stats.rx_packets;
+	priv->stats.rx_bytes += skb->len;
+	skb->dev = dev;
+	skb->pkt_type = PACKET_HOST;
+
+	netif_rx(skb);
+	return;
+
+err_rq:
+	/*
+	 * No replacement WR was posted, so this RQ can never receive
+	 * again and the connection is effectively dead.
+	 */
+	ipoib_warn(priv, "Catastrophic failure on RQ of tx_qp\n");
+}
+
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -958,6 +1051,19 @@ static void ipoib_cm_tx_completion(struc
 	} while (n == IPOIB_NUM_WC);
 }
 
+static void ipoib_cm_tx_rq_handler(struct ib_cq *cq, void *tx_ptr)
+{
+	struct ipoib_cm_tx *tx = tx_ptr;
+	struct ib_wc wc;
+
+	printk(KERN_WARNING "Packet received on RQ of tx_qp; presumably "
+	       "from a non-Linux system!\n");
+	/*
+	 * Re-arm the CQ before polling so that a completion arriving
+	 * after the poll still generates an event.
+	 */
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	if (ib_poll_cq(cq, 1, &wc) > 0)
+		handle_rx_wc_rq(tx->dev, tx, &wc);
+}
+
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -1162,19 +1268,23 @@ static int ipoib_cm_rep_handler(struct i
 	return 0;
 }
 
-static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {};
-	attr.recv_cq = priv->cq;
 	attr.srq = priv->cm.srq;
 	attr.cap.max_send_wr = ipoib_sendq_size;
-	attr.cap.max_recv_wr = 0;
 	attr.cap.max_send_sge = 1;
-	attr.cap.max_recv_sge = 0;
 	attr.sq_sig_type = IB_SIGNAL_ALL_WR;
 	attr.qp_type = IB_QPT_RC;
-	attr.send_cq = cq;
+	attr.send_cq = p->cq;
+	if (priv->cm.srq) {
+		attr.recv_cq = priv->cq;
+	} else {
+		/*
+		 * NOSRQ: the RQ gets its own CQ and room for the single
+		 * WR posted to absorb a packet from the remote side.
+		 */
+		attr.recv_cq = p->rcq;
+		attr.cap.max_recv_wr = 1;
+		attr.cap.max_recv_sge = 1;
+	}
 	return ib_create_qp(priv->pd, &attr);
 }
 
@@ -1268,13 +1378,45 @@ static int ipoib_cm_tx_init(struct ipoib
 		goto err_req_notify;
 	}
 
-	p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+	/*
+	 * A remote peer (such as a non-Linux system) may send a packet to
+	 * the RQ of the tx_qp. To handle that in the NOSRQ case, create a
+	 * separate CQ and post a single WR on the RQ. With SRQ this is not
+	 * an issue, since WRs are already posted via the shared RQ.
+	 */
+
+	if (!priv->cm.srq) {
+		p->rcq = ib_create_cq(priv->ca, ipoib_cm_tx_rq_handler,
+				      NULL, p, 1, 0);
+		if (IS_ERR(p->rcq)) {
+			ret = PTR_ERR(p->rcq);
+			p->rcq = NULL;
+			ipoib_warn(priv, "failed to allocate tx rcq: %d\n", ret);
+			goto err_qp;
+		}
+
+		ret = ib_req_notify_cq(p->rcq, IB_CQ_NEXT_COMP);
+		if (ret) {
+			ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
+			goto err_req_notify;
+		}
+	}
+
+	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
 	if (IS_ERR(p->qp)) {
 		ret = PTR_ERR(p->qp);
 		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
 		goto err_qp;
 	}
 
+	if (!priv->cm.srq) {
+		ret = alloc_skb_and_post_rq(p, 0, &p->rx_buf);
+		if (unlikely(ret)) {
+			ipoib_warn(priv, "failed to allocate and post receive"
+				   " buffer for RQ of tx_qp: %d\n", ret);
+			ib_destroy_qp(p->qp);
+			goto err_qp;
+		}
+	}
+
 	p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
 	if (IS_ERR(p->id)) {
 		ret = PTR_ERR(p->id);
@@ -1309,8 +1451,11 @@ err_req_notify:
 err_qp:
 	p->qp = NULL;
 	ib_destroy_cq(p->cq);
+	if (p->rcq)
+		ib_destroy_cq(p->rcq);
 err_cq:
 	p->cq = NULL;
+	p->rcq = NULL;
 err_tx:
 	return ret;
 }
@@ -1332,6 +1477,9 @@ static void ipoib_cm_tx_destroy(struct i
 	if (p->cq)
 		ib_destroy_cq(p->cq);
 
+	if (p->rcq)
+		ib_destroy_cq(p->rcq);
+
 	if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
 		netif_wake_queue(p->dev);
 
@@ -1347,6 +1495,12 @@ static void ipoib_cm_tx_destroy(struct i
 		kfree(p->tx_ring);
 	}
 
+	if (!priv->cm.srq && p->rx_buf.skb) {
+		ib_dma_unmap_single(priv->ca, p->rx_buf.mapping,
+				    CM_PACKET_SIZE, DMA_FROM_DEVICE);
+		dev_kfree_skb_any(p->rx_buf.skb);
+	}
+
 	kfree(p);
 }
 