[ofa-general] [PATCH] IB/ipoib: Split CQs for IPOIB UD

Eli Cohen eli at dev.mellanox.co.il
Mon Mar 24 09:30:45 PDT 2008


I am not sending the unsignaled patch since I could not see that it improves
anything on the upstream kernel. The patch below gives a significant
improvement in message rate for small UDP messages. You can see the effect
with netperf:

without the patch:
sw175:~ # netperf -H 14.4.3.178 -t UDP_STREAM -- -m 128
UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 14.4.3.178 (14.4.3.178) port 0 AF_INET
Socket  Message  Elapsed      Messages                
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

114688     128   10.00     2976016      0     304.67
114688           10.00     2973579            304.42

with the patch:
sw175:~ # netperf -H 14.4.3.178 -t UDP_STREAM -- -m 128
UDP UNIDIRECTIONAL SEND TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 14.4.3.178 (14.4.3.178) port 0 AF_INET
Socket  Message  Elapsed      Messages
Size    Size     Time         Okay Errors   Throughput
bytes   bytes    secs            #      #   10^6bits/sec

114688     128   10.00     5102042      0     522.23
114688           10.00     4246433            434.65
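
For anyone skimming the idea rather than the diff: instead of arming the send
CQ and taking an interrupt per send completion, the driver now reaps its
dedicated send CQ in batches of MAX_SEND_CQE (16) from the send path, once
that many WRs are outstanding. The small userspace toy below is only a sketch
of that batching effect (the program, its names and its simulated completion
counts are illustrative, not part of the driver); it simply counts how many
"poll the CQ" calls batched reaping needs compared to reaping after every send.

/* toy_batch.c - illustrative only, not driver code.
 * Build: cc -o toy_batch toy_batch.c
 */
#include <stdio.h>

#define MAX_SEND_CQE 16		/* batch size, same value the patch adds */
#define NUM_SENDS    100000	/* arbitrary number of simulated sends */

int main(void)
{
	int outstanding = 0;	/* posted but not yet reaped send WRs */
	long poll_calls = 0;	/* how often we "polled the send CQ" */
	long i;

	for (i = 0; i < NUM_SENDS; i++) {
		outstanding++;	/* post one send WR */

		/* Reap only when a full batch is outstanding, as
		 * ipoib_send() now does via poll_tx(). */
		if (outstanding > MAX_SEND_CQE) {
			outstanding -= MAX_SEND_CQE;	/* pretend 16 CQEs polled */
			poll_calls++;
		}
	}

	printf("sends: %d, poll calls: %ld (vs %d when reaping per send)\n",
	       NUM_SENDS, poll_calls, NUM_SENDS);
	return 0;
}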



From c56ce2c51d3003eef20da01678ade5762714ccc2 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli at mellanox.co.il>
Date: Thu, 20 Mar 2008 16:35:30 +0200
Subject: [PATCH] IB/ipoib: Split CQs for IPOIB UD

Use a dedicated CQ for UD send completions. Also, do not arm the UD send CQ,
which reduces the number of interrupts generated by the HCA. The patch
further reduces overhead by not polling the CQ for every posted send WR;
it polls only when there are 16 or more outstanding work requests.

Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
 drivers/infiniband/ulp/ipoib/ipoib.h       |    9 ++++--
 drivers/infiniband/ulp/ipoib/ipoib_cm.c    |    8 ++--
 drivers/infiniband/ulp/ipoib/ipoib_etool.c |    2 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   45 ++++++++++++++++------------
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   38 +++++++++++++++--------
 5 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 43feffc..fb28f0b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -95,6 +95,8 @@ enum {
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
 	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+	MAX_SEND_CQE		  = 16,
 };
 
 #define	IPOIB_OP_RECV   (1ul << 31)
@@ -285,7 +287,8 @@ struct ipoib_dev_priv {
 	u16		  pkey_index;
 	struct ib_pd	 *pd;
 	struct ib_mr	 *mr;
-	struct ib_cq	 *cq;
+	struct ib_cq	 *rcq;
+	struct ib_cq	 *scq;
 	struct ib_qp	 *qp;
 	u32		  qkey;
 
@@ -305,7 +308,8 @@ struct ipoib_dev_priv {
 	struct ib_send_wr    tx_wr;
 	unsigned	     tx_outstanding;
 
-	struct ib_wc ibwc[IPOIB_NUM_WC];
+	struct ib_wc	     ibwc[IPOIB_NUM_WC];
+	struct ib_wc	     send_wc[MAX_SEND_CQE];
 
 	struct list_head dead_ahs;
 
@@ -650,7 +654,6 @@ static inline int ipoib_register_debugfs(void) { return 0; }
 static inline void ipoib_unregister_debugfs(void) { }
 #endif
 
-
 #define ipoib_printk(level, priv, format, arg...)	\
 	printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
 #define ipoib_warn(priv, format, arg...)		\
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 90ff2c9..dfabb38 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -249,8 +249,8 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
 		.event_handler = ipoib_cm_rx_event_handler,
-		.send_cq = priv->cq, /* For drain WR */
-		.recv_cq = priv->cq,
+		.send_cq = priv->rcq, /* For drain WR */
+		.recv_cq = priv->rcq,
 		.srq = priv->cm.srq,
 		.cap.max_send_wr = 1, /* For drain WR */
 		.cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
@@ -951,8 +951,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {
-		.send_cq		= priv->cq,
-		.recv_cq		= priv->cq,
+		.send_cq		= priv->rcq,
+		.recv_cq		= priv->rcq,
 		.srq			= priv->cm.srq,
 		.cap.max_send_wr	= ipoib_sendq_size,
 		.cap.max_send_sge	= 1,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_etool.c b/drivers/infiniband/ulp/ipoib/ipoib_etool.c
index a3ac4cf..b4f4f0f 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_etool.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_etool.c
@@ -73,7 +73,7 @@ static int ipoib_set_coalesce(struct net_device *dev,
 	    coal->rx_max_coalesced_frames > 0xffff)
 		return -EINVAL;
 
-	ret = ib_modify_cq(priv->cq, coal->rx_max_coalesced_frames,
+	ret = ib_modify_cq(priv->rcq, coal->rx_max_coalesced_frames,
 			   coal->rx_coalesce_usecs);
 	if (ret) {
 		ipoib_dbg(priv, "failed modifying CQ\n");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 6605109..7cb8bd3 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -311,7 +311,6 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id;
 	struct ipoib_tx_buf *tx_req;
-	unsigned long flags;
 
 	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -331,13 +330,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	dev_kfree_skb_any(tx_req->skb);
 
-	spin_lock_irqsave(&priv->tx_lock, flags);
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 	    netif_queue_stopped(dev) &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
-	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
 	if (wc->status != IB_WC_SUCCESS &&
 	    wc->status != IB_WC_WR_FLUSH_ERR)
@@ -346,6 +343,17 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 			   wc->status, wr_id, wc->vendor_err);
 }
 
+static int poll_tx(struct ipoib_dev_priv *priv)
+{
+	int n, i;
+
+	n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+	for (i = 0; i < n; ++i)
+		ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+
+	return n == MAX_SEND_CQE;
+}
+
 int ipoib_poll(struct napi_struct *napi, int budget)
 {
 	struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
@@ -361,7 +369,7 @@ poll_more:
 		int max = (budget - done);
 
 		t = min(IPOIB_NUM_WC, max);
-		n = ib_poll_cq(priv->cq, t, priv->ibwc);
+		n = ib_poll_cq(priv->rcq, t, priv->ibwc);
 
 		for (i = 0; i < n; i++) {
 			struct ib_wc *wc = priv->ibwc + i;
@@ -372,12 +380,8 @@ poll_more:
 					ipoib_cm_handle_rx_wc(dev, wc);
 				else
 					ipoib_ib_handle_rx_wc(dev, wc);
-			} else {
-				if (wc->wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_tx_wc(dev, wc);
-				else
-					ipoib_ib_handle_tx_wc(dev, wc);
-			}
+			} else
+				ipoib_cm_handle_tx_wc(priv->dev, wc);
 		}
 
 		if (n != t)
@@ -386,7 +390,7 @@ poll_more:
 
 	if (done < budget) {
 		netif_rx_complete(dev, napi);
-		if (unlikely(ib_req_notify_cq(priv->cq,
+		if (unlikely(ib_req_notify_cq(priv->rcq,
 					      IB_CQ_NEXT_COMP |
 					      IB_CQ_REPORT_MISSED_EVENTS)) &&
 		    netif_rx_reschedule(dev, napi))
@@ -515,12 +519,17 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 
 		address->last_send = priv->tx_head;
 		++priv->tx_head;
+		skb_orphan(skb);
 
 		if (++priv->tx_outstanding == ipoib_sendq_size) {
 			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 			netif_stop_queue(dev);
 		}
 	}
+
+	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+		poll_tx(priv);
+
 	return;
 
 drop:
@@ -673,7 +682,7 @@ void ipoib_drain_cq(struct net_device *dev)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int i, n;
 	do {
-		n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
+		n = ib_poll_cq(priv->rcq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
 			/*
 			 * Convert any successful completions to flush
@@ -688,14 +697,12 @@ void ipoib_drain_cq(struct net_device *dev)
 					ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
 				else
 					ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
-			} else {
-				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
-					ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
-				else
-					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
-			}
+			} else
+				ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
 		}
 	} while (n == IPOIB_NUM_WC);
+
+	while (poll_tx(priv));
 }
 
 int ipoib_ib_dev_stop(struct net_device *dev, int flush)
@@ -787,7 +794,7 @@ timeout:
 		msleep(1);
 	}
 
-	ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP);
+	ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP);
 
 	return 0;
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 1d59f27..ce12ecd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -171,7 +171,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 		goto out_free_pd;
 	}
 
-	size = ipoib_sendq_size + ipoib_recvq_size + 1;
+	size = ipoib_recvq_size + 1;
 	ret = ipoib_cm_dev_init(dev);
 	if (!ret) {
 		if (ipoib_cm_has_srq(dev))
@@ -180,17 +180,23 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 			size += ipoib_recvq_size * ipoib_max_conn_qp;
 	}
 
-	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
-	if (IS_ERR(priv->cq)) {
-		printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+	priv->rcq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+	if (IS_ERR(priv->rcq)) {
+		printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
 		goto out_free_mr;
 	}
 
-	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
-		goto out_free_cq;
+	priv->scq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0);
+	if (IS_ERR(priv->scq)) {
+		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+		goto out_free_rcq;
+	}
+
+	if (ib_req_notify_cq(priv->rcq, IB_CQ_NEXT_COMP))
+		goto out_free_scq;
 
-	init_attr.send_cq = priv->cq;
-	init_attr.recv_cq = priv->cq;
+	init_attr.send_cq = priv->scq;
+	init_attr.recv_cq = priv->rcq;
 
 	if (priv->hca_caps & IB_DEVICE_TCP_TSO)
 		init_attr.create_flags = QP_CREATE_LSO;
@@ -201,7 +207,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->qp = ib_create_qp(priv->pd, &init_attr);
 	if (IS_ERR(priv->qp)) {
 		printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
-		goto out_free_cq;
+		goto out_free_scq;
 	}
 
 	priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
@@ -217,8 +223,11 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 
 	return 0;
 
-out_free_cq:
-	ib_destroy_cq(priv->cq);
+out_free_scq:
+	ib_destroy_cq(priv->scq);
+
+out_free_rcq:
+	ib_destroy_cq(priv->rcq);
 
 out_free_mr:
 	ib_dereg_mr(priv->mr);
@@ -241,8 +250,11 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
 		clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 	}
 
-	if (ib_destroy_cq(priv->cq))
-		ipoib_warn(priv, "ib_cq_destroy failed\n");
+	if (ib_destroy_cq(priv->scq))
+		ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
+
+	if (ib_destroy_cq(priv->rcq))
+		ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
 
 	ipoib_cm_dev_cleanup(dev);
 
-- 
1.5.4.4





