[ofa-general] [PATCH] ipoib/cm: use common CQ for all TX QPs

Michael S. Tsirkin mst at dev.mellanox.co.il
Thu Aug 16 05:36:16 PDT 2007


Use common CQ for all TX QPs: keep a per-device counter of outstanding TX WRs,
and stop the interface when this counter reaches the send queue size, to avoid
CQ overruns.  This should help reduce the number of interrupts for
bi-directional traffic (such as TCP).

This helps fix "driver is hogging interrupts" errors reported for IPoIB send side.
See e.g. https://bugs.openfabrics.org/show_bug.cgi?id=508

Signed-off-by: Michael S. Tsirkin <mst at mellanox.co.il>

---

Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib.h	2007-08-16 15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib.h	2007-08-16 15:23:56.000000000 +0300
@@ -97,9 +97,9 @@ enum {
 
 #define	IPOIB_OP_RECV   (1ul << 31)
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
-#define	IPOIB_CM_OP_SRQ (1ul << 30)
+#define	IPOIB_OP_CM     (1ul << 30)
 #else
-#define	IPOIB_CM_OP_SRQ (0)
+#define	IPOIB_OP_CM     (0)
 #endif
 
 /* structs */
@@ -176,7 +176,6 @@ struct ipoib_cm_rx {
 
 struct ipoib_cm_tx {
 	struct ib_cm_id     *id;
-	struct ib_cq        *cq;
 	struct ib_qp        *qp;
 	struct list_head     list;
 	struct net_device   *dev;
@@ -271,6 +270,7 @@ struct ipoib_dev_priv {
 	unsigned             tx_tail;
 	struct ib_sge        tx_sge;
 	struct ib_send_wr    tx_wr;
+	unsigned             tx_outstanding;
 
 	struct ib_wc ibwc[IPOIB_NUM_WC];
 
@@ -480,6 +480,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm
 void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
 			   unsigned int mtu);
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
 #else
 
 struct ipoib_cm_tx;
@@ -568,6 +569,9 @@ static inline void ipoib_cm_handle_rx_wc
 {
 }
 
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
 #endif
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-08-16 15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-08-16 15:23:56.000000000 +0300
@@ -87,7 +87,7 @@ static int ipoib_cm_post_receive(struct 
 	struct ib_recv_wr *bad_wr;
 	int i, ret;
 
-	priv->cm.rx_wr.wr_id = id | IPOIB_CM_OP_SRQ;
+	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
 	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
 		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
@@ -401,7 +401,7 @@ static void skb_put_frags(struct sk_buff
 void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned int wr_id = wc->wr_id & ~IPOIB_CM_OP_SRQ;
+	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
 	struct sk_buff *skb, *newskb;
 	struct ipoib_cm_rx *p;
 	unsigned long flags;
@@ -412,7 +412,7 @@ void ipoib_cm_handle_rx_wc(struct net_de
 		       wr_id, wc->status);
 
 	if (unlikely(wr_id >= ipoib_recvq_size)) {
-		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~IPOIB_CM_OP_SRQ)) {
+		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
 			spin_lock_irqsave(&priv->lock, flags);
 			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
 			ipoib_cm_start_rx_drain(priv);
@@ -498,7 +498,7 @@ static inline int post_send(struct ipoib
 	priv->tx_sge.addr             = addr;
 	priv->tx_sge.length           = len;
 
-	priv->tx_wr.wr_id 	      = wr_id;
+	priv->tx_wr.wr_id 	      = wr_id | IPOIB_OP_CM;
 
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
@@ -549,20 +549,19 @@ void ipoib_cm_send(struct net_device *de
 		dev->trans_start = jiffies;
 		++tx->tx_head;
 
-		if (tx->tx_head - tx->tx_tail == ipoib_sendq_size) {
+		if (++priv->tx_outstanding == ipoib_sendq_size) {
 			ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
 				  tx->qp->qp_num);
 			netif_stop_queue(dev);
-			set_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
 		}
 	}
 }
 
-static void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ipoib_cm_tx *tx,
-				  struct ib_wc *wc)
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned int wr_id = wc->wr_id;
+	struct ipoib_cm_tx *tx = wc->qp->qp_context;
+	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
 	struct ipoib_tx_buf *tx_req;
 	unsigned long flags;
 
@@ -587,11 +586,10 @@ static void ipoib_cm_handle_tx_wc(struct
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
 	++tx->tx_tail;
-	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags)) &&
-	    tx->tx_head - tx->tx_tail <= ipoib_sendq_size >> 1) {
-		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags);
+	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+	    netif_queue_stopped(dev) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
-	}
 
 	if (wc->status != IB_WC_SUCCESS &&
 	    wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -614,11 +612,6 @@ static void ipoib_cm_handle_tx_wc(struct
 			tx->neigh = NULL;
 		}
 
-		/* queue would be re-started anyway when TX is destroyed,
-		 * but it makes sense to do it ASAP here. */
-		if (test_and_clear_bit(IPOIB_FLAG_NETIF_STOPPED, &tx->flags))
-			netif_wake_queue(dev);
-
 		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 			list_move(&tx->list, &priv->cm.reap_list);
 			queue_work(ipoib_workqueue, &priv->cm.reap_task);
@@ -632,19 +625,6 @@ static void ipoib_cm_handle_tx_wc(struct
 	spin_unlock_irqrestore(&priv->tx_lock, flags);
 }
 
-static void ipoib_cm_tx_completion(struct ib_cq *cq, void *tx_ptr)
-{
-	struct ipoib_cm_tx *tx = tx_ptr;
-	int n, i;
-
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
-	do {
-		n = ib_poll_cq(cq, IPOIB_NUM_WC, tx->ibwc);
-		for (i = 0; i < n; ++i)
-			ipoib_cm_handle_tx_wc(tx->dev, tx, tx->ibwc + i);
-	} while (n == IPOIB_NUM_WC);
-}
-
 int ipoib_cm_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -807,17 +787,18 @@ static int ipoib_cm_rep_handler(struct i
 	return 0;
 }
 
-static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ib_cq *cq)
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_init_attr attr = {};
 	attr.recv_cq = priv->cq;
+	attr.send_cq = priv->cq;
 	attr.srq = priv->cm.srq;
 	attr.cap.max_send_wr = ipoib_sendq_size;
 	attr.cap.max_send_sge = 1;
 	attr.sq_sig_type = IB_SIGNAL_ALL_WR;
 	attr.qp_type = IB_QPT_RC;
-	attr.send_cq = cq;
+	attr.qp_context = tx;
 	return ib_create_qp(priv->pd, &attr);
 }
 
@@ -897,21 +878,7 @@ static int ipoib_cm_tx_init(struct ipoib
 		goto err_tx;
 	}
 
-	p->cq = ib_create_cq(priv->ca, ipoib_cm_tx_completion, NULL, p,
-			     ipoib_sendq_size + 1, 0);
-	if (IS_ERR(p->cq)) {
-		ret = PTR_ERR(p->cq);
-		ipoib_warn(priv, "failed to allocate tx cq: %d\n", ret);
-		goto err_cq;
-	}
-
-	ret = ib_req_notify_cq(p->cq, IB_CQ_NEXT_COMP);
-	if (ret) {
-		ipoib_warn(priv, "failed to request completion notification: %d\n", ret);
-		goto err_req_notify;
-	}
-
-	p->qp = ipoib_cm_create_tx_qp(p->dev, p->cq);
+	p->qp = ipoib_cm_create_tx_qp(p->dev, p);
 	if (IS_ERR(p->qp)) {
 		ret = PTR_ERR(p->qp);
 		ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
@@ -948,12 +915,8 @@ err_modify:
 err_id:
 	p->id = NULL;
 	ib_destroy_qp(p->qp);
-err_req_notify:
 err_qp:
 	p->qp = NULL;
-	ib_destroy_cq(p->cq);
-err_cq:
-	p->cq = NULL;
 err_tx:
 	return ret;
 }
@@ -962,6 +925,8 @@ static void ipoib_cm_tx_destroy(struct i
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
 	struct ipoib_tx_buf *tx_req;
+	unsigned long flags;
+	unsigned long begin;
 
 	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
 		  p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
@@ -969,27 +934,40 @@ static void ipoib_cm_tx_destroy(struct i
 	if (p->id)
 		ib_destroy_cm_id(p->id);
 
-	if (p->qp)
-		ib_destroy_qp(p->qp);
-
-	if (p->cq)
-		ib_destroy_cq(p->cq);
-
-	if (test_bit(IPOIB_FLAG_NETIF_STOPPED, &p->flags))
-		netif_wake_queue(p->dev);
-
 	if (p->tx_ring) {
+		/* Wait for all sends to complete */
+		begin = jiffies;
 		while ((int) p->tx_tail - (int) p->tx_head < 0) {
-			tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-			ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
-					 DMA_TO_DEVICE);
-			dev_kfree_skb_any(tx_req->skb);
-			++p->tx_tail;
+			if (time_after(jiffies, begin + 5 * HZ)) {
+				ipoib_warn(priv, "timing out; %d sends not completed\n",
+					   p->tx_head - p->tx_tail);
+				goto timeout;
+			}
+
+			msleep(1);
 		}
+	}
 
-		kfree(p->tx_ring);
+timeout:
+
+	while ((int) p->tx_tail - (int) p->tx_head < 0) {
+		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+				    DMA_TO_DEVICE);
+		dev_kfree_skb_any(tx_req->skb);
+		++p->tx_tail;
+		spin_lock_irqsave(&priv->tx_lock, flags);
+		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+		    netif_queue_stopped(p->dev) &&
+		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+			netif_wake_queue(p->dev);
+		spin_unlock_irqrestore(&priv->tx_lock, flags);
 	}
 
+	if (p->qp)
+		ib_destroy_qp(p->qp);
+
+	kfree(p->tx_ring);
 	kfree(p);
 }
 
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-08-16 15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-08-16 15:23:56.000000000 +0300
@@ -267,11 +267,10 @@ static void ipoib_ib_handle_tx_wc(struct
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
 	++priv->tx_tail;
-	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
-	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
-		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+	    netif_queue_stopped(dev) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
-	}
 	spin_unlock_irqrestore(&priv->tx_lock, flags);
 
 	if (wc->status != IB_WC_SUCCESS &&
@@ -300,16 +299,19 @@ int ipoib_poll(struct net_device *dev, i
 		for (i = 0; i < n; ++i) {
 			struct ib_wc *wc = priv->ibwc + i;
 
-			if (wc->wr_id & IPOIB_CM_OP_SRQ) {
-				++done;
-				--max;
-				ipoib_cm_handle_rx_wc(dev, wc);
-			} else if (wc->wr_id & IPOIB_OP_RECV) {
+			if (wc->wr_id & IPOIB_OP_RECV) {
 				++done;
 				--max;
-				ipoib_ib_handle_rx_wc(dev, wc);
-			} else
-				ipoib_ib_handle_tx_wc(dev, wc);
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_rx_wc(dev, wc);
+				else
+					ipoib_ib_handle_rx_wc(dev, wc);
+			} else {
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_tx_wc(dev, wc);
+				else
+					ipoib_ib_handle_tx_wc(dev, wc);
+			}
 		}
 
 		if (n != t) {
@@ -406,10 +408,9 @@ void ipoib_send(struct net_device *dev, 
 		address->last_send = priv->tx_head;
 		++priv->tx_head;
 
-		if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
+		if (++priv->tx_outstanding == ipoib_sendq_size) {
 			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
 			netif_stop_queue(dev);
-			set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
 		}
 	}
 }
@@ -558,12 +559,18 @@ void ipoib_drain_cq(struct net_device *d
 	do {
 		n = ib_poll_cq(priv->cq, IPOIB_NUM_WC, priv->ibwc);
 		for (i = 0; i < n; ++i) {
-			if (priv->ibwc[i].wr_id & IPOIB_CM_OP_SRQ)
-				ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
-			else if (priv->ibwc[i].wr_id & IPOIB_OP_RECV)
-				ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
-			else
-				ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
+			struct ib_wc *wc = priv->ibwc + i;
+			if (wc->wr_id & IPOIB_OP_RECV) {
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_rx_wc(dev, wc);
+				else
+					ipoib_ib_handle_rx_wc(dev, wc);
+			} else {
+				if (wc->wr_id & IPOIB_OP_CM)
+					ipoib_cm_handle_tx_wc(dev, wc);
+				else
+					ipoib_ib_handle_tx_wc(dev, wc);
+			}
 		}
 	} while (n == IPOIB_NUM_WC);
 }
@@ -610,6 +617,7 @@ int ipoib_ib_dev_stop(struct net_device 
 						    DMA_TO_DEVICE);
 				dev_kfree_skb_any(tx_req->skb);
 				++priv->tx_tail;
+				--priv->tx_outstanding;
 			}
 
 			for (i = 0; i < ipoib_recvq_size; ++i) {
Index: linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-08-16 15:23:30.000000000 +0300
+++ linux-2.6/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-08-16 15:23:56.000000000 +0300
@@ -896,7 +896,7 @@ int ipoib_dev_init(struct net_device *de
 		goto out_rx_ring_cleanup;
 	}
 
-	/* priv->tx_head & tx_tail are already 0 */
+	/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
 
 	if (ipoib_ib_dev_init(dev, ca, port))
 		goto out_tx_ring_cleanup;

-- 
MST



More information about the general mailing list