[ofa-general] [PATCH 2/5] IB/ipoib: Unsignalled UD QP

Eli Cohen eli at dev.mellanox.co.il
Fri Feb 1 05:40:49 PST 2008


Unsignalled UD QP

This patch uses an unsignalled QP for UD sends. Doing so reduces
the number of times the CQ has to be polled and, together with the
fact that we already poll the TX CQ, lowers the overhead of the
send path and improves small message bandwidth.

For example, on my Intel machines, send throughput of 128 byte
UDP messages went up from 380 Mbps to 508 Mbps.

Signed-off-by: Eli Cohen <eli at mellanox.co.il>
---
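Reviewer note (not for the changelog): the signalling scheme is spread
over several hunks, so below is a minimal standalone sketch of the
bookkeeping it relies on: only every MAX_SEND_CQE-th send WR is posted
signalled, and when its completion arrives, all unsignalled sends posted
before it are retired as well. It assumes a power-of-two ring, as
ipoib_sendq_size is; SENDQ_SIZE, retire() and handle_tx_wc() are
illustrative names, not driver code.

#include <stdio.h>

#define SENDQ_SIZE	64	/* must be a power of two, like ipoib_sendq_size */
#define MAX_SEND_CQE	16	/* every MAX_SEND_CQE-th WR is posted signalled */

static unsigned int tx_head, tx_tail, tx_poll;

/* In the driver this would unmap and free tx_ring[wrid].skb. */
static void retire(unsigned int wrid)
{
	++tx_tail;
	printf("retired wrid %u (tail now %u)\n", wrid, tx_tail);
}

/* Mirrors ipoib_ib_handle_tx_wc(): reap everything up to and including wr_id. */
static void handle_tx_wc(unsigned int wr_id)
{
	unsigned int i = tx_poll;

	do {
		i &= SENDQ_SIZE - 1;
		retire(i);
	} while (i++ != wr_id);
	tx_poll = i & (SENDQ_SIZE - 1);
}

int main(void)
{
	unsigned int i;

	/* "Post" 40 sends; only WRs 15 and 31 generate a completion. */
	for (i = 0; i < 40; ++i, ++tx_head) {
		unsigned int wrid = tx_head & (SENDQ_SIZE - 1);

		if ((tx_head & (MAX_SEND_CQE - 1)) == MAX_SEND_CQE - 1)
			handle_tx_wc(wrid);
	}

	/*
	 * 8 sends are still outstanding here; in the patch the poll_timer
	 * and flush_tx_queue() post a signalled zero-length send to own_ah
	 * so that exactly these leftover descriptors get reaped too.
	 */
	printf("head %u, tail %u, %u unsignalled sends outstanding\n",
	       tx_head, tx_tail, tx_head - tx_tail);
	return 0;
}
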
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib.h
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -373,6 +373,7 @@ struct ipoib_dev_priv {
 
 	struct ib_wc 	     ibwc[IPOIB_NUM_WC];
 	struct ib_wc         send_wc[MAX_SEND_CQE];
+	unsigned int	     tx_poll;
 
 	struct list_head dead_ahs;
 
@@ -392,6 +393,8 @@ struct ipoib_dev_priv {
 	struct dentry *path_dentry;
 #endif
 	struct ipoib_ethtool_st etool;
+	struct timer_list poll_timer;
+	struct ib_ah *own_ah;
 };
 
 struct ipoib_ah {
@@ -454,7 +457,6 @@ extern struct workqueue_struct *ipoib_wo
 
 int ipoib_poll(struct napi_struct *napi, int budget);
 void ipoib_ib_rx_completion(struct ib_cq *cq, void *dev_ptr);
-void ipoib_ib_tx_completion(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
 				 struct ib_pd *pd, struct ib_ah_attr *attr);
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -254,12 +254,10 @@ repost:
 			   "for buf %d\n", wr_id);
 }
 
-static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc, int need_lock)
+static void _ipoib_ib_handle_tx_wc(struct net_device *dev, int wr_id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	unsigned int wr_id = wc->wr_id;
 	struct ipoib_tx_buf *tx_req;
-	unsigned long flags;
 
-	ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
-		       wr_id, wc->status);
+	ipoib_dbg_data(priv, "send completion: id %d\n",
+		       wr_id);
@@ -272,39 +270,52 @@ static void ipoib_ib_handle_tx_wc(struct
 
 	tx_req = &priv->tx_ring[wr_id];
 
-	ipoib_dma_unmap_tx(priv->ca, tx_req);
-
-	++dev->stats.tx_packets;
-	dev->stats.tx_bytes += tx_req->skb->len;
-
-	dev_kfree_skb_any(tx_req->skb);
-
-	if (need_lock)
-		spin_lock_irqsave(&priv->tx_lock, flags);
-
+	if (tx_req->skb) {
+		ipoib_dma_unmap_tx(priv->ca, tx_req);
+		++dev->stats.tx_packets;
+		dev->stats.tx_bytes += tx_req->skb->len;
+		dev_kfree_skb_any(tx_req->skb);
+	}
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
 	    netif_queue_stopped(dev) &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		netif_wake_queue(dev);
+}
+
+static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+	int i;
+
+	i = priv->tx_poll;
+	do {
+		i &= (ipoib_sendq_size - 1);
+		_ipoib_ib_handle_tx_wc(dev, i);
+	} while (i++ != wr_id);
+	priv->tx_poll = i & (ipoib_sendq_size - 1);
 
-	if (need_lock)
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
 
-	if (wc->status != IB_WC_SUCCESS &&
-	    wc->status != IB_WC_WR_FLUSH_ERR)
+	if (unlikely(wc->status != IB_WC_SUCCESS &&
+		     wc->status != IB_WC_WR_FLUSH_ERR))
 		ipoib_warn(priv, "failed send event "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
 }
 
-static void poll_tx(struct ipoib_dev_priv *priv, int need_lock)
+void poll_tx(struct ipoib_dev_priv *priv)
 {
 	int n, i;
 
-	n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
-	for (i = 0; i < n; ++i)
-		ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i, need_lock);
+	while (1) {
+		n = ib_poll_cq(priv->scq, MAX_SEND_CQE, priv->send_wc);
+		for (i = 0; i < n; ++i)
+			ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+
+		if (n < MAX_SEND_CQE)
+			break;
+	}
 }
 
 int ipoib_poll(struct napi_struct *napi, int budget)
@@ -361,11 +372,65 @@ void ipoib_ib_rx_completion(struct ib_cq
 	netif_rx_schedule(dev, &priv->napi);
 }
 
-void ipoib_ib_tx_completion(struct ib_cq *cq, void *dev_ptr)
+static inline int post_zlen_send_wr(struct ipoib_dev_priv *priv, unsigned wrid)
+{
+	struct ib_send_wr wr = {
+		.opcode = IB_WR_SEND,
+		.send_flags = IB_SEND_SIGNALED,
+		.wr_id = wrid,
+	};
+	struct ib_send_wr *bad_wr;
+
+	if (!priv->own_ah)
+		return -EBUSY;
+
+	wr.wr.ud.ah = priv->own_ah;
+	wr.wr.ud.remote_qpn = priv->qp->qp_num;
+	return ib_post_send(priv->qp, &wr, &bad_wr);
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long dev_ptr)
+{
+	struct net_device *dev = (struct net_device *)dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long flags;
+	unsigned int wrid;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	if (((int)priv->tx_tail - (int)priv->tx_head < 0) &&
+		time_after(jiffies, dev->trans_start + 10)) {
+		wrid = priv->tx_head & (ipoib_sendq_size - 1);
+		ipoib_dbg(priv, "posting zlen send, wrid = %d: head = %d, tail = %d\n", wrid,
+				priv->tx_head, priv->tx_tail);
+		priv->tx_ring[wrid].skb = NULL;
+		if (post_zlen_send_wr(priv, wrid))
+			ipoib_warn(priv, "failed to post zlen send\n");
+		else {
+			++priv->tx_head;
+			++priv->tx_outstanding;
+			ipoib_dbg(priv, "%s-%d: head = %d\n", __func__, __LINE__, priv->tx_head);
+		}
+	}
+	poll_tx(priv);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	mod_timer(&priv->poll_timer, jiffies + HZ / 2);
+}
+
+static void flush_tx_queue(struct ipoib_dev_priv *priv)
 {
-	struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
+	unsigned long flags;
+	unsigned int wrid;
 
-	poll_tx(priv, 1);
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	wrid = priv->tx_head & (ipoib_sendq_size - 1);
+	priv->tx_ring[wrid].skb = NULL;
+	if (!post_zlen_send_wr(priv, wrid)) {
+		++priv->tx_head;
+		++priv->tx_outstanding;
+	}
+	poll_tx(priv);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
@@ -405,6 +470,11 @@ static inline int post_send(struct ipoib
 	} else
 		priv->tx_wr.opcode      = IB_WR_SEND;
 
+	if (unlikely((priv->tx_head & (MAX_SEND_CQE - 1)) == MAX_SEND_CQE - 1))
+		priv->tx_wr.send_flags |= IB_SEND_SIGNALED;
+	else
+		priv->tx_wr.send_flags &= ~IB_SEND_SIGNALED;
+
 	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
 }
 
@@ -489,7 +559,7 @@ void ipoib_send(struct net_device *dev, 
 	}
 
 	if (unlikely(priv->tx_outstanding > MAX_SEND_CQE + 1))
-		poll_tx(priv, 0);
+		poll_tx(priv);
 
 	return;
 
@@ -530,6 +600,32 @@ void ipoib_reap_ah(struct work_struct *w
 				   round_jiffies_relative(HZ));
 }
 
+static int create_own_ah(struct ipoib_dev_priv *priv)
+{
+	struct ib_ah_attr attr = {
+		.dlid = priv->local_lid,
+		.port_num = priv->port,
+	};
+
+	if (priv->own_ah) {
+		ipoib_dbg(priv, "own ah already exists\n");
+		return -EINVAL;
+	}
+	priv->own_ah = ib_create_ah(priv->pd, &attr);
+	return IS_ERR(priv->own_ah);
+}
+
+static void destroy_own_ah(struct ipoib_dev_priv *priv)
+{
+	if (!priv->own_ah) {
+		ipoib_dbg(priv, "destroying an already destroyed own ah\n");
+		return;
+	}
+
+	ib_destroy_ah(priv->own_ah);
+	priv->own_ah = NULL;
+}
+
 int ipoib_ib_dev_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -542,9 +638,17 @@ int ipoib_ib_dev_open(struct net_device 
 	}
 	set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
 
+	ret = create_own_ah(priv);
+	if (ret) {
+		priv->own_ah = NULL;
+		ipoib_warn(priv, "failed to create own ah\n");
+		return -1;
+	}
+
 	ret = ipoib_init_qp(dev);
 	if (ret) {
 		ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
+		destroy_own_ah(priv);
 		return -1;
 	}
 
@@ -566,6 +670,11 @@ int ipoib_ib_dev_open(struct net_device 
 	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
 			   round_jiffies_relative(HZ));
 
+	init_timer(&priv->poll_timer);
+	priv->poll_timer.function = ipoib_ib_tx_timer_func;
+	priv->poll_timer.data = (unsigned long)dev;
+	mod_timer(&priv->poll_timer, jiffies + HZ / 2);
+
 	set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
 	return 0;
@@ -662,7 +771,7 @@ void ipoib_drain_cq(struct net_device *d
 				if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
 					ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
 				else
-					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i, 1);
+					ipoib_ib_handle_tx_wc(dev, priv->ibwc + i);
 			}
 		}
 	} while (n == IPOIB_NUM_WC);
@@ -673,12 +782,14 @@ int ipoib_ib_dev_stop(struct net_device 
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
 	unsigned long begin;
-	struct ipoib_tx_buf *tx_req;
 	int i;
+	unsigned long flags;
 
+	del_timer_sync(&priv->poll_timer);
 	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
 
 	ipoib_cm_dev_stop(dev);
+	flush_tx_queue(priv);
 
 	/*
 	 * Move our QP to the error state and then reinitialize in
@@ -700,32 +811,30 @@ int ipoib_ib_dev_stop(struct net_device 
 			 * assume the HW is wedged and just free up
 			 * all our pending work requests.
 			 */
-			while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
-				tx_req = &priv->tx_ring[priv->tx_tail &
-							(ipoib_sendq_size - 1)];
-				ipoib_dma_unmap_tx(priv->ca, tx_req);
-				dev_kfree_skb_any(tx_req->skb);
-				++priv->tx_tail;
-				--priv->tx_outstanding;
-			}
-
 			for (i = 0; i < ipoib_recvq_size; ++i) {
 				struct ipoib_rx_buf *rx_req;
 
 				rx_req = &priv->rx_ring[i];
-				if (!rx_req->skb)
-					continue;
-				ib_dma_unmap_single(priv->ca,
-						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
-						    DMA_FROM_DEVICE);
-				dev_kfree_skb_any(rx_req->skb);
-				rx_req->skb = NULL;
+
+				if (rx_req->skb) {
+					ib_dma_unmap_single(priv->ca,
+							    rx_req->mapping,
+							    IPOIB_BUF_SIZE,
+							    DMA_FROM_DEVICE);
+					dev_kfree_skb_any(rx_req->skb);
+					rx_req->skb = NULL;
+				}
 			}
 
 			goto timeout;
 		}
 
+		if ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+			spin_lock_irqsave(&priv->tx_lock, flags);
+			poll_tx(priv);
+			spin_unlock_irqrestore(&priv->tx_lock, flags);
+		}
+
 		ipoib_drain_cq(dev);
 
 		msleep(1);
@@ -734,6 +843,7 @@ int ipoib_ib_dev_stop(struct net_device 
 	ipoib_dbg(priv, "All sends and receives done.\n");
 
 timeout:
+	destroy_own_ah(priv);
 	qp_attr.qp_state = IB_QPS_RESET;
 	if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
 		ipoib_warn(priv, "Failed to modify QP to RESET state\n");
Index: ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
===================================================================
--- ofed_kernel.orig/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ ofed_kernel/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -153,7 +153,7 @@ int ipoib_transport_dev_init(struct net_
 			.max_send_sge = dev->features & NETIF_F_SG ? MAX_SKB_FRAGS + 1 : 1,
 			.max_recv_sge = 1
 		},
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type     = IB_QPT_UD,
 		.create_flags = QP_CREATE_LSO,
 	};
@@ -184,7 +184,7 @@ int ipoib_transport_dev_init(struct net_
 		goto out_free_mr;
 	}
 
-	priv->scq = ib_create_cq(priv->ca, ipoib_ib_tx_completion, NULL, dev, ipoib_sendq_size, 0);
+	priv->scq = ib_create_cq(priv->ca, NULL, NULL, dev, ipoib_sendq_size, 0);
 	if (IS_ERR(priv->scq)) {
 		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
 		goto out_free_rcq;




