[openib-general] [PATCH/RFC] IB/ipoib: add selective tx signaling

Thu Jan 11 07:11:43 PST 2007

> This makes sense but I think you should also consider the fact that 
 > the tx_lock is taken once per per tx_completion so, with the patch,
 > the driver spends less time under lock.

I think we could get rid of the tx lock on completion entirely...

I have the old patch below lying around.  It no longer applies, and it
needs some careful thought to make sure there are no memory ordering
problems (which need to be addressed with appropriate barriers when
looking at tx_head/tx_tail)

--- infiniband/ulp/ipoib/ipoib_main.c	(revision 7542)
+++ infiniband/ulp/ipoib/ipoib_main.c	(working copy)
@@ -634,6 +634,14 @@ static int ipoib_start_xmit(struct sk_bu
 		return NETDEV_TX_BUSY;
 	}
 
+	/*
+	 * Because tx_lock is not held when updating tx_tail in the
+	 * send completion handler, we may receive a spurious wakeup
+	 * that starts our queue when there really isn't space yet.
+	 */
+	if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size))
+		return NETDEV_TX_BUSY;
+
 	if (skb->dst && skb->dst->neighbour) {
 		if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
 			ipoib_path_lookup(skb, dev);
@@ -703,6 +711,21 @@ static struct net_device_stats *ipoib_ge
 static void ipoib_timeout(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned long flags;
+	int lost_wakeup = 0;
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	if (netif_queue_stopped(dev) &&
+	    priv->tx_head - priv->tx_tail < ipoib_sendq_size) {
+		ipoib_dbg(priv, "lost wakeup, head %u, tail %u\n",
+			  priv->tx_head, priv->tx_tail);
+		lost_wakeup = 1;
+		netif_wake_queue(dev);
+	}
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	if (lost_wakeup)
+		return;
 
 	ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
 		   jiffies_to_msecs(jiffies - dev->trans_start));
--- infiniband/ulp/ipoib/ipoib_ib.c	(revision 7542)
+++ infiniband/ulp/ipoib/ipoib_ib.c	(working copy)
@@ -244,7 +244,6 @@ static void ipoib_ib_handle_wc(struct ne
 
 	} else {
 		struct ipoib_tx_buf *tx_req;
-		unsigned long flags;
 
 		if (wr_id >= ipoib_sendq_size) {
 			ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
@@ -266,12 +265,17 @@ static void ipoib_ib_handle_wc(struct ne
 
 		dev_kfree_skb_any(tx_req->skb);
 
-		spin_lock_irqsave(&priv->tx_lock, flags);
 		++priv->tx_tail;
+
+		/*
+		 * Since we don't hold tx_lock here, this may lead to
+		 * both lost wakeups (which we deal with in our
+		 * watchdog) and spurious wakeups (which we deal with
+		 * by handling TX ring overflows in the xmit function).
+		 */
 		if (netif_queue_stopped(dev) &&
 		    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
 			netif_wake_queue(dev);
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
 
 		if (wc->status != IB_WC_SUCCESS &&
 		    wc->status != IB_WC_WR_FLUSH_ERR)