[ofa-general] [PATCH 9/10 Rev4] [IPoIB] Implement batching
Krishna Kumar
krkumar2 at in.ibm.com
Wed Aug 22 01:31:29 PDT 2007
IPoIB: implement the new batching API.
Signed-off-by: Krishna Kumar <krkumar2 at in.ibm.com>
---
ipoib_main.c | 251 ++++++++++++++++++++++++++++++++++++++++-------------------
1 files changed, 171 insertions(+), 80 deletions(-)
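
Note (illustrative only, not part of this patch): the driver-side changes below assume the core batching API added earlier in this series. Based on how ipoib_start_xmit() uses dev->skb_blist, treats a NULL skb as "drain the batch list", and advertises NETIF_F_BATCH_SKBS, the core side would be expected to behave roughly like the sketch that follows; the helper name is made up for illustration, and locking, requeueing and error handling are omitted.

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static int xmit_backlog(struct net_device *dev, struct sk_buff_head *backlog)
{
	struct sk_buff *skb;

	if (dev->features & NETIF_F_BATCH_SKBS) {
		/* Move the whole backlog onto the driver's batch list ... */
		while ((skb = __skb_dequeue(backlog)) != NULL)
			__skb_queue_tail(dev->skb_blist, skb);

		/* ... and ask the driver to drain it in a single call. */
		return dev->hard_start_xmit(NULL, dev);
	}

	/* Non-batching driver: the usual one-skb-per-call path. */
	while ((skb = __skb_dequeue(backlog)) != NULL)
		dev->hard_start_xmit(skb, dev);

	return NETDEV_TX_OK;
}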
diff -ruNp org/drivers/infiniband/ulp/ipoib/ipoib_main.c new/drivers/infiniband/ulp/ipoib/ipoib_main.c
--- org/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-20 14:26:26.000000000 +0530
+++ new/drivers/infiniband/ulp/ipoib/ipoib_main.c 2007-08-22 08:33:51.000000000 +0530
@@ -560,7 +560,8 @@ static void neigh_add_path(struct sk_buf
goto err_drop;
}
} else
- ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+ ipoib_send(dev, skb, path->ah,
+ IPOIB_QPN(skb->dst->neighbour->ha), 1);
} else {
neigh->ah = NULL;
@@ -640,7 +641,7 @@ static void unicast_arp_send(struct sk_b
ipoib_dbg(priv, "Send unicast ARP to %04x\n",
be16_to_cpu(path->pathrec.dlid));
- ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr), 1);
} else if ((path->query || !path_rec_start(dev, path)) &&
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
@@ -654,105 +655,166 @@ static void unicast_arp_send(struct sk_b
spin_unlock(&priv->lock);
}
+#define XMIT_PROCESSED_SKBS() \
+ do { \
+ if (wr_num) { \
+ ipoib_send(dev, NULL, old_neigh->ah, old_qpn, \
+ wr_num); \
+ wr_num = 0; \
+ } \
+ } while (0)
+
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
- struct ipoib_neigh *neigh;
+ struct sk_buff_head *blist;
+ int max_skbs, wr_num = 0;
+ u32 qpn, old_qpn = 0;
+ struct ipoib_neigh *neigh, *old_neigh = NULL;
unsigned long flags;
if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
return NETDEV_TX_LOCKED;
- /*
- * Check if our queue is stopped. Since we have the LLTX bit
- * set, we can't rely on netif_stop_queue() preventing our
- * xmit function from being called with a full queue.
- */
- if (unlikely(netif_queue_stopped(dev))) {
- spin_unlock_irqrestore(&priv->tx_lock, flags);
- return NETDEV_TX_BUSY;
- }
-
- if (likely(skb->dst && skb->dst->neighbour)) {
- if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
- ipoib_path_lookup(skb, dev);
- goto out;
- }
+ blist = dev->skb_blist;
- neigh = *to_ipoib_neigh(skb->dst->neighbour);
+ if (!skb || (blist && skb_queue_len(blist))) {
+ /*
+ * Either this is a batching xmit call, or a single-skb call but
+ * skbs are already on the batch list from an earlier failed
+ * xmit - send those earlier skbs first to avoid out-of-order
+ * transmission.
+ */
+
+ if (skb)
+ __skb_queue_tail(blist, skb);
+
+ /*
+ * Figure out how many skbs can be sent. This prevents the
+ * send queue from filling up and avoids checking for a stopped
+ * queue after each iteration; the queue can now become stopped
+ * only after xmit of the last skb.
+ */
+ max_skbs = ipoib_sendq_size - (priv->tx_head - priv->tx_tail);
+ skb = __skb_dequeue(blist);
+ } else {
+ blist = NULL;
+ max_skbs = 1;
+ }
- if (ipoib_cm_get(neigh)) {
- if (ipoib_cm_up(neigh)) {
- ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
- goto out;
- }
- } else if (neigh->ah) {
- if (unlikely(memcmp(&neigh->dgid.raw,
- skb->dst->neighbour->ha + 4,
- sizeof(union ib_gid)))) {
- spin_lock(&priv->lock);
- /*
- * It's safe to call ipoib_put_ah() inside
- * priv->lock here, because we know that
- * path->ah will always hold one more reference,
- * so ipoib_put_ah() will never do more than
- * decrement the ref count.
- */
- ipoib_put_ah(neigh->ah);
- list_del(&neigh->list);
- ipoib_neigh_free(dev, neigh);
- spin_unlock(&priv->lock);
+ do {
+ if (likely(skb->dst && skb->dst->neighbour)) {
+ if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
+ XMIT_PROCESSED_SKBS();
ipoib_path_lookup(skb, dev);
- goto out;
+ continue;
}
- ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
- goto out;
- }
-
- if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
- spin_lock(&priv->lock);
- __skb_queue_tail(&neigh->queue, skb);
- spin_unlock(&priv->lock);
- } else {
- ++priv->stats.tx_dropped;
- dev_kfree_skb_any(skb);
- }
- } else {
- struct ipoib_pseudoheader *phdr =
- (struct ipoib_pseudoheader *) skb->data;
- skb_pull(skb, sizeof *phdr);
+ neigh = *to_ipoib_neigh(skb->dst->neighbour);
- if (phdr->hwaddr[4] == 0xff) {
- /* Add in the P_Key for multicast*/
- phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
- phdr->hwaddr[9] = priv->pkey & 0xff;
+ if (ipoib_cm_get(neigh)) {
+ if (ipoib_cm_up(neigh)) {
+ XMIT_PROCESSED_SKBS();
+ ipoib_cm_send(dev, skb,
+ ipoib_cm_get(neigh));
+ continue;
+ }
+ } else if (neigh->ah) {
+ if (unlikely(memcmp(&neigh->dgid.raw,
+ skb->dst->neighbour->ha + 4,
+ sizeof(union ib_gid)))) {
+ spin_lock(&priv->lock);
+ /*
+ * It's safe to call ipoib_put_ah()
+ * inside priv->lock here, because we
+ * know that path->ah will always hold
+ * one more reference, so ipoib_put_ah()
+ * will never do more than decrement
+ * the ref count.
+ */
+ ipoib_put_ah(neigh->ah);
+ list_del(&neigh->list);
+ ipoib_neigh_free(dev, neigh);
+ spin_unlock(&priv->lock);
+ XMIT_PROCESSED_SKBS();
+ ipoib_path_lookup(skb, dev);
+ continue;
+ }
+
+ qpn = IPOIB_QPN(skb->dst->neighbour->ha);
+ if (neigh != old_neigh || qpn != old_qpn) {
+ /*
+ * Sending to a different destination
+ * from earlier skb's (or this is the
+ * first skb) - send all existing skbs.
+ */
+ XMIT_PROCESSED_SKBS();
+ old_neigh = neigh;
+ old_qpn = qpn;
+ }
+
+ if (likely(!ipoib_process_skb(dev, skb, priv,
+ neigh->ah, qpn,
+ wr_num)))
+ wr_num++;
- ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
- } else {
- /* unicast GID -- should be ARP or RARP reply */
+ continue;
+ }
- if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
- (be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
- ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
- IPOIB_GID_FMT "\n",
- skb->dst ? "neigh" : "dst",
- be16_to_cpup((__be16 *) skb->data),
- IPOIB_QPN(phdr->hwaddr),
- IPOIB_GID_RAW_ARG(phdr->hwaddr + 4));
+ if (skb_queue_len(&neigh->queue) <
+ IPOIB_MAX_PATH_REC_QUEUE) {
+ spin_lock(&priv->lock);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock(&priv->lock);
+ } else {
dev_kfree_skb_any(skb);
++priv->stats.tx_dropped;
- goto out;
+ ++max_skbs;
+ }
+ } else {
+ struct ipoib_pseudoheader *phdr =
+ (struct ipoib_pseudoheader *) skb->data;
+ skb_pull(skb, sizeof *phdr);
+
+ if (phdr->hwaddr[4] == 0xff) {
+ /* Add in the P_Key for multicast*/
+ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ phdr->hwaddr[9] = priv->pkey & 0xff;
+
+ XMIT_PROCESSED_SKBS();
+ ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
+ } else {
+ /* unicast GID -- should be ARP or RARP reply */
+
+ if ((be16_to_cpup((__be16 *) skb->data) !=
+ ETH_P_ARP) &&
+ (be16_to_cpup((__be16 *) skb->data) !=
+ ETH_P_RARP)) {
+ ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
+ IPOIB_GID_FMT "\n",
+ skb->dst ? "neigh" : "dst",
+ be16_to_cpup((__be16 *)
+ skb->data),
+ IPOIB_QPN(phdr->hwaddr),
+ IPOIB_GID_RAW_ARG(phdr->hwaddr
+ + 4));
+ dev_kfree_skb_any(skb);
+ ++priv->stats.tx_dropped;
+ ++max_skbs;
+ continue;
+ }
+ XMIT_PROCESSED_SKBS();
+ unicast_arp_send(skb, dev, phdr);
}
-
- unicast_arp_send(skb, dev, phdr);
}
- }
+ } while (--max_skbs && (skb = __skb_dequeue(blist)) != NULL);
+
+ /* Send out last packets (if any) */
+ XMIT_PROCESSED_SKBS();
-out:
spin_unlock_irqrestore(&priv->tx_lock, flags);
- return NETDEV_TX_OK;
+ return (!blist || !skb_queue_len(blist)) ? NETDEV_TX_OK :
+ NETDEV_TX_BUSY;
}
static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
@@ -900,11 +962,35 @@ int ipoib_dev_init(struct net_device *de
/* priv->tx_head & tx_tail are already 0 */
- if (ipoib_ib_dev_init(dev, ca, port))
+ /* Allocate tx_sge */
+ priv->tx_sge = kmalloc(ipoib_sendq_size * sizeof *priv->tx_sge,
+ GFP_KERNEL);
+ if (!priv->tx_sge) {
+ printk(KERN_WARNING "%s: failed to allocate TX sge (%d entries)\n",
+ ca->name, ipoib_sendq_size);
goto out_tx_ring_cleanup;
+ }
+
+ /* Allocate tx_wr */
+ priv->tx_wr = kmalloc(ipoib_sendq_size * sizeof *priv->tx_wr,
+ GFP_KERNEL);
+ if (!priv->tx_wr) {
+ printk(KERN_WARNING "%s: failed to allocate TX wr (%d entries)\n",
+ ca->name, ipoib_sendq_size);
+ goto out_tx_sge_cleanup;
+ }
+
+ if (ipoib_ib_dev_init(dev, ca, port))
+ goto out_tx_wr_cleanup;
return 0;
+out_tx_wr_cleanup:
+ kfree(priv->tx_wr);
+
+out_tx_sge_cleanup:
+ kfree(priv->tx_sge);
+
out_tx_ring_cleanup:
kfree(priv->tx_ring);
@@ -932,9 +1018,13 @@ void ipoib_dev_cleanup(struct net_device
kfree(priv->rx_ring);
kfree(priv->tx_ring);
+ kfree(priv->tx_sge);
+ kfree(priv->tx_wr);
priv->rx_ring = NULL;
priv->tx_ring = NULL;
+ priv->tx_sge = NULL;
+ priv->tx_wr = NULL;
}
static void ipoib_setup(struct net_device *dev)
@@ -965,7 +1055,8 @@ static void ipoib_setup(struct net_devic
dev->addr_len = INFINIBAND_ALEN;
dev->type = ARPHRD_INFINIBAND;
dev->tx_queue_len = ipoib_sendq_size * 2;
- dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
+ dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX |
+ NETIF_F_BATCH_SKBS;
/* MTU will be reset when mcast join happens */
dev->mtu = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
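
The xmit path above coalesces consecutive skbs that go to the same address handle and QPN: ipoib_process_skb() (defined elsewhere in this series) fills in the next pending send WR, wr_num counts how many are queued, and XMIT_PROCESSED_SKBS() posts them in a single ipoib_send() call whenever the destination changes, a path-lookup/CM/multicast branch is taken, or the batch ends. The stand-alone toy below (plain user-space C, an assumption-free demo rather than kernel code) only illustrates that flush-on-destination-change pattern.

#include <stdio.h>

static void post_send(int dst, int nr)
{
	if (nr)
		printf("post %d work request(s) to destination %d\n", nr, dst);
}

int main(void)
{
	int dsts[] = { 1, 1, 1, 2, 2, 3 };	/* per-packet destinations */
	int old_dst = -1, pending = 0;
	unsigned int i;

	for (i = 0; i < sizeof(dsts) / sizeof(dsts[0]); i++) {
		if (dsts[i] != old_dst) {
			post_send(old_dst, pending);	/* destination changed */
			old_dst = dsts[i];
			pending = 0;
		}
		pending++;			/* coalesce into one post */
	}
	post_send(old_dst, pending);		/* flush the tail of the batch */
	return 0;
}

Coalescing this way means one post per run of same-destination packets rather than one per skb, which is presumably where the batching win comes from.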