[openib-general] [PATCH] IB/ipoib: NAPI

Eli Cohen eli at dev.mellanox.co.il
Thu Sep 21 07:57:37 PDT 2006


This patch implements NAPI for ipoib. It is a draft implementation.
I would like your opinion on whether we need a module parameter to
control whether NAPI is activated; a sketch of what that could look
like follows.
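
As a starting point for that discussion, here is a minimal sketch of
such a parameter; the name napi_enable, its default, and the check in
ipoib_setup() are assumptions, not part of the patch below:

	/* hypothetical knob; name and default value are assumptions */
	static int napi_enable = 1;
	module_param(napi_enable, int, 0444);
	MODULE_PARM_DESC(napi_enable, "Use NAPI for completion processing if > 0");

	/* then, in ipoib_setup(); ipoib_ib_completion() would also
	 * need a non-NAPI fallback path when this is off */
	if (napi_enable) {
		dev->poll   = ipoib_poll;
		dev->weight = poll_def_weight;
	}
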
There is also a need to implement peek_cq and call it after
ib_req_notify_cq(), so that we know whether netif_rx_schedule_prep()
needs to be called again; see the second sketch below.
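
For illustration, the tail of ipoib_poll() might then look like the
following; ib_peek_cq() does not exist in this tree, so its name and
return convention (number of completions still queued) are assumed:

	netif_rx_complete(dev);
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

	/* hypothetical: completions that arrived after the last
	 * ib_poll_cq() but before the notify request may not raise
	 * an interrupt on some HW, so resume polling ourselves */
	if (ib_peek_cq(cq) && netif_rx_schedule_prep(dev))
		__netif_rx_schedule(dev);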

Signed-off-by: Eli Cohen <eli at dev.mellanox.co.il>
---

Index: openib-1.1-rc6/drivers/infiniband/ulp/ipoib/ipoib_main.c
===================================================================
--- openib-1.1-rc6.orig/drivers/infiniband/ulp/ipoib/ipoib_main.c	2006-09-21 16:30:35.000000000 +0300
+++ openib-1.1-rc6/drivers/infiniband/ulp/ipoib/ipoib_main.c	2006-09-21 16:30:42.000000000 +0300
@@ -69,6 +69,8 @@
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 #endif
 
+static const int poll_def_weight = 64;
+
 struct ipoib_path_iter {
 	struct net_device *dev;
 	struct ipoib_path  path;
@@ -91,6 +93,9 @@
 	.remove = ipoib_remove_one
 };
 
+
+int ipoib_poll(struct net_device *dev, int *budget);
+
 int ipoib_open(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -689,6 +694,7 @@
 			goto out;
 		}
 
+
 		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 			spin_lock(&priv->lock);
 			__skb_queue_tail(&neigh->queue, skb);
@@ -892,6 +898,7 @@
 
 	/* Delete any child interfaces first */
 	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+		netif_poll_disable(cpriv->dev);
 		unregister_netdev(cpriv->dev);
 		ipoib_dev_cleanup(cpriv->dev);
 		free_netdev(cpriv->dev);
@@ -919,6 +926,8 @@
 	dev->hard_header 	 = ipoib_hard_header;
 	dev->set_multicast_list  = ipoib_set_mcast_list;
 	dev->neigh_setup         = ipoib_neigh_setup_dev;
+	dev->poll                = ipoib_poll;
+	dev->weight              = poll_def_weight;
 
 	dev->watchdog_timeo 	 = HZ;
 
@@ -1097,6 +1106,8 @@
 		goto register_failed;
 	}
 
+	netif_poll_enable(priv->dev);
+
 	ipoib_create_debug_files(priv->dev);
 
 	if (ipoib_add_pkey_attr(priv->dev))
@@ -1111,6 +1122,7 @@
 	return priv->dev;
 
 sysfs_failed:
+	netif_poll_disable(priv->dev);
 	ipoib_delete_debug_files(priv->dev);
 	unregister_netdev(priv->dev);
 
@@ -1168,6 +1180,7 @@
 	dev_list = ib_get_client_data(device, &ipoib_client);
 
 	list_for_each_entry_safe(priv, tmp, dev_list, list) {
+		netif_poll_disable(priv->dev);
 		ib_unregister_event_handler(&priv->event_handler);
 		flush_scheduled_work();
 
Index: openib-1.1-rc6/drivers/infiniband/ulp/ipoib/ipoib_ib.c
===================================================================
--- openib-1.1-rc6.orig/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2006-09-21 16:30:38.000000000 +0300
+++ openib-1.1-rc6/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2006-09-21 17:24:59.000000000 +0300
@@ -169,7 +169,7 @@
 	return 0;
 }
 
-static void ipoib_ib_handle_wc(struct net_device *dev,
+static void ipoib_ib_handle_rwc(struct net_device *dev,
 			       struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -178,122 +178,185 @@
 	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
 		       wr_id, wc->opcode, wc->status);
 
-	if (wr_id & IPOIB_OP_RECV) {
-		wr_id &= ~IPOIB_OP_RECV;
-
-		if (wr_id < ipoib_recvq_size) {
-			struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
-			dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
-
-			if (unlikely(wc->status != IB_WC_SUCCESS)) {
-				if (wc->status != IB_WC_WR_FLUSH_ERR)
-					ipoib_warn(priv, "failed recv event "
-						   "(status=%d, wrid=%d vend_err %x)\n",
-						   wc->status, wr_id, wc->vendor_err);
-				dma_unmap_single(priv->ca->dma_device, addr,
-						 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
-				dev_kfree_skb_any(skb);
-				priv->rx_ring[wr_id].skb = NULL;
-				return;
-			}
-
-			/*
-			 * If we can't allocate a new RX buffer, dump
-			 * this packet and reuse the old buffer.
-			 */
-			if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
-				++priv->stats.rx_dropped;
-				goto repost;
-			}
-
-			ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
-				       wc->byte_len, wc->slid);
+	wr_id &= ~IPOIB_OP_RECV;
 
+	if (wr_id < ipoib_recvq_size) {
+		struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
+		dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
+
+		if (unlikely(wc->status != IB_WC_SUCCESS)) {
+			if (wc->status != IB_WC_WR_FLUSH_ERR)
+				ipoib_warn(priv, "failed recv event "
+					   "(status=%d, wrid=%d vend_err %x)\n",
+					   wc->status, wr_id, wc->vendor_err);
 			dma_unmap_single(priv->ca->dma_device, addr,
 					 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+			dev_kfree_skb_any(skb);
+			priv->rx_ring[wr_id].skb = NULL;
+			return;
+		}
 
-			skb_put(skb, wc->byte_len);
-			skb_pull(skb, IB_GRH_BYTES);
+		/*
+		 * If we can't allocate a new RX buffer, dump
+		 * this packet and reuse the old buffer.
+		 */
+		if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+			++priv->stats.rx_dropped;
+			goto repost;
+		}
 
-			if (wc->slid != priv->local_lid ||
-			    wc->src_qp != priv->qp->qp_num) {
-				skb->protocol = ((struct ipoib_header *) skb->data)->proto;
-				skb->mac.raw = skb->data;
-				skb_pull(skb, IPOIB_ENCAP_LEN);
-
-				dev->last_rx = jiffies;
-				++priv->stats.rx_packets;
-				priv->stats.rx_bytes += skb->len;
-
-				skb->dev = dev;
-				/* XXX get correct PACKET_ type here */
-				skb->pkt_type = PACKET_HOST;
-				netif_rx_ni(skb);
-			} else {
-				ipoib_dbg_data(priv, "dropping loopback packet\n");
-				dev_kfree_skb_any(skb);
-			}
+		ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+			       wc->byte_len, wc->slid);
 
-		repost:
-			if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
-				ipoib_warn(priv, "ipoib_ib_post_receive failed "
-					   "for buf %d\n", wr_id);
-		} else
-			ipoib_warn(priv, "completion event with wrid %d\n",
-				   wr_id);
+		dma_unmap_single(priv->ca->dma_device, addr,
+				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
 
-	} else {
-		struct ipoib_tx_buf *tx_req;
-		unsigned long flags;
+		skb_put(skb, wc->byte_len);
+		skb_pull(skb, IB_GRH_BYTES);
 
-		if (wr_id >= ipoib_sendq_size) {
-			ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
-				   wr_id, ipoib_sendq_size);
-			return;
+		if (wc->slid != priv->local_lid ||
+		    wc->src_qp != priv->qp->qp_num) {
+			skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+			skb->mac.raw = skb->data;
+			skb_pull(skb, IPOIB_ENCAP_LEN);
+
+			dev->last_rx = jiffies;
+			++priv->stats.rx_packets;
+			priv->stats.rx_bytes += skb->len;
+
+			skb->dev = dev;
+			/* XXX get correct PACKET_ type here */
+			skb->pkt_type = PACKET_HOST;
+			netif_receive_skb(skb);
+		} else {
+			ipoib_dbg_data(priv, "dropping loopback packet\n");
+			dev_kfree_skb_any(skb);
 		}
 
-		ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+	repost:
+		if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+			ipoib_warn(priv, "ipoib_ib_post_receive failed "
+				   "for buf %d\n", wr_id);
+	} else
+		ipoib_warn(priv, "completion event with wrid %d\n",
+			   wr_id);
 
-		tx_req = &priv->tx_ring[wr_id];
+}
 
-		dma_unmap_single(priv->ca->dma_device,
-				 pci_unmap_addr(tx_req, mapping),
-				 tx_req->skb->len,
-				 DMA_TO_DEVICE);
 
-		++priv->stats.tx_packets;
-		priv->stats.tx_bytes += tx_req->skb->len;
+static void ipoib_ib_handle_swc(struct net_device *dev,
+			       struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+	struct ipoib_tx_buf *tx_req;
+	unsigned long flags;
 
-		dev_kfree_skb_any(tx_req->skb);
+	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+		       wr_id, wc->opcode, wc->status);
 
-		spin_lock_irqsave(&priv->tx_lock, flags);
-		++priv->tx_tail;
-		if (netif_queue_stopped(dev) &&
-		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
-		    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
-			netif_wake_queue(dev);
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
-
-		if (wc->status != IB_WC_SUCCESS &&
-		    wc->status != IB_WC_WR_FLUSH_ERR)
-			ipoib_warn(priv, "failed send event "
-				   "(status=%d, wrid=%d vend_err %x)\n",
-				   wc->status, wr_id, wc->vendor_err);
+	if (wr_id >= ipoib_sendq_size) {
+		ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
 	}
+
+	ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+
+	tx_req = &priv->tx_ring[wr_id];
+
+	dma_unmap_single(priv->ca->dma_device,
+			 pci_unmap_addr(tx_req, mapping),
+			 tx_req->skb->len,
+			 DMA_TO_DEVICE);
+
+	++priv->stats.tx_packets;
+	priv->stats.tx_bytes += tx_req->skb->len;
+
+	dev_kfree_skb_any(tx_req->skb);
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	++priv->tx_tail;
+	if (netif_queue_stopped(dev) &&
+	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
+	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+		netif_wake_queue(dev);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR)
+		ipoib_warn(priv, "failed send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
 }
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+static inline int is_rx_comp(struct ib_wc *wc)
+{
+	unsigned int wr_id = wc->wr_id;
+
+	if (wr_id & IPOIB_OP_RECV)
+		return 1;
+
+	return 0;
+}
+
+int ipoib_poll(struct net_device *dev, int *budget)
 {
-	struct net_device *dev = (struct net_device *) dev_ptr;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int n, i;
+	struct ib_cq *cq = priv->cq;
+	int quota = dev->quota;
+	int wc;
+	int rx = 0;
+	int tx = 0;
 
-	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 	do {
-		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
-		for (i = 0; i < n; ++i)
-			ipoib_ib_handle_wc(dev, priv->ibwc + i);
-	} while (n == IPOIB_NUM_WC);
+		wc = min_t(int, quota, IPOIB_NUM_WC);
+		n = ib_poll_cq(cq, wc, priv->ibwc);
+		for (i = 0; i < n; ++i) {
+			if (is_rx_comp(priv->ibwc + i)) {
+				++rx;
+				--quota;
+				ipoib_ib_handle_rwc(dev, priv->ibwc + i);
+			} else {
+				++tx;
+				ipoib_ib_handle_swc(dev, priv->ibwc + i);
+			}
+
+			if (unlikely(quota <= 0))
+				goto not_done;
+		}
+	} while (n == wc);
+
+	if (rx || tx)
+		goto not_done;
+
+	netif_rx_complete(dev);
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	/*
+	 * TODO: we need peek_cq here for HW devices that would not
+	 * generate interrupts for completions arriving between the
+	 * end of polling and the request for notification.
+	 */
+
+	return 0;
+
+not_done:
+	*budget -= rx;
+	dev->quota = quota;
+	return 1;
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	struct net_device *dev = (struct net_device *) dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+	/* tell the network layer we have packets */
+	if (netif_rx_schedule_prep(dev))
+		__netif_rx_schedule(dev);
+	else
+		ipoib_warn(priv, "received interrupt while polling\n");
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
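
For reviewers less familiar with the dev->poll interface this patch
targets: the poll routine may consume at most min(dev->quota, *budget)
packets, must charge both counters for the work it did, and may only
return 0 after calling netif_rx_complete() and re-arming interrupts.
A generic sketch of that contract (the example_* helpers are
hypothetical, not IPoIB code):

	static int example_poll(struct net_device *dev, int *budget)
	{
		int limit = min(dev->quota, *budget);
		int done  = example_rx(dev, limit);	/* hypothetical */

		*budget    -= done;	/* charge the global budget */
		dev->quota -= done;	/* and this device's quota */

		if (done < limit) {
			/* ring empty: leave polling, re-arm interrupts */
			netif_rx_complete(dev);
			example_enable_irq(dev);	/* hypothetical */
			return 0;
		}
		return 1;	/* quota exhausted, poll again later */
	}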