[openib-general] [PATCH] IPoIB splitting CQ, increase both send/recv poll NUM_WC & interval

Shirley Ma xma at us.ibm.com
Wed Apr 19 08:18:30 PDT 2006


Roland,

Here is the patch. This patch includes:

1. separate the single CQ into a send CQ and a recv CQ
2. increase both send/recv poll NUM_WC from 4 to 32 
3. add ____cacheline_aligned_in_smp to tx_ring, rx_ring, send_ibwc and recv_ibwc
4. add a tunable poll interval for both send and recv (module parameters send_poll_interval and recv_poll_interval)

Example command line:
        modprobe ib_ipoib send_poll_interval=80 recv_poll_interval=10

The patch is also attached as a file for you to download. Let me know if there are any problems.


Signed-off-by: Shirley Ma <xma at us.ibm.com>

diff -urN infiniband/ulp/ipoib/ipoib.h infiniband-cq/ulp/ipoib/ipoib.h
--- infiniband/ulp/ipoib/ipoib.h        2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib.h     2006-04-19 08:40:42.030284464 
-0700
@@ -71,7 +71,8 @@
        IPOIB_MAX_QUEUE_SIZE      = 8192,
        IPOIB_MIN_QUEUE_SIZE      = 2,
 
-       IPOIB_NUM_WC              = 4,
+       IPOIB_NUM_SEND_WC         = 32,
+       IPOIB_NUM_RECV_WC         = 32,
 
        IPOIB_MAX_PATH_REC_QUEUE  = 3,
        IPOIB_MAX_MCAST_QUEUE     = 3,
@@ -151,7 +152,8 @@
        u16               pkey;
        struct ib_pd     *pd;
        struct ib_mr     *mr;
-       struct ib_cq     *cq;
+       struct ib_cq     *send_cq;
+       struct ib_cq     *recv_cq;
        struct ib_qp     *qp;
        u32               qkey;
 
@@ -162,16 +164,17 @@
        unsigned int admin_mtu;
        unsigned int mcast_mtu;
 
-       struct ipoib_rx_buf *rx_ring;
+       struct ipoib_rx_buf *rx_ring    ____cacheline_aligned_in_smp;
 
        spinlock_t           tx_lock;
-       struct ipoib_tx_buf *tx_ring;
+       struct ipoib_tx_buf *tx_ring    ____cacheline_aligned_in_smp; 
        unsigned             tx_head;
        unsigned             tx_tail;
        struct ib_sge        tx_sge;
        struct ib_send_wr    tx_wr;
 
-       struct ib_wc ibwc[IPOIB_NUM_WC];
+       struct ib_wc send_ibwc[IPOIB_NUM_SEND_WC] 
____cacheline_aligned_in_smp;
+       struct ib_wc recv_ibwc[IPOIB_NUM_RECV_WC] 
____cacheline_aligned_in_smp;
 
        struct list_head dead_ahs;
 
@@ -243,9 +246,13 @@
 
 extern struct workqueue_struct *ipoib_workqueue;
 
+extern int ipoib_send_poll_interval;
+extern int ipoib_recv_poll_interval;
+
 /* functions */
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
                                 struct ib_pd *pd, struct ib_ah_attr 
*attr);
diff -urN infiniband/ulp/ipoib/ipoib_ib.c 
infiniband-cq/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c     2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_ib.c  2006-04-19 08:56:40.395590792 
-0700
@@ -50,7 +50,6 @@
                 "Enable data path debug tracing if > 0");
 #endif
 
-#define        IPOIB_OP_RECV   (1ul << 31)
 
 static DEFINE_MUTEX(pkey_mutex);
 
@@ -108,7 +107,7 @@
        list.lkey     = priv->mr->lkey;
 
        param.next    = NULL;
-       param.wr_id   = id | IPOIB_OP_RECV;
+       param.wr_id   = id;
        param.sg_list = &list;
        param.num_sge = 1;
 
@@ -175,8 +174,8 @@
        return 0;
 }
 
-static void ipoib_ib_handle_wc(struct net_device *dev,
-                              struct ib_wc *wc)
+static void ipoib_ib_handle_recv_wc(struct net_device *dev,
+                                   struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        unsigned int wr_id = wc->wr_id;
@@ -184,121 +183,142 @@
        ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
                       wr_id, wc->opcode, wc->status);
 
-       if (wr_id & IPOIB_OP_RECV) {
-               wr_id &= ~IPOIB_OP_RECV;
-
-               if (wr_id < ipoib_recvq_size) {
-                       struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
-                       dma_addr_t      addr = 
priv->rx_ring[wr_id].mapping;
-
-                       if (unlikely(wc->status != IB_WC_SUCCESS)) {
-                               if (wc->status != IB_WC_WR_FLUSH_ERR)
-                                       ipoib_warn(priv, "failed recv 
event "
-                                                  "(status=%d, wrid=%d 
vend_err %x)\n",
-                                                  wc->status, wr_id, 
wc->vendor_err);
-                               dma_unmap_single(priv->ca->dma_device, 
addr,
-                                                IPOIB_BUF_SIZE, 
DMA_FROM_DEVICE);
-                               dev_kfree_skb_any(skb);
-                               priv->rx_ring[wr_id].skb = NULL;
-                               return;
-                       }
-
-                       /*
-                        * If we can't allocate a new RX buffer, dump
-                        * this packet and reuse the old buffer.
-                        */
-                       if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
-                               ++priv->stats.rx_dropped;
-                               goto repost;
-                       }
-
-                       ipoib_dbg_data(priv, "received %d bytes, SLID 
0x%04x\n",
-                                      wc->byte_len, wc->slid);
 
+       if (wr_id < ipoib_recvq_size) {
+               struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
+               dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
+
+               if (unlikely(wc->status != IB_WC_SUCCESS)) {
+                       if (wc->status != IB_WC_WR_FLUSH_ERR)
+                               ipoib_warn(priv, "failed recv event "
+                                          "(status=%d, wrid=%d vend_err 
%x)\n",
+                                          wc->status, wr_id, 
wc->vendor_err);
                        dma_unmap_single(priv->ca->dma_device, addr,
                                         IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+                       dev_kfree_skb_any(skb);
+                       priv->rx_ring[wr_id].skb = NULL;
+                       return;
+               }
 
-                       skb_put(skb, wc->byte_len);
-                       skb_pull(skb, IB_GRH_BYTES);
+               /*
+                * If we can't allocate a new RX buffer, dump
+                * this packet and reuse the old buffer.
+                */
+               if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+                       ++priv->stats.rx_dropped;
+                       goto repost;
+               }
 
-                       if (wc->slid != priv->local_lid ||
-                           wc->src_qp != priv->qp->qp_num) {
-                               skb->protocol = ((struct ipoib_header *) 
skb->data)->proto;
-                               skb->mac.raw = skb->data;
-                               skb_pull(skb, IPOIB_ENCAP_LEN);
-
-                               dev->last_rx = jiffies;
-                               ++priv->stats.rx_packets;
-                               priv->stats.rx_bytes += skb->len;
-
-                               skb->dev = dev;
-                               /* XXX get correct PACKET_ type here */
-                               skb->pkt_type = PACKET_HOST;
-                               netif_rx_ni(skb);
-                       } else {
-                               ipoib_dbg_data(priv, "dropping loopback 
packet\n");
-                               dev_kfree_skb_any(skb);
-                       }
+               ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+                              wc->byte_len, wc->slid);
 
-               repost:
-                       if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
-                               ipoib_warn(priv, "ipoib_ib_post_receive 
failed "
-                                          "for buf %d\n", wr_id);
-               } else
-                       ipoib_warn(priv, "completion event with wrid 
%d\n",
-                                  wr_id);
+               dma_unmap_single(priv->ca->dma_device, addr,
+                                IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
 
-       } else {
-               struct ipoib_tx_buf *tx_req;
-               unsigned long flags;
+               skb_put(skb, wc->byte_len);
+               skb_pull(skb, IB_GRH_BYTES);
 
-               if (wr_id >= ipoib_sendq_size) {
-                       ipoib_warn(priv, "completion event with wrid %d (> 
%d)\n",
-                                  wr_id, ipoib_sendq_size);
-                       return;
+               if (wc->slid != priv->local_lid ||
+                   wc->src_qp != priv->qp->qp_num) {
+                       skb->protocol = ((struct ipoib_header *) 
skb->data)->proto;
+                       skb->mac.raw = skb->data;
+                       skb_pull(skb, IPOIB_ENCAP_LEN);
+
+                       dev->last_rx = jiffies;
+                       ++priv->stats.rx_packets;
+                       priv->stats.rx_bytes += skb->len;
+
+                       skb->dev = dev;
+                       /* XXX get correct PACKET_ type here */
+                       skb->pkt_type = PACKET_HOST;
+                       netif_rx_ni(skb);
+               } else {
+                       ipoib_dbg_data(priv, "dropping loopback 
packet\n");
+                       dev_kfree_skb_any(skb);
                }
 
-               ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+       repost:
+               if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+                       ipoib_warn(priv, "ipoib_ib_post_receive failed "
+                                  "for buf %d\n", wr_id);
+       } else
+               ipoib_warn(priv, "completion event with wrid %d\n",
+                          wr_id);
+}
+
+static void ipoib_ib_handle_send_wc(struct net_device *dev,
+                                   struct ib_wc *wc)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       unsigned int wr_id = wc->wr_id;
+       struct ipoib_tx_buf *tx_req;
+       unsigned long flags;
+
+       ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+                      wr_id, wc->opcode, wc->status);
 
-               tx_req = &priv->tx_ring[wr_id];
+       if (wr_id >= ipoib_sendq_size) {
+               ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+                          wr_id, ipoib_sendq_size);
+               return;
+       }
 
-               dma_unmap_single(priv->ca->dma_device,
-                                pci_unmap_addr(tx_req, mapping),
-                                tx_req->skb->len,
-                                DMA_TO_DEVICE);
+       ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
 
-               ++priv->stats.tx_packets;
-               priv->stats.tx_bytes += tx_req->skb->len;
+       tx_req = &priv->tx_ring[wr_id];
 
-               dev_kfree_skb_any(tx_req->skb);
+       dma_unmap_single(priv->ca->dma_device,
+                        pci_unmap_addr(tx_req, mapping),
+                        tx_req->skb->len,
+                        DMA_TO_DEVICE);
 
-               spin_lock_irqsave(&priv->tx_lock, flags);
-               ++priv->tx_tail;
-               if (netif_queue_stopped(dev) &&
-                   priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 
1)
-                       netif_wake_queue(dev);
-               spin_unlock_irqrestore(&priv->tx_lock, flags);
+       ++priv->stats.tx_packets;
+       priv->stats.tx_bytes += tx_req->skb->len;
 
-               if (wc->status != IB_WC_SUCCESS &&
-                   wc->status != IB_WC_WR_FLUSH_ERR)
-                       ipoib_warn(priv, "failed send event "
-                                  "(status=%d, wrid=%d vend_err %x)\n",
-                                  wc->status, wr_id, wc->vendor_err);
-       }
+       dev_kfree_skb_any(tx_req->skb);
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       ++priv->tx_tail;
+       if (netif_queue_stopped(dev) &&
+           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+               netif_wake_queue(dev);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+       if (wc->status != IB_WC_SUCCESS &&
+           wc->status != IB_WC_WR_FLUSH_ERR)
+               ipoib_warn(priv, "failed send event "
+                          "(status=%d, wrid=%d vend_err %x)\n",
+                          wc->status, wr_id, wc->vendor_err);
+}
+
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)
+{
+       struct net_device *dev = (struct net_device *) dev_ptr;
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       int n, i;
+
+       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       udelay(ipoib_send_poll_interval);
+       do {
+               n = ib_poll_cq(cq, IPOIB_NUM_SEND_WC, priv->send_ibwc);
+               for (i = 0; i < n; ++i)
+                       ipoib_ib_handle_send_wc(dev, priv->send_ibwc + i);
+       } while (n == IPOIB_NUM_SEND_WC);
 }
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr)
 {
        struct net_device *dev = (struct net_device *) dev_ptr;
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int n, i;
 
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       udelay(ipoib_recv_poll_interval);
        do {
-               n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+               n = ib_poll_cq(cq, IPOIB_NUM_RECV_WC, priv->recv_ibwc);
                for (i = 0; i < n; ++i)
-                       ipoib_ib_handle_wc(dev, priv->ibwc + i);
-       } while (n == IPOIB_NUM_WC);
+                       ipoib_ib_handle_recv_wc(dev, priv->recv_ibwc + i);
+       } while (n == IPOIB_NUM_RECV_WC);
 }
 
 static inline int post_send(struct ipoib_dev_priv *priv,
diff -urN infiniband/ulp/ipoib/ipoib_main.c 
infiniband-cq/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c   2006-04-12 16:43:38.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_main.c        2006-04-19 
08:44:27.192054672 -0700
@@ -56,12 +56,17 @@
 
 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+int ipoib_send_poll_interval __read_mostly = 0;
+int ipoib_recv_poll_interval __read_mostly = 0;
 
 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive 
queue");
 
+module_param_named(send_poll_interval, ipoib_send_poll_interval, int, 
0444);
+module_param_named(recv_poll_interval, ipoib_recv_poll_interval, int, 
0444);
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
 
@@ -895,7 +900,7 @@
 
        kfree(priv->rx_ring);
        kfree(priv->tx_ring);
-
+ 
        priv->rx_ring = NULL;
        priv->tx_ring = NULL;
 }
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c 
infiniband-cq/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c  2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_verbs.c       2006-04-12 
19:14:41.000000000 -0700
@@ -174,24 +174,35 @@
                return -ENODEV;
        }
 
-       priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-                               ipoib_sendq_size + ipoib_recvq_size + 1);
-       if (IS_ERR(priv->cq)) {
-               printk(KERN_WARNING "%s: failed to create CQ\n", 
ca->name);
+       priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_send_completion, 
NULL, dev,
+                               ipoib_sendq_size + 1);
+       if (IS_ERR(priv->send_cq)) {
+               printk(KERN_WARNING "%s: failed to create send CQ\n", 
ca->name);
                goto out_free_pd;
        }
 
-       if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
-               goto out_free_cq;
+       if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+               goto out_free_send_cq;
+
+
+       priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_recv_completion, 
NULL, dev,
+                               ipoib_recvq_size + 1);
+       if (IS_ERR(priv->recv_cq)) {
+               printk(KERN_WARNING "%s: failed to create recv CQ\n", 
ca->name);
+               goto out_free_send_cq;
+       }
+
+       if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+               goto out_free_recv_cq;
 
        priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(priv->mr)) {
                printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", 
ca->name);
-               goto out_free_cq;
+               goto out_free_recv_cq;
        }
 
-       init_attr.send_cq = priv->cq;
-       init_attr.recv_cq = priv->cq,
+       init_attr.send_cq = priv->send_cq;
+       init_attr.recv_cq = priv->recv_cq,
 
        priv->qp = ib_create_qp(priv->pd, &init_attr);
        if (IS_ERR(priv->qp)) {
@@ -215,8 +226,11 @@
 out_free_mr:
        ib_dereg_mr(priv->mr);
 
-out_free_cq:
-       ib_destroy_cq(priv->cq);
+out_free_recv_cq:
+       ib_destroy_cq(priv->recv_cq);
+
+out_free_send_cq:
+       ib_destroy_cq(priv->send_cq);
 
 out_free_pd:
        ib_dealloc_pd(priv->pd);
@@ -238,7 +252,10 @@
        if (ib_dereg_mr(priv->mr))
                ipoib_warn(priv, "ib_dereg_mr failed\n");
 
-       if (ib_destroy_cq(priv->cq))
+       if (ib_destroy_cq(priv->send_cq))
+               ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+       if (ib_destroy_cq(priv->recv_cq))
                ipoib_warn(priv, "ib_cq_destroy failed\n");
 
        if (ib_dealloc_pd(priv->pd))





Thanks
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone(Fax): (503) 578-7638

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060419/a82ac19a/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cq.tune.patch
Type: application/octet-stream
Size: 13025 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060419/a82ac19a/attachment.obj>


More information about the general mailing list