[openib-general] [PATCH] splitting IPoIB CQ

Shirley Ma xma at us.ibm.com
Fri Apr 14 12:20:57 PDT 2006


Hello Roland,

Here is the patch to split IPoIB CQ into send CQ and recv CQ. 

Some tests have been done over mthca and ehca. In a unidirectional stream
test, this patch gains up to 15% throughput on systems with more than 4
CPUs. Bidirectional traffic could gain more. People might see different
performance improvement numbers under different drivers and CPUs. I have
attached the patch for those who are willing to run the performance test
with different drivers. Please give your inputs.

The reason I have two separate WC handlers is that I am working on
another patch to optimize the send CQ and recv CQ separately.

Signed-off-by: Shirley Ma <xma at us.ibm.com>

diff -urpN infiniband/ulp/ipoib/ipoib.h infiniband-cq/ulp/ipoib/ipoib.h
--- infiniband/ulp/ipoib/ipoib.h        2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib.h     2006-04-12 16:55:57.000000000 
-0700
@@ -151,7 +151,8 @@ struct ipoib_dev_priv {
        u16               pkey;
        struct ib_pd     *pd;
        struct ib_mr     *mr;
-       struct ib_cq     *cq;
+       struct ib_cq     *send_cq;
+       struct ib_cq     *recv_cq;
        struct ib_qp     *qp;
        u32               qkey;
 
@@ -171,7 +172,8 @@ struct ipoib_dev_priv {
        struct ib_sge        tx_sge;
        struct ib_send_wr    tx_wr;
 
-       struct ib_wc ibwc[IPOIB_NUM_WC];
+       struct ib_wc *send_ibwc;
+       struct ib_wc *recv_ibwc;
 
        struct list_head dead_ahs;
 
@@ -245,7 +247,8 @@ extern struct workqueue_struct *ipoib_wo
 
 /* functions */
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
                                 struct ib_pd *pd, struct ib_ah_attr 
*attr);
diff -urpN infiniband/ulp/ipoib/ipoib_ib.c 
infiniband-cq/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c     2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_ib.c  2006-04-14 12:49:51.113116736 
-0700
@@ -50,7 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
                 "Enable data path debug tracing if > 0");
 #endif
 
-#define        IPOIB_OP_RECV   (1ul << 31)
 
 static DEFINE_MUTEX(pkey_mutex);
 
@@ -108,7 +107,7 @@ static int ipoib_ib_post_receive(struct 
        list.lkey     = priv->mr->lkey;
 
        param.next    = NULL;
-       param.wr_id   = id | IPOIB_OP_RECV;
+       param.wr_id   = id;
        param.sg_list = &list;
        param.num_sge = 1;
 
@@ -175,8 +174,8 @@ static int ipoib_ib_post_receives(struct
        return 0;
 }
 
-static void ipoib_ib_handle_wc(struct net_device *dev,
-                              struct ib_wc *wc)
+static void ipoib_ib_handle_recv_wc(struct net_device *dev,
+                                   struct ib_wc *wc)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        unsigned int wr_id = wc->wr_id;
@@ -184,110 +183,129 @@ static void ipoib_ib_handle_wc(struct ne
        ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
                       wr_id, wc->opcode, wc->status);
 
-       if (wr_id & IPOIB_OP_RECV) {
-               wr_id &= ~IPOIB_OP_RECV;
-
-               if (wr_id < ipoib_recvq_size) {
-                       struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
-                       dma_addr_t      addr = 
priv->rx_ring[wr_id].mapping;
-
-                       if (unlikely(wc->status != IB_WC_SUCCESS)) {
-                               if (wc->status != IB_WC_WR_FLUSH_ERR)
-                                       ipoib_warn(priv, "failed recv 
event "
-                                                  "(status=%d, wrid=%d 
vend_err %x)\n",
-                                                  wc->status, wr_id, 
wc->vendor_err);
-                               dma_unmap_single(priv->ca->dma_device, 
addr,
-                                                IPOIB_BUF_SIZE, 
DMA_FROM_DEVICE);
-                               dev_kfree_skb_any(skb);
-                               priv->rx_ring[wr_id].skb = NULL;
-                               return;
-                       }
-
-                       /*
-                        * If we can't allocate a new RX buffer, dump
-                        * this packet and reuse the old buffer.
-                        */
-                       if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
-                               ++priv->stats.rx_dropped;
-                               goto repost;
-                       }
-
-                       ipoib_dbg_data(priv, "received %d bytes, SLID 
0x%04x\n",
-                                      wc->byte_len, wc->slid);
 
+       if (wr_id < ipoib_recvq_size) {
+               struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
+               dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
+
+               if (unlikely(wc->status != IB_WC_SUCCESS)) {
+                       if (wc->status != IB_WC_WR_FLUSH_ERR)
+                               ipoib_warn(priv, "failed recv event "
+                                          "(status=%d, wrid=%d vend_err 
%x)\n",
+                                          wc->status, wr_id, 
wc->vendor_err);
                        dma_unmap_single(priv->ca->dma_device, addr,
                                         IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+                       dev_kfree_skb_any(skb);
+                       priv->rx_ring[wr_id].skb = NULL;
+                       return;
+               }
 
-                       skb_put(skb, wc->byte_len);
-                       skb_pull(skb, IB_GRH_BYTES);
+               /*
+                * If we can't allocate a new RX buffer, dump
+                * this packet and reuse the old buffer.
+                */
+               if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+                       ++priv->stats.rx_dropped;
+                       goto repost;
+               }
 
-                       if (wc->slid != priv->local_lid ||
-                           wc->src_qp != priv->qp->qp_num) {
-                               skb->protocol = ((struct ipoib_header *) 
skb->data)->proto;
-                               skb->mac.raw = skb->data;
-                               skb_pull(skb, IPOIB_ENCAP_LEN);
-
-                               dev->last_rx = jiffies;
-                               ++priv->stats.rx_packets;
-                               priv->stats.rx_bytes += skb->len;
-
-                               skb->dev = dev;
-                               /* XXX get correct PACKET_ type here */
-                               skb->pkt_type = PACKET_HOST;
-                               netif_rx_ni(skb);
-                       } else {
-                               ipoib_dbg_data(priv, "dropping loopback 
packet\n");
-                               dev_kfree_skb_any(skb);
-                       }
+               ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+                              wc->byte_len, wc->slid);
 
-               repost:
-                       if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
-                               ipoib_warn(priv, "ipoib_ib_post_receive 
failed "
-                                          "for buf %d\n", wr_id);
-               } else
-                       ipoib_warn(priv, "completion event with wrid 
%d\n",
-                                  wr_id);
+               dma_unmap_single(priv->ca->dma_device, addr,
+                                IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
 
-       } else {
-               struct ipoib_tx_buf *tx_req;
-               unsigned long flags;
+               skb_put(skb, wc->byte_len);
+               skb_pull(skb, IB_GRH_BYTES);
 
-               if (wr_id >= ipoib_sendq_size) {
-                       ipoib_warn(priv, "completion event with wrid %d (> 
%d)\n",
-                                  wr_id, ipoib_sendq_size);
-                       return;
+               if (wc->slid != priv->local_lid ||
+                   wc->src_qp != priv->qp->qp_num) {
+                       skb->protocol = ((struct ipoib_header *) 
skb->data)->proto;
+                       skb->mac.raw = skb->data;
+                       skb_pull(skb, IPOIB_ENCAP_LEN);
+
+                       dev->last_rx = jiffies;
+                       ++priv->stats.rx_packets;
+                       priv->stats.rx_bytes += skb->len;
+
+                       skb->dev = dev;
+                       /* XXX get correct PACKET_ type here */
+                       skb->pkt_type = PACKET_HOST;
+                       netif_rx_ni(skb);
+               } else {
+                       ipoib_dbg_data(priv, "dropping loopback 
packet\n");
+                       dev_kfree_skb_any(skb);
                }
 
-               ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+       repost:
+               if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+                       ipoib_warn(priv, "ipoib_ib_post_receive failed "
+                                  "for buf %d\n", wr_id);
+       } else
+               ipoib_warn(priv, "completion event with wrid %d\n",
+                          wr_id);
+}
 
-               tx_req = &priv->tx_ring[wr_id];
+static void ipoib_ib_handle_send_wc(struct net_device *dev,
+                                   struct ib_wc *wc)
+{
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       unsigned int wr_id = wc->wr_id;
+       struct ipoib_tx_buf *tx_req;
+       unsigned long flags;
 
-               dma_unmap_single(priv->ca->dma_device,
-                                pci_unmap_addr(tx_req, mapping),
-                                tx_req->skb->len,
-                                DMA_TO_DEVICE);
+       ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
+                      wr_id, wc->opcode, wc->status);
 
-               ++priv->stats.tx_packets;
-               priv->stats.tx_bytes += tx_req->skb->len;
+       if (wr_id >= ipoib_sendq_size) {
+               ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+                          wr_id, ipoib_sendq_size);
+               return;
+       }
 
-               dev_kfree_skb_any(tx_req->skb);
+       ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
 
-               spin_lock_irqsave(&priv->tx_lock, flags);
-               ++priv->tx_tail;
-               if (netif_queue_stopped(dev) &&
-                   priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 
1)
-                       netif_wake_queue(dev);
-               spin_unlock_irqrestore(&priv->tx_lock, flags);
+       tx_req = &priv->tx_ring[wr_id];
 
-               if (wc->status != IB_WC_SUCCESS &&
-                   wc->status != IB_WC_WR_FLUSH_ERR)
-                       ipoib_warn(priv, "failed send event "
-                                  "(status=%d, wrid=%d vend_err %x)\n",
-                                  wc->status, wr_id, wc->vendor_err);
-       }
+       dma_unmap_single(priv->ca->dma_device,
+                        pci_unmap_addr(tx_req, mapping),
+                        tx_req->skb->len,
+                        DMA_TO_DEVICE);
+
+       ++priv->stats.tx_packets;
+       priv->stats.tx_bytes += tx_req->skb->len;
+
+       dev_kfree_skb_any(tx_req->skb);
+
+       spin_lock_irqsave(&priv->tx_lock, flags);
+       ++priv->tx_tail;
+       if (netif_queue_stopped(dev) &&
+           priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+               netif_wake_queue(dev);
+       spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+       if (wc->status != IB_WC_SUCCESS &&
+           wc->status != IB_WC_WR_FLUSH_ERR)
+               ipoib_warn(priv, "failed send event "
+                          "(status=%d, wrid=%d vend_err %x)\n",
+                          wc->status, wr_id, wc->vendor_err);
+}
+
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)
+{
+       struct net_device *dev = (struct net_device *) dev_ptr;
+       struct ipoib_dev_priv *priv = netdev_priv(dev);
+       int n, i;
+
+       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+       do {
+               n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->send_ibwc);
+               for (i = 0; i < n; ++i)
+                       ipoib_ib_handle_send_wc(dev, priv->send_ibwc + i);
+       } while (n == IPOIB_NUM_WC);
 }
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr)
 {
        struct net_device *dev = (struct net_device *) dev_ptr;
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -295,9 +313,9 @@ void ipoib_ib_completion(struct ib_cq *c
 
        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
        do {
-               n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+               n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->recv_ibwc);
                for (i = 0; i < n; ++i)
-                       ipoib_ib_handle_wc(dev, priv->ibwc + i);
+                       ipoib_ib_handle_recv_wc(dev, priv->recv_ibwc + i);
        } while (n == IPOIB_NUM_WC);
 }
 
diff -urpN infiniband/ulp/ipoib/ipoib_main.c 
infiniband-cq/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c   2006-04-12 16:43:38.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_main.c        2006-04-14 
12:40:27.833748216 -0700
@@ -863,11 +863,25 @@ int ipoib_dev_init(struct net_device *de
 
        /* priv->tx_head & tx_tail are already 0 */
 
-       if (ipoib_ib_dev_init(dev, ca, port))
+       priv->send_ibwc = kzalloc(IPOIB_NUM_WC * sizeof(struct ib_wc), 
GFP_KERNEL);
+       if (!priv->send_ibwc)
                goto out_tx_ring_cleanup;
 
+       priv->recv_ibwc = kzalloc(IPOIB_NUM_WC * sizeof(struct ib_wc), 
GFP_KERNEL);
+       if (!priv->recv_ibwc)
+               goto out_send_ibwc_cleanup;
+
+       if (ipoib_ib_dev_init(dev, ca, port))
+               goto out_recv_ibwc_cleanup;
+
        return 0;
 
+out_recv_ibwc_cleanup:
+       kfree(priv->recv_ibwc);
+
+out_send_ibwc_cleanup:
+       kfree(priv->send_ibwc);
+
 out_tx_ring_cleanup:
        kfree(priv->tx_ring);
 
@@ -895,9 +909,15 @@ void ipoib_dev_cleanup(struct net_device
 
        kfree(priv->rx_ring);
        kfree(priv->tx_ring);
-
+ 
        priv->rx_ring = NULL;
        priv->tx_ring = NULL;
+
+       kfree(priv->send_ibwc);
+       kfree(priv->recv_ibwc);
+
+       priv->send_ibwc = NULL;
+       priv->recv_ibwc = NULL;
 }
 
 static void ipoib_setup(struct net_device *dev)
diff -urpN infiniband/ulp/ipoib/ipoib_verbs.c 
infiniband-cq/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c  2006-04-05 17:43:18.000000000 
-0700
+++ infiniband-cq/ulp/ipoib/ipoib_verbs.c       2006-04-12 
19:14:41.000000000 -0700
@@ -174,24 +174,35 @@ int ipoib_transport_dev_init(struct net_
                return -ENODEV;
        }
 
-       priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-                               ipoib_sendq_size + ipoib_recvq_size + 1);
-       if (IS_ERR(priv->cq)) {
-               printk(KERN_WARNING "%s: failed to create CQ\n", 
ca->name);
+       priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_send_completion, 
NULL, dev,
+                               ipoib_sendq_size + 1);
+       if (IS_ERR(priv->send_cq)) {
+               printk(KERN_WARNING "%s: failed to create send CQ\n", 
ca->name);
                goto out_free_pd;
        }
 
-       if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
-               goto out_free_cq;
+       if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+               goto out_free_send_cq;
+
+
+       priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_recv_completion, 
NULL, dev,
+                               ipoib_recvq_size + 1);
+       if (IS_ERR(priv->recv_cq)) {
+               printk(KERN_WARNING "%s: failed to create recv CQ\n", 
ca->name);
+               goto out_free_send_cq;
+       }
+
+       if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+               goto out_free_recv_cq;
 
        priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(priv->mr)) {
                printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", 
ca->name);
-               goto out_free_cq;
+               goto out_free_recv_cq;
        }
 
-       init_attr.send_cq = priv->cq;
-       init_attr.recv_cq = priv->cq,
+       init_attr.send_cq = priv->send_cq;
+       init_attr.recv_cq = priv->recv_cq,
 
        priv->qp = ib_create_qp(priv->pd, &init_attr);
        if (IS_ERR(priv->qp)) {
@@ -215,8 +226,11 @@ int ipoib_transport_dev_init(struct net_
 out_free_mr:
        ib_dereg_mr(priv->mr);
 
-out_free_cq:
-       ib_destroy_cq(priv->cq);
+out_free_recv_cq:
+       ib_destroy_cq(priv->recv_cq);
+
+out_free_send_cq:
+       ib_destroy_cq(priv->send_cq);
 
 out_free_pd:
        ib_dealloc_pd(priv->pd);
@@ -238,7 +252,10 @@ void ipoib_transport_dev_cleanup(struct 
        if (ib_dereg_mr(priv->mr))
                ipoib_warn(priv, "ib_dereg_mr failed\n");
 
-       if (ib_destroy_cq(priv->cq))
+       if (ib_destroy_cq(priv->send_cq))
+               ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+       if (ib_destroy_cq(priv->recv_cq))
                ipoib_warn(priv, "ib_cq_destroy failed\n");
 
        if (ib_dealloc_pd(priv->pd))






Thanks
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone(Fax): (503) 578-7638

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060414/1864a696/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: cq.test.patch
Type: application/octet-stream
Size: 12581 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060414/1864a696/attachment.obj>


More information about the general mailing list