Hello Roland,

Here is the patch to split the IPoIB CQ into a send CQ and a recv CQ.

Some tests have been done over mthca and ehca. A unidirectional stream test
gains up to 15% throughput with this patch on systems with more than 4 CPUs,
and bidirectional traffic could gain more. The improvement will vary with the
driver and the number of CPUs, so I have attached the patch for those who are
willing to run the performance test with different drivers. Please send me
your input.

The reason I have two separate WC handlers is that I am working on another
patch to optimize the send CQ and the recv CQ separately.
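
For reference, the core of the change at the verbs level looks roughly like
the sketch below (illustrative only, with simplified error handling; the real
changes are in the patch that follows): each direction gets its own CQ with
its own completion handler, and both CQs are passed into the QP's init
attributes.

	/* Sketch: one CQ per direction, each armed and wired to its own
	 * completion handler; the UD QP then uses the two CQs separately.
	 */
	send_cq = ib_create_cq(ca, ipoib_ib_send_completion, NULL, dev,
			       ipoib_sendq_size + 1);
	recv_cq = ib_create_cq(ca, ipoib_ib_recv_completion, NULL, dev,
			       ipoib_recvq_size + 1);

	ib_req_notify_cq(send_cq, IB_CQ_NEXT_COMP);
	ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);

	init_attr.send_cq = send_cq;
	init_attr.recv_cq = recv_cq;
	qp = ib_create_qp(pd, &init_attr);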

Signed-off-by: Shirley Ma <xma@us.ibm.com>

diff -urpN infiniband/ulp/ipoib/ipoib.h infiniband-cq/ulp/ipoib/ipoib.h
--- infiniband/ulp/ipoib/ipoib.h	2006-04-05 17:43:18.000000000 -0700
+++ infiniband-cq/ulp/ipoib/ipoib.h	2006-04-12 16:55:57.000000000 -0700
@@ -151,7 +151,8 @@ struct ipoib_dev_priv {
 	u16		  pkey;
 	struct ib_pd	 *pd;
 	struct ib_mr	 *mr;
-	struct ib_cq	 *cq;
+	struct ib_cq	 *send_cq;
+	struct ib_cq	 *recv_cq;
 	struct ib_qp	 *qp;
 	u32		  qkey;
 
@@ -171,7 +172,8 @@ struct ipoib_dev_priv {
 	struct ib_sge        tx_sge;
 	struct ib_send_wr    tx_wr;
 
-	struct ib_wc ibwc[IPOIB_NUM_WC];
+	struct ib_wc *send_ibwc;
+	struct ib_wc *recv_ibwc;
 
 	struct list_head dead_ahs;
 
@@ -245,7 +247,8 @@ extern struct workqueue_struct *ipoib_wo
 
 /* functions */
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr);
 
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
 				 struct ib_pd *pd, struct ib_ah_attr *attr);
diff -urpN infiniband/ulp/ipoib/ipoib_ib.c infiniband-cq/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c	2006-04-05 17:43:18.000000000 -0700
+++ infiniband-cq/ulp/ipoib/ipoib_ib.c	2006-04-14 12:49:51.113116736 -0700
@@ -50,7 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
 		 "Enable data path debug tracing if > 0");
 #endif
 
-#define	IPOIB_OP_RECV   (1ul << 31)
 
 static DEFINE_MUTEX(pkey_mutex);
 
@@ -108,7 +107,7 @@ static int ipoib_ib_post_receive(struct 
 	list.lkey     = priv->mr->lkey;
 
 	param.next    = NULL;
-	param.wr_id   = id | IPOIB_OP_RECV;
+	param.wr_id   = id;
 	param.sg_list = &list;
 	param.num_sge = 1;
 
@@ -175,8 +174,8 @@ static int ipoib_ib_post_receives(struct
 	return 0;
 }
 
-static void ipoib_ib_handle_wc(struct net_device *dev,
-			       struct ib_wc *wc)
+static void ipoib_ib_handle_recv_wc(struct net_device *dev,
+				    struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id;
@@ -184,110 +183,129 @@ static void ipoib_ib_handle_wc(struct ne
 	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
 		       wr_id, wc->opcode, wc->status);
 
-	if (wr_id & IPOIB_OP_RECV) {
-		wr_id &= ~IPOIB_OP_RECV;
-
-		if (wr_id < ipoib_recvq_size) {
-			struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
-			dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
-
-			if (unlikely(wc->status != IB_WC_SUCCESS)) {
-				if (wc->status != IB_WC_WR_FLUSH_ERR)
-					ipoib_warn(priv, "failed recv event "
-						   "(status=%d, wrid=%d vend_err %x)\n",
-						   wc->status, wr_id, wc->vendor_err);
-				dma_unmap_single(priv->ca->dma_device, addr,
-						 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
-				dev_kfree_skb_any(skb);
-				priv->rx_ring[wr_id].skb = NULL;
-				return;
-			}
-
-			/*
-			 * If we can't allocate a new RX buffer, dump
-			 * this packet and reuse the old buffer.
-			 */
-			if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
-				++priv->stats.rx_dropped;
-				goto repost;
-			}
-
-			ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
-				       wc->byte_len, wc->slid);
 
+	if (wr_id < ipoib_recvq_size) {
+		struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
+		dma_addr_t      addr = priv->rx_ring[wr_id].mapping;
+
+		if (unlikely(wc->status != IB_WC_SUCCESS)) {
+			if (wc->status != IB_WC_WR_FLUSH_ERR)
+				ipoib_warn(priv, "failed recv event "
+					   "(status=%d, wrid=%d vend_err %x)\n",
+					   wc->status, wr_id, wc->vendor_err);
 			dma_unmap_single(priv->ca->dma_device, addr,
 					 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+			dev_kfree_skb_any(skb);
+			priv->rx_ring[wr_id].skb = NULL;
+			return;
+		}
 
-			skb_put(skb, wc->byte_len);
-			skb_pull(skb, IB_GRH_BYTES);
+		/*
+		 * If we can't allocate a new RX buffer, dump
+		 * this packet and reuse the old buffer.
+		 */
+		if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+			++priv->stats.rx_dropped;
+			goto repost;
+		}
 
-			if (wc->slid != priv->local_lid ||
-			    wc->src_qp != priv->qp->qp_num) {
-				skb->protocol = ((struct ipoib_header *) skb->data)->proto;
-				skb->mac.raw = skb->data;
-				skb_pull(skb, IPOIB_ENCAP_LEN);
-
-				dev->last_rx = jiffies;
-				++priv->stats.rx_packets;
-				priv->stats.rx_bytes += skb->len;
-
-				skb->dev = dev;
-				/* XXX get correct PACKET_ type here */
-				skb->pkt_type = PACKET_HOST;
-				netif_rx_ni(skb);
-			} else {
-				ipoib_dbg_data(priv, "dropping loopback packet\n");
-				dev_kfree_skb_any(skb);
-			}
+		ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+			       wc->byte_len, wc->slid);
 
-		repost:
-			if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
-				ipoib_warn(priv, "ipoib_ib_post_receive failed "
-					   "for buf %d\n", wr_id);
-		} else
-			ipoib_warn(priv, "completion event with wrid %d\n",
-				   wr_id);
+		dma_unmap_single(priv->ca->dma_device, addr,
+				 IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
 
-	} else {
-		struct ipoib_tx_buf *tx_req;
-		unsigned long flags;
+		skb_put(skb, wc->byte_len);
+		skb_pull(skb, IB_GRH_BYTES);
 
-		if (wr_id >= ipoib_sendq_size) {
-			ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
-				   wr_id, ipoib_sendq_size);
-			return;
+		if (wc->slid != priv->local_lid ||
+		    wc->src_qp != priv->qp->qp_num) {
+			skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+			skb->mac.raw = skb->data;
+			skb_pull(skb, IPOIB_ENCAP_LEN);
+
+			dev->last_rx = jiffies;
+			++priv->stats.rx_packets;
+			priv->stats.rx_bytes += skb->len;
+
+			skb->dev = dev;
+			/* XXX get correct PACKET_ type here */
+			skb->pkt_type = PACKET_HOST;
+			netif_rx_ni(skb);
+		} else {
+			ipoib_dbg_data(priv, "dropping loopback packet\n");
+			dev_kfree_skb_any(skb);
 		}
 
-		ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
+	repost:
+		if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+			ipoib_warn(priv, "ipoib_ib_post_receive failed "
+				   "for buf %d\n", wr_id);
+	} else
+		ipoib_warn(priv, "completion event with wrid %d\n",
+			   wr_id);
+}
 
-		tx_req = &priv->tx_ring[wr_id];
+static void ipoib_ib_handle_send_wc(struct net_device *dev,
+				    struct ib_wc *wc)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	unsigned int wr_id = wc->wr_id;
+	struct ipoib_tx_buf *tx_req;
+	unsigned long flags;
 
-		dma_unmap_single(priv->ca->dma_device,
-				 pci_unmap_addr(tx_req, mapping),
-				 tx_req->skb->len,
-				 DMA_TO_DEVICE);
+	ipoib_dbg_data(priv, "called: id %d, op %d, status: %d\n",
		       wr_id, wc->opcode, wc->status);
 
-		++priv->stats.tx_packets;
-		priv->stats.tx_bytes += tx_req->skb->len;
+	if (wr_id >= ipoib_sendq_size) {
+		ipoib_warn(priv, "completion event with wrid %d (> %d)\n",
+			   wr_id, ipoib_sendq_size);
+		return;
+	}
 
-		dev_kfree_skb_any(tx_req->skb);
+	ipoib_dbg_data(priv, "send complete, wrid %d\n", wr_id);
 
-		spin_lock_irqsave(&priv->tx_lock, flags);
-		++priv->tx_tail;
-		if (netif_queue_stopped(dev) &&
-		    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
-			netif_wake_queue(dev);
-		spin_unlock_irqrestore(&priv->tx_lock, flags);
+	tx_req = &priv->tx_ring[wr_id];
 
-		if (wc->status != IB_WC_SUCCESS &&
-		    wc->status != IB_WC_WR_FLUSH_ERR)
-			ipoib_warn(priv, "failed send event "
-				   "(status=%d, wrid=%d vend_err %x)\n",
-				   wc->status, wr_id, wc->vendor_err);
-	}
+	dma_unmap_single(priv->ca->dma_device,
+			 pci_unmap_addr(tx_req, mapping),
+			 tx_req->skb->len,
+			 DMA_TO_DEVICE);
+
+	++priv->stats.tx_packets;
+	priv->stats.tx_bytes += tx_req->skb->len;
+
+	dev_kfree_skb_any(tx_req->skb);
+
+	spin_lock_irqsave(&priv->tx_lock, flags);
+	++priv->tx_tail;
+	if (netif_queue_stopped(dev) &&
+	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
+		netif_wake_queue(dev);
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	if (wc->status != IB_WC_SUCCESS &&
+	    wc->status != IB_WC_WR_FLUSH_ERR)
+		ipoib_warn(priv, "failed send event "
+			   "(status=%d, wrid=%d vend_err %x)\n",
+			   wc->status, wr_id, wc->vendor_err);
+}
+
+void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)
+{
+	struct net_device *dev = (struct net_device *) dev_ptr;
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int n, i;
+
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+	do {
+		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->send_ibwc);
+		for (i = 0; i < n; ++i)
+			ipoib_ib_handle_send_wc(dev, priv->send_ibwc + i);
+	} while (n == IPOIB_NUM_WC);
 }
 
-void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+void ipoib_ib_recv_completion(struct ib_cq *cq, void *dev_ptr)
 {
 	struct net_device *dev = (struct net_device *) dev_ptr;
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -295,9 +313,9 @@ void ipoib_ib_completion(struct ib_cq *c
 
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 	do {
-		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->ibwc);
+		n = ib_poll_cq(cq, IPOIB_NUM_WC, priv->recv_ibwc);
 		for (i = 0; i < n; ++i)
-			ipoib_ib_handle_wc(dev, priv->ibwc + i);
+			ipoib_ib_handle_recv_wc(dev, priv->recv_ibwc + i);
 	} while (n == IPOIB_NUM_WC);
 }
 
diff -urpN infiniband/ulp/ipoib/ipoib_main.c infiniband-cq/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c	2006-04-12 16:43:38.000000000 -0700
+++ infiniband-cq/ulp/ipoib/ipoib_main.c	2006-04-14 12:40:27.833748216 -0700
@@ -863,11 +863,25 @@ int ipoib_dev_init(struct net_device *de
 
 	/* priv->tx_head & tx_tail are already 0 */
 
-	if (ipoib_ib_dev_init(dev, ca, port))
+	priv->send_ibwc = kzalloc(IPOIB_NUM_WC * sizeof(struct ib_wc), GFP_KERNEL);
+	if (!priv->send_ibwc)
 		goto out_tx_ring_cleanup;
 
+	priv->recv_ibwc = kzalloc(IPOIB_NUM_WC * sizeof(struct ib_wc), GFP_KERNEL);
+	if (!priv->recv_ibwc)
+		goto out_send_ibwc_cleanup;
+
+	if (ipoib_ib_dev_init(dev, ca, port))
+		goto out_recv_ibwc_cleanup;
+
 	return 0;
 
+out_recv_ibwc_cleanup:
+	kfree(priv->recv_ibwc);
+
+out_send_ibwc_cleanup:
+	kfree(priv->send_ibwc);
+
 out_tx_ring_cleanup:
 	kfree(priv->tx_ring);
 
@@ -895,9 +909,15 @@ void ipoib_dev_cleanup(struct net_device
 
 	kfree(priv->rx_ring);
 	kfree(priv->tx_ring);
-
+
 	priv->rx_ring = NULL;
 	priv->tx_ring = NULL;
+
+	kfree(priv->send_ibwc);
+	kfree(priv->recv_ibwc);
+
+	priv->send_ibwc = NULL;
+	priv->recv_ibwc = NULL;
 }
 
 static void ipoib_setup(struct net_device *dev)
diff -urpN infiniband/ulp/ipoib/ipoib_verbs.c infiniband-cq/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c	2006-04-05 17:43:18.000000000 -0700
+++ infiniband-cq/ulp/ipoib/ipoib_verbs.c	2006-04-12 19:14:41.000000000 -0700
@@ -174,24 +174,35 @@ int ipoib_transport_dev_init(struct net_
 		return -ENODEV;
 	}
 
-	priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-				ipoib_sendq_size + ipoib_recvq_size + 1);
-	if (IS_ERR(priv->cq)) {
-		printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
+	priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_send_completion, NULL, dev,
+				     ipoib_sendq_size + 1);
+	if (IS_ERR(priv->send_cq)) {
+		printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
 		goto out_free_pd;
 	}
 
-	if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
-		goto out_free_cq;
+	if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+		goto out_free_send_cq;
+
+
+	priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_recv_completion, NULL, dev,
+				     ipoib_recvq_size + 1);
+	if (IS_ERR(priv->recv_cq)) {
+		printk(KERN_WARNING "%s: failed to create recv CQ\n", ca->name);
+		goto out_free_send_cq;
+	}
+
+	if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+		goto out_free_recv_cq;
 
 	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(priv->mr)) {
 		printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
-		goto out_free_cq;
+		goto out_free_recv_cq;
 	}
 
-	init_attr.send_cq = priv->cq;
-	init_attr.recv_cq = priv->cq,
+	init_attr.send_cq = priv->send_cq;
+	init_attr.recv_cq = priv->recv_cq,
 
 	priv->qp = ib_create_qp(priv->pd, &init_attr);
 	if (IS_ERR(priv->qp)) {
@@ -215,8 +226,11 @@ int ipoib_transport_dev_init(struct net_
 out_free_mr:
 	ib_dereg_mr(priv->mr);
 
-out_free_cq:
-	ib_destroy_cq(priv->cq);
+out_free_recv_cq:
+	ib_destroy_cq(priv->recv_cq);
+
+out_free_send_cq:
+	ib_destroy_cq(priv->send_cq);
 
 out_free_pd:
 	ib_dealloc_pd(priv->pd);
@@ -238,7 +252,10 @@ void ipoib_transport_dev_cleanup(struct 
 	if (ib_dereg_mr(priv->mr))
 		ipoib_warn(priv, "ib_dereg_mr failed\n");
 
-	if (ib_destroy_cq(priv->cq))
+	if (ib_destroy_cq(priv->send_cq))
+		ipoib_warn(priv, "ib_cq_destroy failed\n");
+
+	if (ib_destroy_cq(priv->recv_cq))
 		ipoib_warn(priv, "ib_cq_destroy failed\n");
 
 	if (ib_dealloc_pd(priv->pd))

Thanks,
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone (Fax): (503) 578-7638