<br><font size=2 face="sans-serif">Hello Roland,</font>
<br>
<br><font size=2 face="sans-serif">Here is the patch to tune the IPoIB QP send/recv
queue sizes. Please review. The attached file is provided so that you can apply the
patch.</font>
<br><font size=2 face="sans-serif">This patch includes:</font>
<br><font size=2 face="sans-serif">        a.
these two parameters are module parameters</font>
<br><font size=2 face="sans-serif">        b.
they are saved per device in struct ipoib_dev_priv</font>
<br><font size=2 face="sans-serif">        c.
the QP size max value=4k, min value=64</font>
<br><font size=2 face="sans-serif">        d.
the QP size is forced to a power of 2, because tx_ring is indexed using (size - 1) as a mask</font>
<br><font size=2 face="sans-serif">        e.
these QP sizes are logged in /var/log/messages.</font>
<br><font size=2 face="sans-serif">        f.
rename IPOIB_TX_RING_SIZE/IPOIB_RX_RING_SIZE to IPOIB_SENDQ_SIZE/IPOIB_RECVQ_SIZE. The "ring" here
is only a place to save pointers, not an actual ring.</font>
<br><font size=2 face="sans-serif">        </font>
<br><font size=2 face="sans-serif">This patch addresses packet retransmission/timeout
issues in large cluster environments. Performance also improves dramatically
when these parameters are tuned: throughput is about 7 times better
than with the default values, according to our MPI test results on a cluster.</font>
<br>
<br><font size=2 face="sans-serif">Signed-off-by: Shirley Ma <xma@us.ibm.com></font>
<br>
<div>
<br><font size=2 face="sans-serif">diff -urN infiniband/ulp/ipoib/ipoib.h
infiniband-queue/ulp/ipoib/ipoib.h<br>
--- infiniband/ulp/ipoib/ipoib.h        2006-03-26
11:57:15.000000000 -0800<br>
+++ infiniband-queue/ulp/ipoib/ipoib.h        2006-03-31
08:46:34.171748048 -0800<br>
@@ -66,8 +66,8 @@<br>
 <br>
         IPOIB_ENCAP_LEN      
    = 4,<br>
 <br>
-        IPOIB_RX_RING_SIZE      
    = 128,<br>
-        IPOIB_TX_RING_SIZE      
    = 64,<br>
+        IPOIB_SENDQ_SIZE      
    = 64,<br>
+        IPOIB_RECVQ_SIZE      
    = 128,<br>
 <br>
         IPOIB_NUM_WC      
            = 4,<br>
 <br>
@@ -186,6 +186,8 @@<br>
         struct dentry *mcg_dentry;<br>
         struct dentry *path_dentry;<br>
 #endif<br>
+        int        sendq_size;<br>
+        int         recvq_size;<br>
 };<br>
 <br>
 struct ipoib_ah {<br>
@@ -338,6 +340,8 @@<br>
 #define ipoib_warn(priv, format, arg...)        
       \<br>
         ipoib_printk(KERN_WARNING, priv, format
, ## arg)<br>
 <br>
+extern int ipoib_sendq_size;<br>
+extern int ipoib_recvq_size;<br>
 <br>
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG<br>
 extern int ipoib_debug_level;<br>
diff -urN infiniband/ulp/ipoib/ipoib_ib.c infiniband-queue/ulp/ipoib/ipoib_ib.c<br>
--- infiniband/ulp/ipoib/ipoib_ib.c        2006-03-26
11:57:15.000000000 -0800<br>
+++ infiniband-queue/ulp/ipoib/ipoib_ib.c        2006-03-31
08:46:34.227739536 -0800</font>
<br><font size=2 face="sans-serif">@@ -161,7 +161,7 @@<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
         int i;<br>
 <br>
-        for (i = 0; i < IPOIB_RX_RING_SIZE;
++i) {<br>
+        for (i = 0; i < priv->recvq_size;
++i) {<br>
                 if
(ipoib_alloc_rx_skb(dev, i)) {<br>
                  
      ipoib_warn(priv, "failed to allocate
receive buffer %d\n", i);<br>
                  
      return -ENOMEM;<br>
@@ -187,7 +187,7 @@<br>
         if (wr_id & IPOIB_OP_RECV) {<br>
                 wr_id
&= ~IPOIB_OP_RECV;<br>
 <br>
-                if
(wr_id < IPOIB_RX_RING_SIZE) {<br>
+                if
(wr_id < priv->recvq_size) {<br>
                  
      struct sk_buff *skb  = priv->rx_ring[wr_id].skb;<br>
                  
      dma_addr_t      addr = priv->rx_ring[wr_id].mapping;<br>
 <br>
@@ -252,9 +252,9 @@<br>
                 struct
ipoib_tx_buf *tx_req;<br>
                 unsigned
long flags;<br>
 <br>
-                if
(wr_id >= IPOIB_TX_RING_SIZE) {<br>
+                if
(wr_id >= priv->sendq_size) {<br>
                  
      ipoib_warn(priv, "completion event with
wrid %d (> %d)\n",<br>
-                
               
  wr_id, IPOIB_TX_RING_SIZE);<br>
+                
               
  wr_id, priv->sendq_size);<br>
                  
      return;<br>
                 }<br>
 <br>
@@ -275,7 +275,7 @@<br>
                 spin_lock_irqsave(&priv->tx_lock,
flags);<br>
                 ++priv->tx_tail;<br>
                 if
(netif_queue_stopped(dev) &&<br>
-                
   priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE
/ 2)</font>
<br><font size=2 face="sans-serif">+        
           priv->tx_head - priv->tx_tail
<= priv->sendq_size / 2)<br>
                  
      netif_wake_queue(dev);<br>
                 spin_unlock_irqrestore(&priv->tx_lock,
flags);<br>
 <br>
@@ -344,13 +344,13 @@<br>
          * means we have to make sure
everything is properly recorded and<br>
          * our state is consistent before
we call post_send().<br>
          */<br>
-        tx_req = &priv->tx_ring[priv->tx_head
& (IPOIB_TX_RING_SIZE - 1)];<br>
+        tx_req = &priv->tx_ring[priv->tx_head
& (priv->sendq_size - 1)];<br>
         tx_req->skb = skb;<br>
         addr = dma_map_single(priv->ca->dma_device,
skb->data, skb->len,<br>
                  
            DMA_TO_DEVICE);<br>
         pci_unmap_addr_set(tx_req, mapping,
addr);<br>
 <br>
-        if (unlikely(post_send(priv, priv->tx_head
& (IPOIB_TX_RING_SIZE - 1),<br>
+        if (unlikely(post_send(priv, priv->tx_head
& (priv->sendq_size - 1),<br>
                  
             address->ah,
qpn, addr, skb->len))) {<br>
                 ipoib_warn(priv,
"post_send failed\n");<br>
                 ++priv->stats.tx_errors;<br>
@@ -363,7 +363,7 @@<br>
                 address->last_send
= priv->tx_head;<br>
                 ++priv->tx_head;<br>
 <br>
-                if
(priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {<br>
+                if
(priv->tx_head - priv->tx_tail == priv->sendq_size) {</font>
<br><font size=2 face="sans-serif">         
               ipoib_dbg(priv,
"TX ring full, stopping kernel net queue\n");<br>
                  
      netif_stop_queue(dev);<br>
                 }<br>
@@ -488,7 +488,7 @@<br>
         int pending = 0;<br>
         int i;<br>
 <br>
-        for (i = 0; i < IPOIB_RX_RING_SIZE;
++i)<br>
+        for (i = 0; i < priv->recvq_size;
++i)<br>
                 if
(priv->rx_ring[i].skb)<br>
                  
      ++pending;<br>
 <br>
@@ -527,7 +527,7 @@<br>
                  
       */<br>
                  
      while ((int) priv->tx_tail - (int) priv->tx_head
< 0) {<br>
                  
              tx_req
= &priv->tx_ring[priv->tx_tail &<br>
-                
               
               
       (IPOIB_TX_RING_SIZE - 1)];<br>
+                
               
               
       (priv->sendq_size - 1)];<br>
                  
              dma_unmap_single(priv->ca->dma_device,<br>
                  
               
               pci_unmap_addr(tx_req,
mapping),<br>
                  
               
               tx_req->skb->len,<br>
@@ -536,7 +536,7 @@<br>
                  
              ++priv->tx_tail;<br>
                  
      }<br>
 <br>
-                
       for (i = 0; i < IPOIB_RX_RING_SIZE;
++i)<br>
+                
       for (i = 0; i < priv->recvq_size;
++i)<br>
                  
              if (priv->rx_ring[i].skb)
{<br>
                  
               
      dma_unmap_single(priv->ca->dma_device,<br>
                  
               
               
       pci_unmap_addr(&priv->rx_ring[i],<br>
diff -urN infiniband/ulp/ipoib/ipoib_main.c infiniband-queue/ulp/ipoib/ipoib_main.c<br>
--- infiniband/ulp/ipoib/ipoib_main.c        2006-03-28
19:20:21.000000000 -0800<br>
+++ infiniband-queue/ulp/ipoib/ipoib_main.c        2006-03-31
09:15:06.345458080 -0800</font>
<br><font size=2 face="sans-serif">@@ -53,6 +53,17 @@<br>
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");<br>
 MODULE_LICENSE("Dual BSD/GPL");<br>
 <br>
+#define IPOIB_MAX_QUEUE_SIZE        4096  
     /* max is 4k */<br>
+#define IPOIB_MIN_QUEUE_SIZE    64        /*
min is 64 */<br>
+<br>
+int ipoib_sendq_size = IPOIB_SENDQ_SIZE;<br>
+int ipoib_recvq_size = IPOIB_RECVQ_SIZE;<br>
+<br>
+module_param_named(sendq_size, ipoib_sendq_size, int, 0444);<br>
+MODULE_PARM_DESC(sendq_size, "Number of wqe in send queue");<br>
+module_param_named(recvq_size, ipoib_recvq_size, int, 0444);<br>
+MODULE_PARM_DESC(recvq_size, "Number of wqe in receive queue");<br>
+<br>
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG<br>
 int ipoib_debug_level;<br>
 <br>
@@ -837,27 +848,61 @@<br>
         return 0;<br>
 }<br>
 <br>
+static int expsize(int size)<br>
+{        <br>
+        int expsize_t = 1;<br>
+        int j = 1;<br>
+        while (size / 2 >= expsize_t) {<br>
+                expsize_t
= 1 << ++j;<br>
+        }<br>
+        return expsize_t;<br>
+}<br>
+<br>
 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)<br>
 {<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
 <br>
         /* Allocate RX/TX "rings"
to hold queued skbs */<br>
 <br>
-        priv->rx_ring =      
 kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf),</font>
<br><font size=2 face="sans-serif">+        if
(ipoib_recvq_size > IPOIB_MAX_QUEUE_SIZE) {<br>
+                ipoib_recvq_size
= IPOIB_MAX_QUEUE_SIZE;<br>
+                
printk(KERN_WARNING "%s: ipoib_recvq_size is too big, use max %d instead\n",
ca->name, IPOIB_MAX_QUEUE_SIZE);<br>
+        }<br>
+        if (ipoib_recvq_size < IPOIB_MIN_QUEUE_SIZE)
{<br>
+                ipoib_recvq_size
= IPOIB_MIN_QUEUE_SIZE;<br>
+                printk(KERN_WARNING
"%s: ipoib_recvq_size is too small, use min %d instead\n", ca->name,
IPOIB_MIN_QUEUE_SIZE);<br>
+        }<br>
+        priv->recvq_size = expsize(ipoib_recvq_size);<br>
+        priv->rx_ring =      
 kzalloc(priv->recvq_size * sizeof (struct ipoib_rx_buf),<br>
                  
              GFP_KERNEL);<br>
         if (!priv->rx_ring) {<br>
                 printk(KERN_WARNING
"%s: failed to allocate RX ring (%d entries)\n",<br>
-                
      ca->name, IPOIB_RX_RING_SIZE);<br>
+                
      ca->name, priv->sendq_size);<br>
                 goto
out;<br>
         }<br>
+        printk(KERN_INFO "%s: RX_RING_SIZE
is set to %d entries\n",<br>
+               ca->name,
priv->recvq_size);<br>
+<br>
+        if (ipoib_sendq_size > IPOIB_MAX_QUEUE_SIZE)
{<br>
+                ipoib_sendq_size
= IPOIB_MAX_QUEUE_SIZE;<br>
+                printk(KERN_WARNING
"%s: ipoib_sendq_size is too big, use max %d instead\n", ca->name,
IPOIB_MAX_QUEUE_SIZE);</font>
<br><font size=2 face="sans-serif">+        }<br>
+        if (ipoib_sendq_size < IPOIB_MIN_QUEUE_SIZE)
{<br>
+                ipoib_sendq_size
= IPOIB_MIN_QUEUE_SIZE;<br>
+                printk(KERN_WARNING
"%s: ipoib_recvq_size is too small, use min %d instead\n", ca->name,
IPOIB_MIN_QUEUE_SIZE);<br>
+        }         <br>
+<br>
+        priv->sendq_size = expsize(ipoib_sendq_size);<br>
 <br>
-        priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE
* sizeof (struct ipoib_tx_buf),<br>
+        priv->tx_ring = kzalloc(priv->sendq_size
* sizeof (struct ipoib_tx_buf),<br>
                  
              GFP_KERNEL);<br>
         if (!priv->tx_ring) {<br>
                 printk(KERN_WARNING
"%s: failed to allocate TX ring (%d entries)\n",<br>
-                
      ca->name, IPOIB_TX_RING_SIZE);<br>
+                
      ca->name, priv->sendq_size);<br>
                 goto
out_rx_ring_cleanup;<br>
         }<br>
+        printk(KERN_INFO "%s: TX_RING_SIZE
is set to %d entries\n",<br>
+               ca->name,
priv->sendq_size);<br>
 <br>
         /* priv->tx_head & tx_tail
are already 0 */<br>
 <br>
@@ -923,7 +968,7 @@<br>
         dev->hard_header_len    
     = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;<br>
         dev->addr_len      
           = INFINIBAND_ALEN;<br>
         dev->type      
           = ARPHRD_INFINIBAND;<br>
-        dev->tx_queue_len    
     = IPOIB_TX_RING_SIZE * 2;<br>
+        dev->tx_queue_len    
     = priv->sendq_size * 2;</font>
<br><font size=2 face="sans-serif">         dev->features
           = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;<br>
 <br>
         /* MTU will be reset when mcast join
happens */<br>
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c infiniband-queue/ulp/ipoib/ipoib_verbs.c<br>
--- infiniband/ulp/ipoib/ipoib_verbs.c        2006-03-26
11:57:15.000000000 -0800<br>
+++ infiniband-queue/ulp/ipoib/ipoib_verbs.c        2006-03-31
08:46:34.308727224 -0800<br>
@@ -159,8 +159,8 @@<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
         struct ib_qp_init_attr init_attr =
{<br>
                 .cap
= {<br>
-                
       .max_send_wr  = IPOIB_TX_RING_SIZE,<br>
-                
       .max_recv_wr  = IPOIB_RX_RING_SIZE,<br>
+                
       .max_send_wr  = priv->sendq_size,<br>
+                
       .max_recv_wr  = priv->recvq_size,<br>
                  
      .max_send_sge = 1,<br>
                  
      .max_recv_sge = 1<br>
                 },<br>
@@ -175,7 +175,7 @@<br>
         }<br>
 <br>
         priv->cq = ib_create_cq(priv->ca,
ipoib_ib_completion, NULL, dev,<br>
-                
               IPOIB_TX_RING_SIZE
+ IPOIB_RX_RING_SIZE + 1);<br>
+                
               priv->sendq_size
+ priv->recvq_size + 1);<br>
         if (IS_ERR(priv->cq)) {<br>
                 printk(KERN_WARNING
"%s: failed to create CQ\n", ca->name);<br>
                 goto
out_free_pd;</font>
<br>
<br>
<br>
<br>
<br><font size=2 face="sans-serif"><br>
Thanks<br>
Shirley Ma<br>
IBM Linux Technology Center<br>
15300 SW Koll Parkway<br>
Beaverton, OR 97006-6063<br>
Phone(Fax): (503) 578-7638</font></div>