<br><font size=2 face="sans-serif">Hello Roland,</font>
<br>
<br><font size=2 face="sans-serif">In IPoIB the default TX_RING_SIZE is 64 and
the default RX_RING_SIZE is 128, and these values are not optimal for all IB
device drivers. We saw numerous retransmissions in our MPI stress test cluster
environment. I've made these two values module parameters that can be set at
load time. With much bigger TX/RX_RING_SIZE values we even got 6 times better
performance test results, with no more retransmissions. I think it's more
reasonable to allow these parameters to be set during module load.</font>
<br>
<br><font size=2 face="sans-serif">Here is the patch for review.</font>
<br>
<br><font size=2 face="sans-serif">Signed-off-by: Shirley Ma &lt;xma@us.ibm.com&gt;</font>
<div>
<br><font size=2 face="sans-serif"><br>
diff -urN infiniband/ulp/ipoib/ipoib_ib.c infiniband-ring/ulp/ipoib/ipoib_ib.c<br>
--- infiniband/ulp/ipoib/ipoib_ib.c        2006-03-03
13:57:18.000000000 -0800<br>
+++ infiniband-ring/ulp/ipoib/ipoib_ib.c        2006-03-06
20:11:06.155526568 -0800<br>
@@ -54,6 +54,9 @@<br>
 <br>
 static DEFINE_MUTEX(pkey_mutex);<br>
 <br>
+extern int ipoib_rx_ring_size;<br>
+extern int ipoib_tx_ring_size;<br>
+<br>
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,<br>
                  
               struct
ib_pd *pd, struct ib_ah_attr *attr)<br>
 {<br>
@@ -161,7 +164,7 @@<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
         int i;<br>
 <br>
-        for (i = 0; i < IPOIB_RX_RING_SIZE;
++i) {<br>
+        for (i = 0; i < ipoib_rx_ring_size;
++i) {<br>
                 if
(ipoib_alloc_rx_skb(dev, i)) {<br>
                  
      ipoib_warn(priv, "failed to allocate
receive buffer %d\n", i);<br>
                  
      return -ENOMEM;<br>
@@ -187,7 +190,7 @@<br>
         if (wr_id & IPOIB_OP_RECV) {<br>
                 wr_id
&= ~IPOIB_OP_RECV;<br>
 <br>
-                if
(wr_id < IPOIB_RX_RING_SIZE) {<br>
+                if
(wr_id < ipoib_rx_ring_size) {<br>
                  
      struct sk_buff *skb  = priv->rx_ring[wr_id].skb;<br>
                  
      dma_addr_t      addr = priv->rx_ring[wr_id].mapping;</font>
<br><font size=2 face="sans-serif"> <br>
@@ -252,9 +255,9 @@<br>
                 struct
ipoib_tx_buf *tx_req;<br>
                 unsigned
long flags;<br>
 <br>
-                if
(wr_id >= IPOIB_TX_RING_SIZE) {<br>
+                if
(wr_id >= ipoib_tx_ring_size) {<br>
                  
      ipoib_warn(priv, "completion event with
wrid %d (> %d)\n",<br>
-                
               
  wr_id, IPOIB_TX_RING_SIZE);<br>
+                
               
  wr_id, ipoib_tx_ring_size);<br>
                  
      return;<br>
                 }<br>
 <br>
@@ -275,7 +278,7 @@<br>
                 spin_lock_irqsave(&priv->tx_lock,
flags);<br>
                 ++priv->tx_tail;<br>
                 if
(netif_queue_stopped(dev) &&<br>
-                
   priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE
/ 2)<br>
+                
   priv->tx_head - priv->tx_tail <= ipoib_tx_ring_size
/ 2)<br>
                  
      netif_wake_queue(dev);<br>
                 spin_unlock_irqrestore(&priv->tx_lock,
flags);<br>
 <br>
@@ -344,13 +347,13 @@<br>
          * means we have to make sure
everything is properly recorded and<br>
          * our state is consistent before
we call post_send().<br>
          */<br>
-        tx_req = &priv->tx_ring[priv->tx_head
& (IPOIB_TX_RING_SIZE - 1)];<br>
+        tx_req = &priv->tx_ring[priv->tx_head
& (ipoib_tx_ring_size - 1)];<br>
         tx_req->skb = skb;<br>
         addr = dma_map_single(priv->ca->dma_device,
skb->data, skb->len,<br>
                  
            DMA_TO_DEVICE);</font>
<br><font size=2 face="sans-serif">         pci_unmap_addr_set(tx_req,
mapping, addr);<br>
 <br>
-        if (unlikely(post_send(priv, priv->tx_head
& (IPOIB_TX_RING_SIZE - 1),<br>
+        if (unlikely(post_send(priv, priv->tx_head
& (ipoib_tx_ring_size - 1),<br>
                  
             address->ah,
qpn, addr, skb->len))) {<br>
                 ipoib_warn(priv,
"post_send failed\n");<br>
                 ++priv->stats.tx_errors;<br>
@@ -363,7 +366,7 @@<br>
                 address->last_send
= priv->tx_head;<br>
                 ++priv->tx_head;<br>
 <br>
-                if
(priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {<br>
+                if
(priv->tx_head - priv->tx_tail == ipoib_tx_ring_size) {<br>
                  
      ipoib_dbg(priv, "TX ring full, stopping
kernel net queue\n");<br>
                  
      netif_stop_queue(dev);<br>
                 }<br>
@@ -467,7 +470,7 @@<br>
         int pending = 0;<br>
         int i;<br>
 <br>
-        for (i = 0; i < IPOIB_RX_RING_SIZE;
++i)<br>
+        for (i = 0; i < ipoib_rx_ring_size;
++i)<br>
                 if
(priv->rx_ring[i].skb)<br>
                  
      ++pending;<br>
 <br>
@@ -504,7 +507,7 @@<br>
                  
       */<br>
                  
      while ((int) priv->tx_tail - (int) priv->tx_head
< 0) {<br>
                  
              tx_req
= &priv->tx_ring[priv->tx_tail &<br>
-                
               
               
       (IPOIB_TX_RING_SIZE - 1)];<br>
+                
               
               
       (ipoib_tx_ring_size - 1)];<br>
                  
              dma_unmap_single(priv->ca->dma_device,</font>
<br><font size=2 face="sans-serif">         
               
               
        pci_unmap_addr(tx_req, mapping),<br>
                  
               
               tx_req->skb->len,<br>
@@ -513,7 +516,7 @@<br>
                  
              ++priv->tx_tail;<br>
                  
      }<br>
 <br>
-                
       for (i = 0; i < IPOIB_RX_RING_SIZE;
++i)<br>
+                
       for (i = 0; i < ipoib_rx_ring_size;
++i)<br>
                  
              if (priv->rx_ring[i].skb)
{<br>
                  
               
      dma_unmap_single(priv->ca->dma_device,<br>
                  
               
               
       pci_unmap_addr(&priv->rx_ring[i],<br>
diff -urN infiniband/ulp/ipoib/ipoib_main.c infiniband-ring/ulp/ipoib/ipoib_main.c<br>
--- infiniband/ulp/ipoib/ipoib_main.c        2006-02-01
13:45:43.000000000 -0800<br>
+++ infiniband-ring/ulp/ipoib/ipoib_main.c        2006-03-06
20:12:56.579739536 -0800<br>
@@ -53,6 +53,14 @@<br>
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");<br>
 MODULE_LICENSE("Dual BSD/GPL");<br>
 <br>
+int ipoib_rx_ring_size = IPOIB_RX_RING_SIZE;<br>
+int ipoib_tx_ring_size = IPOIB_TX_RING_SIZE;<br>
+<br>
+module_param_named(rx_ring_size, ipoib_rx_ring_size, int, 0);<br>
+MODULE_PARM_DESC(rx_ring_size, "change rx_ring_size");<br>
+module_param_named(tx_ring_size, ipoib_tx_ring_size, int, 0);<br>
+MODULE_PARM_DESC(tx_ring_size, "change tx_ring_size");<br>
+<br>
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG</font>
<br><font size=2 face="sans-serif"> int ipoib_debug_level;<br>
 <br>
@@ -794,21 +802,25 @@<br>
 <br>
         /* Allocate RX/TX "rings"
to hold queued skbs */<br>
 <br>
-        priv->rx_ring =      
 kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct ipoib_rx_buf),<br>
+        priv->rx_ring =      
 kzalloc(ipoib_rx_ring_size * sizeof (struct ipoib_rx_buf),<br>
                  
              GFP_KERNEL);<br>
         if (!priv->rx_ring) {<br>
                 printk(KERN_WARNING
"%s: failed to allocate RX ring (%d entries)\n",<br>
-                
      ca->name, IPOIB_RX_RING_SIZE);<br>
+                
      ca->name, ipoib_rx_ring_size);<br>
                 goto
out;<br>
         }<br>
+        printk(KERN_INFO "%s: RX_RING_SIZE
is set to %d entries\n",<br>
+               ca->name,
ipoib_rx_ring_size);<br>
 <br>
-        priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE
* sizeof (struct ipoib_tx_buf),<br>
+        priv->tx_ring = kzalloc(ipoib_tx_ring_size
* sizeof (struct ipoib_tx_buf),<br>
                  
              GFP_KERNEL);<br>
         if (!priv->tx_ring) {<br>
                 printk(KERN_WARNING
"%s: failed to allocate TX ring (%d entries)\n",<br>
-                
      ca->name, IPOIB_TX_RING_SIZE);<br>
+                
      ca->name, ipoib_tx_ring_size);<br>
                 goto
out_rx_ring_cleanup;<br>
         }<br>
+        printk(KERN_INFO "%s: TX_RING_SIZE
is set to %d entries\n",<br>
+               ca->name,
ipoib_tx_ring_size);</font>
<br><font size=2 face="sans-serif"> <br>
         /* priv->tx_head & tx_tail
are already 0 */<br>
 <br>
@@ -874,7 +886,7 @@<br>
         dev->hard_header_len    
     = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;<br>
         dev->addr_len      
           = INFINIBAND_ALEN;<br>
         dev->type      
           = ARPHRD_INFINIBAND;<br>
-        dev->tx_queue_len    
     = IPOIB_TX_RING_SIZE * 2;<br>
+        dev->tx_queue_len    
     = ipoib_tx_ring_size * 2;<br>
         dev->features      
     = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;<br>
 <br>
         /* MTU will be reset when mcast join
happens */<br>
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c infiniband-ring/ulp/ipoib/ipoib_verbs.c<br>
--- infiniband/ulp/ipoib/ipoib_verbs.c        2006-01-16
19:14:55.000000000 -0800<br>
+++ infiniband-ring/ulp/ipoib/ipoib_verbs.c        2006-03-06
19:58:49.476518800 -0800<br>
@@ -37,6 +37,9 @@<br>
 <br>
 #include "ipoib.h"<br>
 <br>
+extern int ipoib_rx_ring_size;<br>
+extern int ipoib_tx_ring_size;<br>
+<br>
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid
*mgid)<br>
 {<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
@@ -159,8 +162,8 @@<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
         struct ib_qp_init_attr init_attr =
{<br>
                 .cap
= {<br>
-                
       .max_send_wr  = IPOIB_TX_RING_SIZE,</font>
<br><font size=2 face="sans-serif">-        
               .max_recv_wr
 = IPOIB_RX_RING_SIZE,<br>
+                
       .max_send_wr  = ipoib_tx_ring_size,<br>
+                
       .max_recv_wr  = ipoib_rx_ring_size,<br>
                  
      .max_send_sge = 1,<br>
                  
      .max_recv_sge = 1<br>
                 },<br>
@@ -175,7 +178,7 @@<br>
         }<br>
 <br>
         priv->cq = ib_create_cq(priv->ca,
ipoib_ib_completion, NULL, dev,<br>
-                
               IPOIB_TX_RING_SIZE
+ IPOIB_RX_RING_SIZE + 1);<br>
+                
               ipoib_tx_ring_size
+ ipoib_rx_ring_size + 1);<br>
         if (IS_ERR(priv->cq)) {<br>
                 printk(KERN_WARNING
"%s: failed to create CQ\n", ca->name);<br>
                 goto
out_free_pd;</font>
<br>
<br>
<br><font size=2 face="sans-serif"><br>
Thanks<br>
Shirley Ma<br>
IBM Linux Technology Center<br>
15300 SW Koll Parkway<br>
Beaverton, OR 97006-6063<br>
Phone(Fax): (503) 578-7638</font></div>