[openib-general] [PATCH] TX/RX_RING_SIZE as loadable parameters

Shirley Ma xma at us.ibm.com
Mon Mar 6 21:04:57 PST 2006


Hello Roland,

The default TX_RING_SIZE is 64 and the default RX_RING_SIZE is 128 in
IPoIB; these values are not optimal for all IB device drivers. We saw
numerous retransmissions in our MPI stress test cluster environment. I've
turned these two values into module parameters that can be set at load
time. With much bigger TX/RX_RING_SIZE values we got 6 times better
performance test results and no more retransmissions. I think it's more
reasonable to allow these parameters to be set during module load.

Here is the patch for review.

Signed-off-by: Shirley Ma <xma at us.ibm.com>

diff -urN infiniband/ulp/ipoib/ipoib_ib.c 
infiniband-ring/ulp/ipoib/ipoib_ib.c
--- infiniband/ulp/ipoib/ipoib_ib.c     2006-03-03 13:57:18.000000000 
-0800
+++ infiniband-ring/ulp/ipoib/ipoib_ib.c        2006-03-06 
20:11:06.155526568 -0800
@@ -54,6 +54,9 @@
 
 static DEFINE_MUTEX(pkey_mutex);
 
+extern int ipoib_rx_ring_size;
+extern int ipoib_tx_ring_size;
+
 struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
                                 struct ib_pd *pd, struct ib_ah_attr 
*attr)
 {
@@ -161,7 +164,7 @@
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        int i;
 
-       for (i = 0; i < IPOIB_RX_RING_SIZE; ++i) {
+       for (i = 0; i < ipoib_rx_ring_size; ++i) {
                if (ipoib_alloc_rx_skb(dev, i)) {
                        ipoib_warn(priv, "failed to allocate receive 
buffer %d\n", i);
                        return -ENOMEM;
@@ -187,7 +190,7 @@
        if (wr_id & IPOIB_OP_RECV) {
                wr_id &= ~IPOIB_OP_RECV;
 
-               if (wr_id < IPOIB_RX_RING_SIZE) {
+               if (wr_id < ipoib_rx_ring_size) {
                        struct sk_buff *skb  = priv->rx_ring[wr_id].skb;
                        dma_addr_t      addr = 
priv->rx_ring[wr_id].mapping;
 
@@ -252,9 +255,9 @@
                struct ipoib_tx_buf *tx_req;
                unsigned long flags;
 
-               if (wr_id >= IPOIB_TX_RING_SIZE) {
+               if (wr_id >= ipoib_tx_ring_size) {
                        ipoib_warn(priv, "completion event with wrid %d (> 
%d)\n",
-                                  wr_id, IPOIB_TX_RING_SIZE);
+                                  wr_id, ipoib_tx_ring_size);
                        return;
                }
 
@@ -275,7 +278,7 @@
                spin_lock_irqsave(&priv->tx_lock, flags);
                ++priv->tx_tail;
                if (netif_queue_stopped(dev) &&
-                   priv->tx_head - priv->tx_tail <= IPOIB_TX_RING_SIZE / 
2)
+                   priv->tx_head - priv->tx_tail <= ipoib_tx_ring_size / 
2)
                        netif_wake_queue(dev);
                spin_unlock_irqrestore(&priv->tx_lock, flags);
 
@@ -344,13 +347,13 @@
         * means we have to make sure everything is properly recorded and
         * our state is consistent before we call post_send().
         */
-       tx_req = &priv->tx_ring[priv->tx_head & (IPOIB_TX_RING_SIZE - 1)];
+       tx_req = &priv->tx_ring[priv->tx_head & (ipoib_tx_ring_size - 1)];
        tx_req->skb = skb;
        addr = dma_map_single(priv->ca->dma_device, skb->data, skb->len,
                              DMA_TO_DEVICE);
        pci_unmap_addr_set(tx_req, mapping, addr);
 
-       if (unlikely(post_send(priv, priv->tx_head & (IPOIB_TX_RING_SIZE - 
1),
+       if (unlikely(post_send(priv, priv->tx_head & (ipoib_tx_ring_size - 
1),
                               address->ah, qpn, addr, skb->len))) {
                ipoib_warn(priv, "post_send failed\n");
                ++priv->stats.tx_errors;
@@ -363,7 +366,7 @@
                address->last_send = priv->tx_head;
                ++priv->tx_head;
 
-               if (priv->tx_head - priv->tx_tail == IPOIB_TX_RING_SIZE) {
+               if (priv->tx_head - priv->tx_tail == ipoib_tx_ring_size) {
                        ipoib_dbg(priv, "TX ring full, stopping kernel net 
queue\n");
                        netif_stop_queue(dev);
                }
@@ -467,7 +470,7 @@
        int pending = 0;
        int i;
 
-       for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+       for (i = 0; i < ipoib_rx_ring_size; ++i)
                if (priv->rx_ring[i].skb)
                        ++pending;
 
@@ -504,7 +507,7 @@
                         */
                        while ((int) priv->tx_tail - (int) priv->tx_head < 
0) {
                                tx_req = &priv->tx_ring[priv->tx_tail &
- (IPOIB_TX_RING_SIZE - 1)];
+ (ipoib_tx_ring_size - 1)];
                                dma_unmap_single(priv->ca->dma_device,
                                                 pci_unmap_addr(tx_req, 
mapping),
                                                 tx_req->skb->len,
@@ -513,7 +516,7 @@
                                ++priv->tx_tail;
                        }
 
-                       for (i = 0; i < IPOIB_RX_RING_SIZE; ++i)
+                       for (i = 0; i < ipoib_rx_ring_size; ++i)
                                if (priv->rx_ring[i].skb) {
 dma_unmap_single(priv->ca->dma_device,
 pci_unmap_addr(&priv->rx_ring[i],
diff -urN infiniband/ulp/ipoib/ipoib_main.c 
infiniband-ring/ulp/ipoib/ipoib_main.c
--- infiniband/ulp/ipoib/ipoib_main.c   2006-02-01 13:45:43.000000000 
-0800
+++ infiniband-ring/ulp/ipoib/ipoib_main.c      2006-03-06 
20:12:56.579739536 -0800
@@ -53,6 +53,14 @@
 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
 MODULE_LICENSE("Dual BSD/GPL");
 
+int ipoib_rx_ring_size = IPOIB_RX_RING_SIZE;
+int ipoib_tx_ring_size = IPOIB_TX_RING_SIZE;
+
+module_param_named(rx_ring_size, ipoib_rx_ring_size, int, 0);
+MODULE_PARM_DESC(rx_ring_size, "change rx_ring_size");
+module_param_named(tx_ring_size, ipoib_tx_ring_size, int, 0);
+MODULE_PARM_DESC(tx_ring_size, "change tx_ring_size");
+
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 int ipoib_debug_level;
 
@@ -794,21 +802,25 @@
 
        /* Allocate RX/TX "rings" to hold queued skbs */
 
-       priv->rx_ring = kzalloc(IPOIB_RX_RING_SIZE * sizeof (struct 
ipoib_rx_buf),
+       priv->rx_ring = kzalloc(ipoib_rx_ring_size * sizeof (struct 
ipoib_rx_buf),
                                GFP_KERNEL);
        if (!priv->rx_ring) {
                printk(KERN_WARNING "%s: failed to allocate RX ring (%d 
entries)\n",
-                      ca->name, IPOIB_RX_RING_SIZE);
+                      ca->name, ipoib_rx_ring_size);
                goto out;
        }
+       printk(KERN_INFO "%s: RX_RING_SIZE is set to %d entries\n",
+              ca->name, ipoib_rx_ring_size);
 
-       priv->tx_ring = kzalloc(IPOIB_TX_RING_SIZE * sizeof (struct 
ipoib_tx_buf),
+       priv->tx_ring = kzalloc(ipoib_tx_ring_size * sizeof (struct 
ipoib_tx_buf),
                                GFP_KERNEL);
        if (!priv->tx_ring) {
                printk(KERN_WARNING "%s: failed to allocate TX ring (%d 
entries)\n",
-                      ca->name, IPOIB_TX_RING_SIZE);
+                      ca->name, ipoib_tx_ring_size);
                goto out_rx_ring_cleanup;
        }
+       printk(KERN_INFO "%s: TX_RING_SIZE is set to %d entries\n",
+              ca->name, ipoib_tx_ring_size);
 
        /* priv->tx_head & tx_tail are already 0 */
 
@@ -874,7 +886,7 @@
        dev->hard_header_len     = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
        dev->addr_len            = INFINIBAND_ALEN;
        dev->type                = ARPHRD_INFINIBAND;
-       dev->tx_queue_len        = IPOIB_TX_RING_SIZE * 2;
+       dev->tx_queue_len        = ipoib_tx_ring_size * 2;
        dev->features            = NETIF_F_VLAN_CHALLENGED | NETIF_F_LLTX;
 
        /* MTU will be reset when mcast join happens */
diff -urN infiniband/ulp/ipoib/ipoib_verbs.c 
infiniband-ring/ulp/ipoib/ipoib_verbs.c
--- infiniband/ulp/ipoib/ipoib_verbs.c  2006-01-16 19:14:55.000000000 
-0800
+++ infiniband-ring/ulp/ipoib/ipoib_verbs.c     2006-03-06 
19:58:49.476518800 -0800
@@ -37,6 +37,9 @@
 
 #include "ipoib.h"
 
+extern int ipoib_rx_ring_size;
+extern int ipoib_tx_ring_size;
+
 int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid 
*mgid)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -159,8 +162,8 @@
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ib_qp_init_attr init_attr = {
                .cap = {
-                       .max_send_wr  = IPOIB_TX_RING_SIZE,
-                       .max_recv_wr  = IPOIB_RX_RING_SIZE,
+                       .max_send_wr  = ipoib_tx_ring_size,
+                       .max_recv_wr  = ipoib_rx_ring_size,
                        .max_send_sge = 1,
                        .max_recv_sge = 1
                },
@@ -175,7 +178,7 @@
        }
 
        priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev,
-                               IPOIB_TX_RING_SIZE + IPOIB_RX_RING_SIZE + 
1);
+                               ipoib_tx_ring_size + ipoib_rx_ring_size + 
1);
        if (IS_ERR(priv->cq)) {
                printk(KERN_WARNING "%s: failed to create CQ\n", 
ca->name);
                goto out_free_pd;



Thanks
Shirley Ma
IBM Linux Technology Center
15300 SW Koll Parkway
Beaverton, OR 97006-6063
Phone(Fax): (503) 578-7638
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060306/23b70304/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: infiniband-ring.patch
Type: application/octet-stream
Size: 6750 bytes
Desc: not available
URL: <http://lists.openfabrics.org/pipermail/general/attachments/20060306/23b70304/attachment.obj>


More information about the general mailing list