<br><font size=2 face="sans-serif">Roland, </font>
<br>
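<br><font size=2 face="sans-serif">Here is the tx_ring removal patch for
you to review. It replaces the fixed-size tx_ring with a list of in-flight
skbs (send_list, protected by slist_lock) and keeps the per-send state in
skb-&gt;cb via struct ipoib_skb_prv.</font>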
<br>
<div>
<br><font size=2 face="sans-serif">diff -urpN infiniband-ah/ulp/ipoib/ipoib.h
infiniband-tx/ulp/ipoib/ipoib.h<br>
--- infiniband-ah/ulp/ipoib/ipoib.h        2006-05-23
10:09:05.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib.h        2006-05-24
11:45:52.000000000 -0700<br>
@@ -114,11 +114,19 @@ struct ipoib_rx_buf {<br>
         dma_addr_t        mapping;<br>
 };<br>
 <br>
-struct ipoib_tx_buf {<br>
-        struct sk_buff *skb;<br>
-        DECLARE_PCI_UNMAP_ADDR(mapping)<br>
+struct ipoib_skb_prv {<br>
+        dma_addr_t        
       addr;<br>
+        struct ipoib_ah      
         *ah;<br>
+        struct sk_buff      
         *skb;<br>
+        struct list_head      
 list;<br>
 };<br>
 <br>
+#define IPOIB_SKB_PRV_ADDR(skb)        (((struct
ipoib_skb_prv *)(skb)->cb)->addr)<br>
+#define IPOIB_SKB_PRV_AH(skb)        (((struct
ipoib_skb_prv *)(skb)->cb)->ah)<br>
+#define IPOIB_SKB_PRV_SKB(skb)        (((struct
ipoib_skb_prv *)(skb)->cb)->skb)<br>
+#define IPOIB_SKB_PRV_LIST(skb)        (((struct
ipoib_skb_prv *)(skb)->cb)->list)<br>
+<br>
+<br>
 /*<br>
  * Device private locking: tx_lock protects members used in TX fast<br>
  * path (and we use LLTX so upper layers don't do extra locking).<br>
@@ -166,12 +174,11 @@ struct ipoib_dev_priv {<br>
 <br>
         struct ipoib_rx_buf *rx_ring;<br>
 <br>
-        spinlock_t        
  tx_lock        ____cacheline_aligned_in_smp;</font>
<br><font size=2 face="sans-serif">-        struct
ipoib_tx_buf *tx_ring;<br>
-        unsigned        
    tx_head;<br>
-        unsigned        
    tx_tail;<br>
+        spinlock_t        
  tx_lock;<br>
         struct ib_sge      
 tx_sge;<br>
         struct ib_send_wr    tx_wr;<br>
+        spinlock_t        
    slist_lock;<br>
+        struct list_head     send_list;<br>
 <br>
         struct list_head dead_ahs;<br>
 <br>
diff -urpN infiniband-ah/ulp/ipoib/ipoib_ib.c infiniband-tx/ulp/ipoib/ipoib_ib.c<br>
--- infiniband-ah/ulp/ipoib/ipoib_ib.c        2006-05-23
10:14:08.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib_ib.c        2006-05-24
11:58:46.000000000 -0700<br>
@@ -243,45 +243,36 @@ static void ipoib_ib_handle_send_wc(stru<br>
                  
               
  struct ib_wc *wc)<br>
 {<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
-        unsigned int wr_id = wc->wr_id;<br>
-        struct ipoib_tx_buf *tx_req;<br>
-        unsigned long flags;<br>
-<br>
-        ipoib_dbg_data(priv, "called:
id %d, op %d, status: %d\n",<br>
-                
      wr_id, wc->opcode, wc->status);<br>
-<br>
-        if (wr_id >= ipoib_sendq_size)
{<br>
-                ipoib_warn(priv,
"completion event with wrid %d (> %d)\n",<br>
-                
          wr_id, ipoib_sendq_size);<br>
-                return;<br>
-        }<br>
-<br>
-        ipoib_dbg_data(priv, "send complete,
wrid %d\n", wr_id);</font>
<br><font size=2 face="sans-serif">+        struct
sk_buff *skb;<br>
+        unsigned long wr_id = wc->wr_id;<br>
 <br>
-        tx_req = &priv->tx_ring[wr_id];<br>
-<br>
-        dma_unmap_single(priv->ca->dma_device,<br>
-                
        pci_unmap_addr(tx_req, mapping),<br>
-                
        tx_req->skb->len,<br>
-                
        DMA_TO_DEVICE);<br>
+        skb = (struct sk_buff *)wr_id;<br>
+        kref_put(&IPOIB_SKB_PRV_AH(skb)->ref,
ipoib_free_ah);<br>
 <br>
+         if (IS_ERR(skb) || skb != IPOIB_SKB_PRV_SKB(skb))
{<br>
+                 ipoib_warn(priv,
"send completion event with corrupted wrid\n");<br>
+                  return;<br>
+          }<br>
+        list_del(&IPOIB_SKB_PRV_LIST(skb));<br>
+  <br>
+        ipoib_dbg_data(priv, "send complete,
wrid %lu\n", wr_id);<br>
+  <br>
+          dma_unmap_single(priv->ca->dma_device,<br>
+                  
       IPOIB_SKB_PRV_ADDR(skb),<br>
+                
        skb->len,<br>
+                  
        DMA_TO_DEVICE);<br>
+  <br>
         ++priv->stats.tx_packets;<br>
-        priv->stats.tx_bytes += tx_req->skb->len;<br>
-<br>
-        dev_kfree_skb_any(tx_req->skb);<br>
-<br>
-        spin_lock_irqsave(&priv->tx_lock,
flags);<br>
-        ++priv->tx_tail;<br>
-        if (netif_queue_stopped(dev) &&<br>
-            priv->tx_head - priv->tx_tail
<= ipoib_sendq_size >> 1)<br>
-                netif_wake_queue(dev);<br>
-        spin_unlock_irqrestore(&priv->tx_lock,
flags);<br>
-<br>
-        if (wc->status != IB_WC_SUCCESS
&&</font>
<br><font size=2 face="sans-serif">-        
   wc->status != IB_WC_WR_FLUSH_ERR)<br>
-                ipoib_warn(priv,
"failed send event "<br>
-                
          "(status=%d, wrid=%d vend_err
%x)\n",<br>
-                
          wc->status, wr_id, wc->vendor_err);<br>
+        priv->stats.tx_bytes += skb->len;<br>
+        dev_kfree_skb_any(skb);<br>
+ <br>
+        if (netif_queue_stopped(dev))<br>
+                  netif_wake_queue(dev);<br>
+          if (wc->status != IB_WC_SUCCESS
&&<br>
+              wc->status
!= IB_WC_WR_FLUSH_ERR)<br>
+                  ipoib_warn(priv,
"failed send event "<br>
+                  
         "(status=%d, wrid=%lu vend_err
%x)\n",<br>
+                  
         wc->status, wr_id, wc->vendor_err);<br>
 }<br>
 <br>
 void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)<br>
@@ -313,7 +304,7 @@ void ipoib_ib_recv_completion(struct ib_<br>
 }<br>
 <br>
 static inline int post_send(struct ipoib_dev_priv *priv,<br>
-                
           unsigned int wr_id,<br>
+                
           unsigned long wr_id,<br>
                  
          struct ib_ah *address, u32 qpn,<br>
                  
          dma_addr_t addr, int len)<br>
 {<br>
@@ -333,8 +324,9 @@ void ipoib_send(struct net_device *dev, <br>
                 struct
ipoib_ah *address, u32 qpn)<br>
 {<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
-        struct ipoib_tx_buf *tx_req;<br>
         dma_addr_t addr;</font>
<br><font size=2 face="sans-serif">+        unsigned
long wr_id;<br>
+        unsigned long flags;<br>
         int err;<br>
 <br>
         kref_get(&address->ref);<br>
@@ -350,38 +342,31 @@ void ipoib_send(struct net_device *dev, <br>
 <br>
         ipoib_dbg_data(priv, "sending
packet, length=%d address=%p qpn=0x%06x\n",<br>
                  
     skb->len, address, qpn);<br>
-<br>
-        /*<br>
-         * We put the skb into the tx_ring
_before_ we call post_send()<br>
-         * because it's entirely possible
that the completion handler will<br>
-         * run before we execute anything
after the post_send().  That<br>
-         * means we have to make sure everything
is properly recorded and<br>
-         * our state is consistent before
we call post_send().<br>
-         */<br>
-        tx_req = &priv->tx_ring[priv->tx_head
& (ipoib_sendq_size - 1)];<br>
-        tx_req->skb = skb;<br>
         addr = dma_map_single(priv->ca->dma_device,
skb->data, skb->len,<br>
                  
            DMA_TO_DEVICE);<br>
-        pci_unmap_addr_set(tx_req, mapping,
addr);<br>
-<br>
-        err = post_send(priv, priv->tx_head
& (ipoib_sendq_size - 1),        <br>
-                
       address->ah, qpn, addr, skb->len);
<br>
-        kref_put(&address->ref, ipoib_free_ah);<br>
-        if (unlikely(err)) {<br>
-                ipoib_warn(priv,
"post_send failed\n");</font>
<br><font size=2 face="sans-serif">-        
       ++priv->stats.tx_errors;<br>
-                dma_unmap_single(priv->ca->dma_device,
addr, skb->len,<br>
-                
               
DMA_TO_DEVICE);<br>
-                dev_kfree_skb_any(skb);<br>
-        } else {<br>
-                dev->trans_start
= jiffies;<br>
-<br>
-                ++priv->tx_head;<br>
-<br>
-                if
(priv->tx_head - priv->tx_tail == ipoib_sendq_size) {<br>
-                
       ipoib_dbg(priv, "TX ring full,
stopping kernel net queue\n");<br>
+         wr_id = (unsigned long)skb;<br>
+         err = post_send(priv, wr_id, address->ah,
qpn, addr, skb->len);<br>
+         if (!err) {<br>
+                 dev->trans_start
= jiffies;<br>
+                 IPOIB_SKB_PRV_ADDR(skb)
= addr;<br>
+                 IPOIB_SKB_PRV_AH(skb)
= address;<br>
+                 IPOIB_SKB_PRV_SKB(skb)
= skb;<br>
+                spin_lock_irqsave(&priv->slist_lock,
flags);<br>
+                 list_add_tail(&IPOIB_SKB_PRV_LIST(skb),
&priv->send_list);<br>
+                spin_unlock_irqrestore(&priv->slist_lock,
flags);<br>
+                return;<br>
+         } else {<br>
+                 if
(!netif_queue_stopped(dev)) {<br>
                  
      netif_stop_queue(dev);<br>
+                  
      ipoib_warn(priv, "stopping kernel net
queue\n");<br>
                 }<br>
+                  dma_unmap_single(priv->ca->dma_device,
addr, skb->len,<br>
+                  
               
DMA_TO_DEVICE);<br>
+                  ipoib_warn(priv,
"post_send failed\n");<br>
+                ++priv->stats.tx_dropped;</font>
<br><font size=2 face="sans-serif">+        
       ++priv->stats.tx_errors;<br>
+                dev_kfree_skb_any(skb);<br>
+                kref_put(&address->ref,
ipoib_free_ah);<br>
         }<br>
 }<br>
 <br>
@@ -480,7 +465,9 @@ int ipoib_ib_dev_stop(struct net_device <br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
         struct ib_qp_attr qp_attr;<br>
         unsigned long begin;<br>
-        struct ipoib_tx_buf *tx_req;<br>
+        unsigned long flags;<br>
+        struct ipoib_skb_prv *cb, *tcb;<br>
+        struct sk_buff *skb;<br>
         int i;<br>
 <br>
         clear_bit(IPOIB_FLAG_INITIALIZED,
&priv->flags);<br>
@@ -496,25 +483,25 @@ int ipoib_ib_dev_stop(struct net_device <br>
         /* Wait for all sends and receives
to complete */<br>
         begin = jiffies;<br>
 <br>
-        while (priv->tx_head != priv->tx_tail
|| recvs_pending(dev)) {<br>
+        while (!list_empty(&priv->send_list)
|| recvs_pending(dev)) {<br>
                 if
(time_after(jiffies, begin + 5 * HZ)) {<br>
-                
       ipoib_warn(priv, "timing out; %d
sends %d receives not completed\n",<br>
-                
               
  priv->tx_head - priv->tx_tail, recvs_pending(dev));<br>
+                
       ipoib_warn(priv, "timing out; %d
receives not completed\n",<br>
+                
               
 recvs_pending(dev));<br>
 <br>
                  
      /*<br>
                  
       * assume the HW is wedged and just free
up</font>
<br><font size=2 face="sans-serif">         
               
* all our pending work requests.<br>
                  
       */<br>
-                
       while ((int) priv->tx_tail - (int)
priv->tx_head < 0) {<br>
-                
               tx_req
= &priv->tx_ring[priv->tx_tail &<br>
-                
               
               
       (ipoib_sendq_size - 1)];<br>
-                
               dma_unmap_single(priv->ca->dma_device,<br>
-                
               
               
pci_unmap_addr(tx_req, mapping),<br>
-                
               
               
tx_req->skb->len,<br>
-                
               
               
DMA_TO_DEVICE);<br>
-                
               dev_kfree_skb_any(tx_req->skb);<br>
-                
               ++priv->tx_tail;<br>
-                
       }<br>
+                
       spin_lock_irqsave(&priv->slist_lock,
flags);<br>
+                  
      list_for_each_entry_safe(cb, tcb, &priv->send_list,<br>
+                
               
           list) {<br>
+                  
              skb =
cb->skb;<br>
+                  
               dma_unmap_single(priv->ca->dma_device,<br>
+                  
               
               IPOIB_SKB_PRV_ADDR(skb),<br>
+                  
               
               skb->len,
DMA_TO_DEVICE);<br>
+                  
              dev_kfree_skb_any(skb);<br>
+                  
       }<br>
+                
       spin_unlock_irqrestore(&priv->slist_lock,
flags);<br>
 <br>
                  
      for (i = 0; i < ipoib_recvq_size; ++i)<br>
                  
              if (priv->rx_ring[i].skb)
{<br>
diff -urpN infiniband-ah/ulp/ipoib/ipoib_main.c infiniband-tx/ulp/ipoib/ipoib_main.c<br>
--- infiniband-ah/ulp/ipoib/ipoib_main.c        2006-05-23
09:31:49.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib_main.c        2006-05-24
11:47:06.000000000 -0700</font>
<br><font size=2 face="sans-serif">@@ -708,9 +708,7 @@ static void ipoib_timeout(struct
net_dev<br>
 <br>
         ipoib_warn(priv, "transmit timeout:
latency %d msecs\n",<br>
                  
 jiffies_to_msecs(jiffies - dev->trans_start));<br>
-        ipoib_warn(priv, "queue stopped
%d, tx_head %u, tx_tail %u\n",<br>
-                
  netif_queue_stopped(dev),<br>
-                
  priv->tx_head, priv->tx_tail);<br>
+        ipoib_warn(priv, "queue stopped
%d\n", netif_queue_stopped(dev));<br>
         /* XXX reset QP, etc. */<br>
 }<br>
 <br>
@@ -846,7 +844,7 @@ int ipoib_dev_init(struct net_device *de<br>
 {<br>
         struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
 <br>
-        /* Allocate RX/TX "rings"
to hold queued skbs */<br>
+        /* Allocate RX "rings" to
hold queued skbs */<br>
         priv->rx_ring =      
 kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,<br>
                  
              GFP_KERNEL);<br>
         if (!priv->rx_ring) {<br>
@@ -855,24 +853,11 @@ int ipoib_dev_init(struct net_device *de<br>
                 goto
out;<br>
         }<br>
 <br>
-        priv->tx_ring = kzalloc(ipoib_sendq_size
* sizeof *priv->tx_ring,<br>
-                
               GFP_KERNEL);<br>
-        if (!priv->tx_ring) {<br>
-                printk(KERN_WARNING
"%s: failed to allocate TX ring (%d entries)\n",<br>
-                
      ca->name, ipoib_sendq_size);</font>
<br><font size=2 face="sans-serif">-        
       goto out_rx_ring_cleanup;<br>
-        }<br>
-<br>
-        /* priv->tx_head & tx_tail
are already 0 */<br>
-<br>
         if (ipoib_ib_dev_init(dev, ca, port))<br>
-                goto
out_tx_ring_cleanup;<br>
+                goto
out_rx_ring_cleanup;<br>
 <br>
         return 0;<br>
 <br>
-out_tx_ring_cleanup:<br>
-        kfree(priv->tx_ring);<br>
-<br>
 out_rx_ring_cleanup:<br>
         kfree(priv->rx_ring);<br>
 <br>
@@ -896,10 +881,8 @@ void ipoib_dev_cleanup(struct net_device<br>
         ipoib_ib_dev_cleanup(dev);<br>
 <br>
         kfree(priv->rx_ring);<br>
-        kfree(priv->tx_ring);<br>
 <br>
         priv->rx_ring = NULL;<br>
-        priv->tx_ring = NULL;<br>
 }<br>
 <br>
 static void ipoib_setup(struct net_device *dev)<br>
@@ -944,6 +927,7 @@ static void ipoib_setup(struct net_devic<br>
 <br>
         spin_lock_init(&priv->lock);<br>
         spin_lock_init(&priv->tx_lock);<br>
+        spin_lock_init(&priv->slist_lock);<br>
 <br>
         mutex_init(&priv->mcast_mutex);<br>
         mutex_init(&priv->vlan_mutex);<br>
@@ -952,6 +936,7 @@ static void ipoib_setup(struct net_devic<br>
         INIT_LIST_HEAD(&priv->child_intfs);<br>
         INIT_LIST_HEAD(&priv->dead_ahs);<br>
         INIT_LIST_HEAD(&priv->multicast_list);<br>
+        INIT_LIST_HEAD(&priv->send_list);
       <br>
 <br>
         INIT_WORK(&priv->pkey_task,
   ipoib_pkey_poll,          priv->dev);</font>
<br><font size=2 face="sans-serif">         INIT_WORK(&priv->mcast_task,
  ipoib_mcast_join_task,    priv->dev);</font>
<br>
<br><font size=2 face="sans-serif"><br>
Thanks,<br>
Shirley Ma<br>
IBM Linux Technology Center<br>
15300 SW Koll Parkway<br>
Beaverton, OR 97006-6063<br>
Phone(Fax): (503) 578-7638<br>
<br>
</font></div>