<br><font size=2 face="sans-serif">Roland, </font>
<br>
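<br><font size=2 face="sans-serif">Here is the tx_ring removal patch for
you to review. It drops the preallocated tx_ring (along with tx_head and
tx_tail) and instead tracks in-flight sends on a new send_list protected by
slist_lock. The per-send state (DMA address, address handle, skb pointer) is
kept in skb-&gt;cb, and the skb pointer itself is used as the wr_id.</font>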
<br>
<div>
<br><font size=2 face="sans-serif">diff -urpN infiniband-ah/ulp/ipoib/ipoib.h
infiniband-tx/ulp/ipoib/ipoib.h<br>
--- infiniband-ah/ulp/ipoib/ipoib.h 2006-05-23
10:09:05.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib.h 2006-05-24
11:45:52.000000000 -0700<br>
@@ -114,11 +114,19 @@ struct ipoib_rx_buf {<br>
dma_addr_t mapping;<br>
};<br>
<br>
-struct ipoib_tx_buf {<br>
- struct sk_buff *skb;<br>
- DECLARE_PCI_UNMAP_ADDR(mapping)<br>
+struct ipoib_skb_prv {<br>
+ dma_addr_t
addr;<br>
+ struct ipoib_ah
*ah;<br>
+ struct sk_buff
*skb;<br>
+ struct list_head
list;<br>
};<br>
<br>
+#define IPOIB_SKB_PRV_ADDR(skb) (((struct
ipoib_skb_prv *)(skb)->cb)->addr)<br>
+#define IPOIB_SKB_PRV_AH(skb) (((struct
ipoib_skb_prv *)(skb)->cb)->ah)<br>
+#define IPOIB_SKB_PRV_SKB(skb) (((struct
ipoib_skb_prv *)(skb)->cb)->skb)<br>
+#define IPOIB_SKB_PRV_LIST(skb) (((struct
ipoib_skb_prv *)(skb)->cb)->list)<br>
+<br>
+<br>
/*<br>
* Device private locking: tx_lock protects members used in TX fast<br>
* path (and we use LLTX so upper layers don't do extra locking).<br>
@@ -166,12 +174,11 @@ struct ipoib_dev_priv {<br>
<br>
struct ipoib_rx_buf *rx_ring;<br>
<br>
- spinlock_t
tx_lock ____cacheline_aligned_in_smp;</font>
<br><font size=2 face="sans-serif">- struct
ipoib_tx_buf *tx_ring;<br>
- unsigned
tx_head;<br>
- unsigned
tx_tail;<br>
+ spinlock_t
tx_lock;<br>
struct ib_sge
tx_sge;<br>
struct ib_send_wr tx_wr;<br>
+ spinlock_t
slist_lock;<br>
+ struct list_head send_list;<br>
<br>
struct list_head dead_ahs;<br>
<br>
diff -urpN infiniband-ah/ulp/ipoib/ipoib_ib.c infiniband-tx/ulp/ipoib/ipoib_ib.c<br>
--- infiniband-ah/ulp/ipoib/ipoib_ib.c 2006-05-23
10:14:08.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib_ib.c 2006-05-24
11:58:46.000000000 -0700<br>
@@ -243,45 +243,36 @@ static void ipoib_ib_handle_send_wc(stru<br>
struct ib_wc *wc)<br>
{<br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
- unsigned int wr_id = wc->wr_id;<br>
- struct ipoib_tx_buf *tx_req;<br>
- unsigned long flags;<br>
-<br>
- ipoib_dbg_data(priv, "called:
id %d, op %d, status: %d\n",<br>
-
wr_id, wc->opcode, wc->status);<br>
-<br>
- if (wr_id >= ipoib_sendq_size)
{<br>
- ipoib_warn(priv,
"completion event with wrid %d (> %d)\n",<br>
-
wr_id, ipoib_sendq_size);<br>
- return;<br>
- }<br>
-<br>
- ipoib_dbg_data(priv, "send complete,
wrid %d\n", wr_id);</font>
<br><font size=2 face="sans-serif">+ struct
sk_buff *skb;<br>
+ unsigned long wr_id = wc->wr_id;<br>
<br>
- tx_req = &priv->tx_ring[wr_id];<br>
-<br>
- dma_unmap_single(priv->ca->dma_device,<br>
-
pci_unmap_addr(tx_req, mapping),<br>
-
tx_req->skb->len,<br>
-
DMA_TO_DEVICE);<br>
+ skb = (struct sk_buff *)wr_id;<br>
+ kref_put(&IPOIB_SKB_PRV_AH(skb)->ref,
ipoib_free_ah);<br>
<br>
+ if (IS_ERR(skb) || skb != IPOIB_SKB_PRV_SKB(skb))
{<br>
+ ipoib_warn(priv,
"send completion event with corrupted wrid\n");<br>
+ return;<br>
+ }<br>
+ list_del(&IPOIB_SKB_PRV_LIST(skb));<br>
+ <br>
+ ipoib_dbg_data(priv, "send complete,
wrid %lu\n", wr_id);<br>
+ <br>
+ dma_unmap_single(priv->ca->dma_device,<br>
+
IPOIB_SKB_PRV_ADDR(skb),<br>
+
skb->len,<br>
+
DMA_TO_DEVICE);<br>
+ <br>
++priv->stats.tx_packets;<br>
- priv->stats.tx_bytes += tx_req->skb->len;<br>
-<br>
- dev_kfree_skb_any(tx_req->skb);<br>
-<br>
- spin_lock_irqsave(&priv->tx_lock,
flags);<br>
- ++priv->tx_tail;<br>
- if (netif_queue_stopped(dev) &&<br>
- priv->tx_head - priv->tx_tail
<= ipoib_sendq_size >> 1)<br>
- netif_wake_queue(dev);<br>
- spin_unlock_irqrestore(&priv->tx_lock,
flags);<br>
-<br>
- if (wc->status != IB_WC_SUCCESS
&&</font>
<br><font size=2 face="sans-serif">-
wc->status != IB_WC_WR_FLUSH_ERR)<br>
- ipoib_warn(priv,
"failed send event "<br>
-
"(status=%d, wrid=%d vend_err
%x)\n",<br>
-
wc->status, wr_id, wc->vendor_err);<br>
+ priv->stats.tx_bytes += skb->len;<br>
+ dev_kfree_skb_any(skb);<br>
+ <br>
+ if (netif_queue_stopped(dev))<br>
+ netif_wake_queue(dev);<br>
+ if (wc->status != IB_WC_SUCCESS
&&<br>
+ wc->status
!= IB_WC_WR_FLUSH_ERR)<br>
+ ipoib_warn(priv,
"failed send event "<br>
+
"(status=%d, wrid=%lu vend_err
%x)\n",<br>
+
wc->status, wr_id, wc->vendor_err);<br>
}<br>
<br>
void ipoib_ib_send_completion(struct ib_cq *cq, void *dev_ptr)<br>
@@ -313,7 +304,7 @@ void ipoib_ib_recv_completion(struct ib_<br>
}<br>
<br>
static inline int post_send(struct ipoib_dev_priv *priv,<br>
-
unsigned int wr_id,<br>
+
unsigned long wr_id,<br>
struct ib_ah *address, u32 qpn,<br>
dma_addr_t addr, int len)<br>
{<br>
@@ -333,8 +324,9 @@ void ipoib_send(struct net_device *dev, <br>
struct
ipoib_ah *address, u32 qpn)<br>
{<br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
- struct ipoib_tx_buf *tx_req;<br>
dma_addr_t addr;</font>
<br><font size=2 face="sans-serif">+ unsigned
long wr_id;<br>
+ unsigned long flags;<br>
int err;<br>
<br>
kref_get(&address->ref);<br>
@@ -350,38 +342,31 @@ void ipoib_send(struct net_device *dev, <br>
<br>
ipoib_dbg_data(priv, "sending
packet, length=%d address=%p qpn=0x%06x\n",<br>
skb->len, address, qpn);<br>
-<br>
- /*<br>
- * We put the skb into the tx_ring
_before_ we call post_send()<br>
- * because it's entirely possible
that the completion handler will<br>
- * run before we execute anything
after the post_send(). That<br>
- * means we have to make sure everything
is properly recorded and<br>
- * our state is consistent before
we call post_send().<br>
- */<br>
- tx_req = &priv->tx_ring[priv->tx_head
& (ipoib_sendq_size - 1)];<br>
- tx_req->skb = skb;<br>
addr = dma_map_single(priv->ca->dma_device,
skb->data, skb->len,<br>
DMA_TO_DEVICE);<br>
- pci_unmap_addr_set(tx_req, mapping,
addr);<br>
-<br>
- err = post_send(priv, priv->tx_head
& (ipoib_sendq_size - 1), <br>
-
address->ah, qpn, addr, skb->len);
<br>
- kref_put(&address->ref, ipoib_free_ah);<br>
- if (unlikely(err)) {<br>
- ipoib_warn(priv,
"post_send failed\n");</font>
<br><font size=2 face="sans-serif">-
++priv->stats.tx_errors;<br>
- dma_unmap_single(priv->ca->dma_device,
addr, skb->len,<br>
-
DMA_TO_DEVICE);<br>
- dev_kfree_skb_any(skb);<br>
- } else {<br>
- dev->trans_start
= jiffies;<br>
-<br>
- ++priv->tx_head;<br>
-<br>
- if
(priv->tx_head - priv->tx_tail == ipoib_sendq_size) {<br>
-
ipoib_dbg(priv, "TX ring full,
stopping kernel net queue\n");<br>
+ wr_id = (unsigned long)skb;<br>
+ err = post_send(priv, wr_id, address->ah,
qpn, addr, skb->len);<br>
+ if (!err) {<br>
+ dev->trans_start
= jiffies;<br>
+ IPOIB_SKB_PRV_ADDR(skb)
= addr;<br>
+ IPOIB_SKB_PRV_AH(skb)
= address;<br>
+ IPOIB_SKB_PRV_SKB(skb)
= skb;<br>
+ spin_lock_irqsave(&priv->slist_lock,
flags);<br>
+ list_add_tail(&IPOIB_SKB_PRV_LIST(skb),
&priv->send_list);<br>
+ spin_unlock_irqrestore(&priv->slist_lock,
flags);<br>
+ return;<br>
+ } else {<br>
+ if
(!netif_queue_stopped(dev)) {<br>
netif_stop_queue(dev);<br>
+
ipoib_warn(priv, "stopping kernel net
queue\n");<br>
}<br>
+ dma_unmap_single(priv->ca->dma_device,
addr, skb->len,<br>
+
DMA_TO_DEVICE);<br>
+ ipoib_warn(priv,
"post_send failed\n");<br>
+ ++priv->stats.tx_dropped;</font>
<br><font size=2 face="sans-serif">+
++priv->stats.tx_errors;<br>
+ dev_kfree_skb_any(skb);<br>
+ kref_put(&address->ref,
ipoib_free_ah);<br>
}<br>
}<br>
<br>
@@ -480,7 +465,9 @@ int ipoib_ib_dev_stop(struct net_device <br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
struct ib_qp_attr qp_attr;<br>
unsigned long begin;<br>
- struct ipoib_tx_buf *tx_req;<br>
+ unsigned long flags;<br>
+ struct ipoib_skb_prv *cb, *tcb;<br>
+ struct sk_buff *skb;<br>
int i;<br>
<br>
clear_bit(IPOIB_FLAG_INITIALIZED,
&priv->flags);<br>
@@ -496,25 +483,25 @@ int ipoib_ib_dev_stop(struct net_device <br>
/* Wait for all sends and receives
to complete */<br>
begin = jiffies;<br>
<br>
- while (priv->tx_head != priv->tx_tail
|| recvs_pending(dev)) {<br>
+ while (!list_empty(&priv->send_list)
|| recvs_pending(dev)) {<br>
if
(time_after(jiffies, begin + 5 * HZ)) {<br>
-
ipoib_warn(priv, "timing out; %d
sends %d receives not completed\n",<br>
-
priv->tx_head - priv->tx_tail, recvs_pending(dev));<br>
+
ipoib_warn(priv, "timing out; %d
receives not completed\n",<br>
+
recvs_pending(dev));<br>
<br>
/*<br>
* assume the HW is wedged and just free
up</font>
<br><font size=2 face="sans-serif">
* all our pending work requests.<br>
*/<br>
-
while ((int) priv->tx_tail - (int)
priv->tx_head < 0) {<br>
-
tx_req
= &priv->tx_ring[priv->tx_tail &<br>
-
(ipoib_sendq_size - 1)];<br>
-
dma_unmap_single(priv->ca->dma_device,<br>
-
pci_unmap_addr(tx_req, mapping),<br>
-
tx_req->skb->len,<br>
-
DMA_TO_DEVICE);<br>
-
dev_kfree_skb_any(tx_req->skb);<br>
-
++priv->tx_tail;<br>
-
}<br>
+
spin_lock_irqsave(&priv->slist_lock,
flags);<br>
+
list_for_each_entry_safe(cb, tcb, &priv->send_list,<br>
+
list) {<br>
+
skb =
cb->skb;<br>
+
dma_unmap_single(priv->ca->dma_device,<br>
+
IPOIB_SKB_PRV_ADDR(skb),<br>
+
skb->len,
DMA_TO_DEVICE);<br>
+
dev_kfree_skb_any(skb);<br>
+
}<br>
+
spin_unlock_irqrestore(&priv->slist_lock,
flags);<br>
<br>
for (i = 0; i < ipoib_recvq_size; ++i)<br>
if (priv->rx_ring[i].skb)
{<br>
diff -urpN infiniband-ah/ulp/ipoib/ipoib_main.c infiniband-tx/ulp/ipoib/ipoib_main.c<br>
--- infiniband-ah/ulp/ipoib/ipoib_main.c 2006-05-23
09:31:49.000000000 -0700<br>
+++ infiniband-tx/ulp/ipoib/ipoib_main.c 2006-05-24
11:47:06.000000000 -0700</font>
<br><font size=2 face="sans-serif">@@ -708,9 +708,7 @@ static void ipoib_timeout(struct
net_dev<br>
<br>
ipoib_warn(priv, "transmit timeout:
latency %d msecs\n",<br>
jiffies_to_msecs(jiffies - dev->trans_start));<br>
- ipoib_warn(priv, "queue stopped
%d, tx_head %u, tx_tail %u\n",<br>
-
netif_queue_stopped(dev),<br>
-
priv->tx_head, priv->tx_tail);<br>
+ ipoib_warn(priv, "queue stopped
%d\n", netif_queue_stopped(dev));<br>
/* XXX reset QP, etc. */<br>
}<br>
<br>
@@ -846,7 +844,7 @@ int ipoib_dev_init(struct net_device *de<br>
{<br>
struct ipoib_dev_priv *priv = netdev_priv(dev);<br>
<br>
- /* Allocate RX/TX "rings"
to hold queued skbs */<br>
+ /* Allocate RX "rings" to
hold queued skbs */<br>
priv->rx_ring =
kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,<br>
GFP_KERNEL);<br>
if (!priv->rx_ring) {<br>
@@ -855,24 +853,11 @@ int ipoib_dev_init(struct net_device *de<br>
goto
out;<br>
}<br>
<br>
- priv->tx_ring = kzalloc(ipoib_sendq_size
* sizeof *priv->tx_ring,<br>
-
GFP_KERNEL);<br>
- if (!priv->tx_ring) {<br>
- printk(KERN_WARNING
"%s: failed to allocate TX ring (%d entries)\n",<br>
-
ca->name, ipoib_sendq_size);</font>
<br><font size=2 face="sans-serif">-
goto out_rx_ring_cleanup;<br>
- }<br>
-<br>
- /* priv->tx_head & tx_tail
are already 0 */<br>
-<br>
if (ipoib_ib_dev_init(dev, ca, port))<br>
- goto
out_tx_ring_cleanup;<br>
+ goto
out_rx_ring_cleanup;<br>
<br>
return 0;<br>
<br>
-out_tx_ring_cleanup:<br>
- kfree(priv->tx_ring);<br>
-<br>
out_rx_ring_cleanup:<br>
kfree(priv->rx_ring);<br>
<br>
@@ -896,10 +881,8 @@ void ipoib_dev_cleanup(struct net_device<br>
ipoib_ib_dev_cleanup(dev);<br>
<br>
kfree(priv->rx_ring);<br>
- kfree(priv->tx_ring);<br>
<br>
priv->rx_ring = NULL;<br>
- priv->tx_ring = NULL;<br>
}<br>
<br>
static void ipoib_setup(struct net_device *dev)<br>
@@ -944,6 +927,7 @@ static void ipoib_setup(struct net_devic<br>
<br>
spin_lock_init(&priv->lock);<br>
spin_lock_init(&priv->tx_lock);<br>
+ spin_lock_init(&priv->slist_lock);<br>
<br>
mutex_init(&priv->mcast_mutex);<br>
mutex_init(&priv->vlan_mutex);<br>
@@ -952,6 +936,7 @@ static void ipoib_setup(struct net_devic<br>
INIT_LIST_HEAD(&priv->child_intfs);<br>
INIT_LIST_HEAD(&priv->dead_ahs);<br>
INIT_LIST_HEAD(&priv->multicast_list);<br>
+ INIT_LIST_HEAD(&priv->send_list);
<br>
<br>
INIT_WORK(&priv->pkey_task,
ipoib_pkey_poll, priv->dev);</font>
<br><font size=2 face="sans-serif"> INIT_WORK(&priv->mcast_task,
ipoib_mcast_join_task, priv->dev);</font>
<br>
<br><font size=2 face="sans-serif"><br>
Thanks<br>
Shirley Ma<br>
IBM Linux Technology Center<br>
15300 SW Koll Parkway<br>
Beaverton, OR 97006-6063<br>
Phone(Fax): (503) 578-7638<br>
<br>
</font></div>