[ofa-general] [PATCH] ipoib: null tx/rx_ring skb pointers on free
akepner at sgi.com
Wed Nov 5 17:23:07 PST 2008
Way back in:
http://lists.openfabrics.org/pipermail/general/2008-May/050196.html
I described an IPoIB-related panic we were seeing on large
clusters. The signature was a backtrace like this:
skb_over_panic
:ib_ipoib:ipoib_ib_handle_rx_wc
:ib_ipoib:ipoib_poll
net_rx_action
.....
The bug is difficult to reproduce, but we finally got a crashdump,
and the problem appears to be that stale skb pointers on the tx_ring
were left pointing at skbs that had since been freed and reused, so
the skb's data region was unexpectedly short, etc.
Recently LLNL reported something similar:
http://lists.openfabrics.org/pipermail/general/2008-October/054824.html
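To make the failure mode concrete, here's a minimal user-space sketch
(not driver code - the struct and function names below are made up for
illustration): freeing a buffer without clearing its ring slot leaves a
dangling pointer behind, and nulling the slot right after the free is
what the patch below does for the tx/rx rings.

/*
 * Sketch of the hazard: a ring of descriptors keeps a pointer to each
 * buffer. Freeing the buffer without clearing the slot leaves a stale
 * pointer that a later completion/cleanup path can follow into memory
 * that has since been reused.
 */
#include <stdlib.h>
#include <string.h>

struct tx_entry {
	void *buf;		/* stands in for tx_req->skb */
	size_t len;
};

static void ring_free_slot(struct tx_entry *slot)
{
	free(slot->buf);	/* stands in for dev_kfree_skb_any() */
	slot->buf = NULL;	/* drop the stale reference */
	slot->len = 0;
}

int main(void)
{
	struct tx_entry ring[4];

	memset(ring, 0, sizeof(ring));
	ring[0].buf = malloc(64);
	ring[0].len = 64;

	ring_free_slot(&ring[0]);

	/* Without the NULL assignment above, ring[0].buf would still
	 * point at freed (and possibly reused) memory - the same class
	 * of corruption as the skb_over_panic above. */
	return ring[0].buf == NULL ? 0 : 1;
}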
A patch similar to the following seems to fix things up.
Ira, Al, if this looks OK, can you please sign off on it?
Signed-off-by: Arthur Kepner <akepner at sgi.com>
---
ipoib_cm.c | 5 +++++
ipoib_ib.c | 4 ++++
2 files changed, 9 insertions(+)
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 7b14c2c..8f8650b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
 			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
 					      rx_ring[i].mapping);
 			dev_kfree_skb_any(rx_ring[i].skb);
+			rx_ring[i].skb = NULL;
 		}
 	vfree(rx_ring);
@@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		return;
 	}
@@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		++dev->stats.tx_errors;
 		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 	} else {
 		dev->trans_start = jiffies;
 		++tx->tx_head;
@@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	dev->stats.tx_bytes += tx_req->skb->len;
 	dev_kfree_skb_any(tx_req->skb);
+	tx_req->skb = NULL;
 	netif_tx_lock(dev);
@@ -1179,6 +1183,7 @@ timeout:
 		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
+		tx_req->skb = NULL;
 		++p->tx_tail;
 		netif_tx_lock_bh(p->dev);
 		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 28eb6f0..f7e3497 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	dev->stats.tx_bytes += tx_req->skb->len;
 	dev_kfree_skb_any(tx_req->skb);
+	tx_req->skb = NULL;
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
@@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		return;
 	}
@@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		--priv->tx_outstanding;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		if (netif_queue_stopped(dev))
 			netif_wake_queue(dev);
 	} else {
@@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 							(ipoib_sendq_size - 1)];
 				ipoib_dma_unmap_tx(priv->ca, tx_req);
 				dev_kfree_skb_any(tx_req->skb);
+				tx_req->skb = NULL;
 				++priv->tx_tail;
 				--priv->tx_outstanding;
 			}