[ofa-general] [PATCH] ipoib: null tx/rx_ring skb pointers on free

akepner at sgi.com akepner at sgi.com
Wed Nov 5 17:23:07 PST 2008


Way back in:

http://lists.openfabrics.org/pipermail/general/2008-May/050196.html

I described an IPoIB-related panic we were seeing on large 
clusters. The signature was a backtrace like this:

	skb_over_panic
	:ib_ipoib:ipoib_ib_handle_rx_wc
	:ib_ipoib:ipoib_poll
	net_rx_action
	.....

The bug is difficult to reproduce, but we finally got a crashdump, 
and the problem appears to be that stale skb pointers on the tx_ring 
were left pointing to skbs that had been since reused, so that the 
skb's data region was now unexpectedly short, etc. 

Recently LLNL reported something similar:

http://lists.openfabrics.org/pipermail/general/2008-October/054824.html

A patch similar to the following seems to fix thing up. 

Ira, Al, if this looks OK, can you please sign off on it?

Signed-off-by: Arthur Kepner <akepner at sgi.com>

--- 

 ipoib_cm.c |    5 +++++
 ipoib_ib.c |    4 ++++
 2 files changed, 9 insertions(+)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 7b14c2c..8f8650b 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -200,6 +200,7 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev,
 			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
 					      rx_ring[i].mapping);
 			dev_kfree_skb_any(rx_ring[i].skb);
+			rx_ring[i].skb = NULL;
 		}
 
 	vfree(rx_ring);
@@ -736,6 +737,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		return;
 	}
 
@@ -747,6 +749,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 		++dev->stats.tx_errors;
 		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 	} else {
 		dev->trans_start = jiffies;
 		++tx->tx_head;
@@ -785,6 +788,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	dev->stats.tx_bytes += tx_req->skb->len;
 
 	dev_kfree_skb_any(tx_req->skb);
+	tx_req->skb = NULL;
 
 	netif_tx_lock(dev);
 
@@ -1179,6 +1183,7 @@ timeout:
 		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
 				    DMA_TO_DEVICE);
 		dev_kfree_skb_any(tx_req->skb);
+		tx_req->skb = NULL;
 		++p->tx_tail;
 		netif_tx_lock_bh(p->dev);
 		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 28eb6f0..f7e3497 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -383,6 +383,7 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	dev->stats.tx_bytes += tx_req->skb->len;
 
 	dev_kfree_skb_any(tx_req->skb);
+	tx_req->skb = NULL;
 
 	++priv->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
@@ -572,6 +573,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 	if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
 		++dev->stats.tx_errors;
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		return;
 	}
 
@@ -594,6 +596,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
 		--priv->tx_outstanding;
 		ipoib_dma_unmap_tx(priv->ca, tx_req);
 		dev_kfree_skb_any(skb);
+		tx_req->skb = NULL;
 		if (netif_queue_stopped(dev))
 			netif_wake_queue(dev);
 	} else {
@@ -833,6 +836,7 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 							(ipoib_sendq_size - 1)];
 				ipoib_dma_unmap_tx(priv->ca, tx_req);
 				dev_kfree_skb_any(tx_req->skb);
+				tx_req->skb = NULL;
 				++priv->tx_tail;
 				--priv->tx_outstanding;
 			}



More information about the general mailing list